LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#if KMP_USE_HIER_SCHED
28#include "kmp_dispatch_hier.h"
29#endif
30
31#if OMPT_SUPPORT
32#include "ompt-specific.h"
33#endif
34#if OMPD_SUPPORT
35#include "ompd-specific.h"
36#endif
37
38#if OMP_PROFILING_SUPPORT
39#include "llvm/Support/TimeProfiler.h"
40static char *ProfileTraceFile = nullptr;
41#endif
42
43/* these are temporary issues to be dealt with */
44#define KMP_USE_PRCTL 0
45
46#if KMP_OS_WINDOWS
47#include <process.h>
48#endif
49
50#if KMP_OS_WINDOWS
51// Windows does not need these include files because it does not use shared memory
52#else
53#include <sys/mman.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#define SHM_SIZE 1024
57#endif
58
59#if defined(KMP_GOMP_COMPAT)
60char const __kmp_version_alt_comp[] =
61 KMP_VERSION_PREFIX "alternative compiler support: yes";
62#endif /* defined(KMP_GOMP_COMPAT) */
63
64char const __kmp_version_omp_api[] =
65 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66
67#ifdef KMP_DEBUG
68char const __kmp_version_lock[] =
69 KMP_VERSION_PREFIX "lock type: run time selectable";
70#endif /* KMP_DEBUG */
71
72#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73
74/* ------------------------------------------------------------------------ */
75
76#if KMP_USE_MONITOR
77kmp_info_t __kmp_monitor;
78#endif
79
80/* Forward declarations */
81
82void __kmp_cleanup(void);
83
84static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85 int gtid);
86static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87 kmp_internal_control_t *new_icvs,
88 ident_t *loc);
89#if KMP_AFFINITY_SUPPORTED
90static void __kmp_partition_places(kmp_team_t *team,
91 int update_master_only = 0);
92#endif
93static void __kmp_do_serial_initialize(void);
94void __kmp_fork_barrier(int gtid, int tid);
95void __kmp_join_barrier(int gtid);
96void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97 kmp_internal_control_t *new_icvs, ident_t *loc);
98
99#ifdef USE_LOAD_BALANCE
100static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101#endif
102
103static int __kmp_expand_threads(int nNeed);
104#if KMP_OS_WINDOWS
105static int __kmp_unregister_root_other_thread(int gtid);
106#endif
107static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109
110void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111 int new_nthreads);
112void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113
114/* Calculate the identifier of the current thread */
115/* fast (and somewhat portable) way to get a unique identifier of the
116 executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117int __kmp_get_global_thread_id() {
118 int i;
119 kmp_info_t **other_threads;
120 size_t stack_data;
121 char *stack_addr;
122 size_t stack_size;
123 char *stack_base;
124
125 KA_TRACE(
126 1000,
127 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128 __kmp_nth, __kmp_all_nth));
129
130 /* JPH - To handle the case where __kmpc_end(0) is called immediately before
131 a parallel region, this was made to return KMP_GTID_DNE to force
132 serial_initialize by the caller. We had to handle KMP_GTID_DNE at all
133 call sites, or else guarantee __kmp_init_gtid for this to work. */
134
135 if (!TCR_4(__kmp_init_gtid))
136 return KMP_GTID_DNE;
137
138#ifdef KMP_TDATA_GTID
139 if (TCR_4(__kmp_gtid_mode) >= 3) {
140 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141 return __kmp_gtid;
142 }
143#endif
144 if (TCR_4(__kmp_gtid_mode) >= 2) {
145 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146 return __kmp_gtid_get_specific();
147 }
148 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149
150 stack_addr = (char *)&stack_data;
151 other_threads = __kmp_threads;
152
153 /* ATT: The code below is a source of potential bugs due to unsynchronized
154 access to __kmp_threads array. For example:
155 1. Current thread loads other_threads[i] to thr and checks it, it is
156 non-NULL.
157 2. Current thread is suspended by OS.
158 3. Another thread unregisters and finishes (debug versions of free()
159 may fill memory with something like 0xEF).
160 4. Current thread is resumed.
161 5. Current thread reads junk from *thr.
162 TODO: Fix it. --ln */
163
164 for (i = 0; i < __kmp_threads_capacity; i++) {
165
166 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167 if (!thr)
168 continue;
169
170 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172
173 /* stack grows down -- search through all of the active threads */
174
175 if (stack_addr <= stack_base) {
176 size_t stack_diff = stack_base - stack_addr;
177
178 if (stack_diff <= stack_size) {
179 /* The only way we can be closer than the allocated */
180 /* stack size is if we are running on this thread. */
181 // __kmp_gtid_get_specific can return negative value because this
182 // function can be called by thread destructor. However, before the
183 // thread destructor is called, the value of the corresponding
184 // thread-specific data will be reset to NULL.
185 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
186 __kmp_gtid_get_specific() == i);
187 return i;
188 }
189 }
190 }
191
192 /* get specific to try and determine our gtid */
193 KA_TRACE(1000,
194 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
195 "thread, using TLS\n"));
196 i = __kmp_gtid_get_specific();
197
198 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
199
200 /* if we haven't been assigned a gtid, return the error code */
201 if (i < 0)
202 return i;
203
204 // other_threads[i] can be nullptr at this point because the corresponding
205 // thread could have already been destroyed. This can happen when this
206 // function is called from the library shutdown routine.
207 if (!TCR_SYNC_PTR(other_threads[i]))
208 return i;
209
210 /* dynamically updated stack window for uber threads to avoid get_specific
211 call */
212 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
213 KMP_FATAL(StackOverflow, i);
214 }
215
216 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
217 if (stack_addr > stack_base) {
218 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
219 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
220 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
221 stack_base);
222 } else {
223 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
224 stack_base - stack_addr);
225 }
226
227 /* Reprint stack bounds for ubermaster since they have been refined */
228 if (__kmp_storage_map) {
229 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
230 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
231 __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
232 other_threads[i]->th.th_info.ds.ds_stacksize,
233 "th_%d stack (refinement)", i);
234 }
235 return i;
236}
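// Illustrative sketch (editorial addition, not part of the runtime): the
// lookup above relies on the fact that, with a downward-growing stack, any
// automatic variable of a thread lies within [stack_base - stack_size,
// stack_base]. A simplified form of the containment test used in the loop:
//
//   char local;                                // lives on the caller's stack
//   char *addr = &local;
//   bool on_stack_of_i = (addr <= stack_base) &&
//                        ((size_t)(stack_base - addr) <= stack_size);
//
// If the test holds for registered thread i, the caller is thread i.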
237
238int __kmp_get_global_thread_id_reg() {
239 int gtid;
240
241 if (!__kmp_init_serial) {
242 gtid = KMP_GTID_DNE;
243 } else
244#ifdef KMP_TDATA_GTID
245 if (TCR_4(__kmp_gtid_mode) >= 3) {
246 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
247 gtid = __kmp_gtid;
248 } else
249#endif
250 if (TCR_4(__kmp_gtid_mode) >= 2) {
251 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
252 gtid = __kmp_gtid_get_specific();
253 } else {
254 KA_TRACE(1000,
255 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
256 gtid = __kmp_get_global_thread_id();
257 }
258
259 /* we must be a new uber master sibling thread */
260 if (gtid == KMP_GTID_DNE) {
261 KA_TRACE(10,
262 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
263 "Registering a new gtid.\n"));
264 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
265 if (!__kmp_init_serial) {
266 __kmp_do_serial_initialize();
267 gtid = __kmp_gtid_get_specific();
268 } else {
269 gtid = __kmp_register_root(FALSE);
270 }
271 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
272 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
273 }
274
275 KMP_DEBUG_ASSERT(gtid >= 0);
276
277 return gtid;
278}
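// Illustrative usage sketch (assumption, not taken from this file): code paths
// that may run before the caller has a gtid use the registering variant, which
// performs serial initialization and registers a new root on demand; the
// function asserts gtid >= 0 before returning.
//
//   int gtid = __kmp_get_global_thread_id_reg();
//   kmp_info_t *this_thr = __kmp_threads[gtid];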
279
280/* caller must hold forkjoin_lock */
281void __kmp_check_stack_overlap(kmp_info_t *th) {
282 int f;
283 char *stack_beg = NULL;
284 char *stack_end = NULL;
285 int gtid;
286
287 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
288 if (__kmp_storage_map) {
289 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
290 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
291
292 gtid = __kmp_gtid_from_thread(th);
293
294 if (gtid == KMP_GTID_MONITOR) {
295 __kmp_print_storage_map_gtid(
296 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
297 "th_%s stack (%s)", "mon",
298 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
299 } else {
300 __kmp_print_storage_map_gtid(
301 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
302 "th_%d stack (%s)", gtid,
303 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
304 }
305 }
306
307 /* No point in checking ubermaster threads since they use refinement and
308 * cannot overlap */
309 gtid = __kmp_gtid_from_thread(th);
310 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
311 KA_TRACE(10,
312 ("__kmp_check_stack_overlap: performing extensive checking\n"));
313 if (stack_beg == NULL) {
314 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
315 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
316 }
317
318 for (f = 0; f < __kmp_threads_capacity; f++) {
319 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
320
321 if (f_th && f_th != th) {
322 char *other_stack_end =
323 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
324 char *other_stack_beg =
325 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
326 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
327 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
328
329 /* Print the other stack values before the abort */
330 if (__kmp_storage_map)
331 __kmp_print_storage_map_gtid(
332 -1, other_stack_beg, other_stack_end,
333 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
334 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
335
336 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
337 __kmp_msg_null);
338 }
339 }
340 }
341 }
342 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
343}
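// Illustrative sketch: the overlap test above treats two stacks [beg1, end1)
// and [beg2, end2) as colliding when either boundary of the first falls
// strictly inside the second. A standalone form of the same predicate:
//
//   static bool stacks_overlap(char *beg1, char *end1,
//                              char *beg2, char *end2) {
//     return (beg1 > beg2 && beg1 < end2) || (end1 > beg2 && end1 < end2);
//   }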
344
345/* ------------------------------------------------------------------------ */
346
347void __kmp_infinite_loop(void) {
348 static int done = FALSE;
349
350 while (!done) {
351 KMP_YIELD(TRUE);
352 }
353}
354
355#define MAX_MESSAGE 512
356
357void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
358 char const *format, ...) {
359 char buffer[MAX_MESSAGE];
360 va_list ap;
361
362 va_start(ap, format);
363 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
364 p2, (unsigned long)size, format);
365 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
366 __kmp_vprintf(kmp_err, buffer, ap);
367#if KMP_PRINT_DATA_PLACEMENT
368 int node;
369 if (gtid >= 0) {
370 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
371 if (__kmp_storage_map_verbose) {
372 node = __kmp_get_host_node(p1);
373 if (node < 0) /* doesn't work, so don't try this next time */
374 __kmp_storage_map_verbose = FALSE;
375 else {
376 char *last;
377 int lastNode;
378 int localProc = __kmp_get_cpu_from_gtid(gtid);
379
380 const int page_size = KMP_GET_PAGE_SIZE();
381
382 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
383 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
384 if (localProc >= 0)
385 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
386 localProc >> 1);
387 else
388 __kmp_printf_no_lock(" GTID %d\n", gtid);
389#if KMP_USE_PRCTL
390 /* The more elaborate format is disabled for now because of the prctl
391 * hanging bug. */
392 do {
393 last = p1;
394 lastNode = node;
395 /* This loop collates adjacent pages with the same host node. */
396 do {
397 (char *)p1 += page_size;
398 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
399 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
400 lastNode);
401 } while (p1 <= p2);
402#else
403 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
404 (char *)p1 + (page_size - 1),
405 __kmp_get_host_node(p1));
406 if (p1 < p2) {
407 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
408 (char *)p2 + (page_size - 1),
409 __kmp_get_host_node(p2));
410 }
411#endif
412 }
413 }
414 } else
415 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
416 }
417#endif /* KMP_PRINT_DATA_PLACEMENT */
418 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
419
420 va_end(ap);
421}
422
423void __kmp_warn(char const *format, ...) {
424 char buffer[MAX_MESSAGE];
425 va_list ap;
426
427 if (__kmp_generate_warnings == kmp_warnings_off) {
428 return;
429 }
430
431 va_start(ap, format);
432
433 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
434 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
435 __kmp_vprintf(kmp_err, buffer, ap);
436 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
437
438 va_end(ap);
439}
440
441void __kmp_abort_process() {
442 // Later threads may stall here, but that's ok because abort() will kill them.
443 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
444
445 if (__kmp_debug_buf) {
446 __kmp_dump_debug_buffer();
447 }
448
449 if (KMP_OS_WINDOWS) {
450 // Let other threads know of abnormal termination and prevent deadlock
451 // if abort happened during library initialization or shutdown
452 __kmp_global.g.g_abort = SIGABRT;
453
454 /* On Windows* OS, abort() by default causes a pop-up error box, which stalls
455 nightly testing. Unfortunately, we cannot reliably suppress pop-up error
456 boxes. _set_abort_behavior() works well, but this function is not
457 available in VS7 (this is not a problem for a DLL, but it is a problem for
458 a static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does not
459 help, at least in some versions of the MS C RTL.
460
461 The following sequence seems to be the only way to simulate abort() and
462 avoid the pop-up error box. */
463 raise(SIGABRT);
464 _exit(3); // Just in case, if signal ignored, exit anyway.
465 } else {
466 __kmp_unregister_library();
467 abort();
468 }
469
470 __kmp_infinite_loop();
471 __kmp_release_bootstrap_lock(&__kmp_exit_lock);
472
473} // __kmp_abort_process
474
475void __kmp_abort_thread(void) {
476 // TODO: Eliminate g_abort global variable and this function.
477 // In case of abort, just call abort(); it will kill all the threads.
478 __kmp_infinite_loop();
479} // __kmp_abort_thread
480
481/* Print out the storage map for the major kmp_info_t thread data structures
482 that are allocated together. */
483
484static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
485 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
486 gtid);
487
488 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
489 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
490
491 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
492 sizeof(kmp_local_t), "th_%d.th_local", gtid);
493
494 __kmp_print_storage_map_gtid(
495 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
496 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
497
498 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
499 &thr->th.th_bar[bs_plain_barrier + 1],
500 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
501 gtid);
502
503 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
504 &thr->th.th_bar[bs_forkjoin_barrier + 1],
505 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
506 gtid);
507
508#if KMP_FAST_REDUCTION_BARRIER
509 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
510 &thr->th.th_bar[bs_reduction_barrier + 1],
511 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
512 gtid);
513#endif // KMP_FAST_REDUCTION_BARRIER
514}
515
516/* Print out the storage map for the major kmp_team_t team data structures
517 that are allocated together. */
518
519static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
520 int team_id, int num_thr) {
521 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
522 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
523 header, team_id);
524
525 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
526 &team->t.t_bar[bs_last_barrier],
527 sizeof(kmp_balign_team_t) * bs_last_barrier,
528 "%s_%d.t_bar", header, team_id);
529
530 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
531 &team->t.t_bar[bs_plain_barrier + 1],
532 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
533 header, team_id);
534
535 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
536 &team->t.t_bar[bs_forkjoin_barrier + 1],
537 sizeof(kmp_balign_team_t),
538 "%s_%d.t_bar[forkjoin]", header, team_id);
539
540#if KMP_FAST_REDUCTION_BARRIER
541 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
542 &team->t.t_bar[bs_reduction_barrier + 1],
543 sizeof(kmp_balign_team_t),
544 "%s_%d.t_bar[reduction]", header, team_id);
545#endif // KMP_FAST_REDUCTION_BARRIER
546
547 __kmp_print_storage_map_gtid(
548 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
549 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
550
551 __kmp_print_storage_map_gtid(
552 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
553 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
554
555 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
556 &team->t.t_disp_buffer[num_disp_buff],
557 sizeof(dispatch_shared_info_t) * num_disp_buff,
558 "%s_%d.t_disp_buffer", header, team_id);
559}
560
561static void __kmp_init_allocator() {
562 __kmp_init_memkind();
563 __kmp_init_target_mem();
564}
565static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
566
567/* ------------------------------------------------------------------------ */
568
569#if ENABLE_LIBOMPTARGET
570static void __kmp_init_omptarget() {
571 __kmp_init_target_task();
572}
573#endif
574
575/* ------------------------------------------------------------------------ */
576
577#if KMP_DYNAMIC_LIB
578#if KMP_OS_WINDOWS
579
580BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
581 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
582
583 switch (fdwReason) {
584
585 case DLL_PROCESS_ATTACH:
586 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
587
588 return TRUE;
589
590 case DLL_PROCESS_DETACH:
591 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
592
593 // According to Windows* documentation for DllMain entry point:
594 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
595 // lpReserved == NULL when FreeLibrary() is called,
596 // lpReserved != NULL when the process is terminated.
597 // When FreeLibrary() is called, worker threads remain alive. So the
598 // runtime's state is consistent and executing proper shutdown is OK.
599 // When the process is terminated, worker threads have exited or been
600 // forcefully terminated by the OS and only the shutdown thread remains.
601 // This can leave the runtime in an inconsistent state.
602 // Hence, only attempt proper cleanup when FreeLibrary() is called.
603 // Otherwise, rely on OS to reclaim resources.
604 if (lpReserved == NULL)
605 __kmp_internal_end_library(__kmp_gtid_get_specific());
606
607 return TRUE;
608
609 case DLL_THREAD_ATTACH:
610 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
611
612 /* if we wanted to register new sibling threads every time, we would call
613 * __kmp_get_gtid() here */
614 return TRUE;
615
616 case DLL_THREAD_DETACH:
617 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
618
619 __kmp_internal_end_thread(__kmp_gtid_get_specific());
620 return TRUE;
621 }
622
623 return TRUE;
624}
625
626#endif /* KMP_OS_WINDOWS */
627#endif /* KMP_DYNAMIC_LIB */
628
629/* __kmp_parallel_deo -- Wait until it's our turn. */
630void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
631 int gtid = *gtid_ref;
632#ifdef BUILD_PARALLEL_ORDERED
633 kmp_team_t *team = __kmp_team_from_gtid(gtid);
634#endif /* BUILD_PARALLEL_ORDERED */
635
636 if (__kmp_env_consistency_check) {
637 if (__kmp_threads[gtid]->th.th_root->r.r_active)
638#if KMP_USE_DYNAMIC_LOCK
639 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
640#else
641 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
642#endif
643 }
644#ifdef BUILD_PARALLEL_ORDERED
645 if (!team->t.t_serialized) {
646 KMP_MB();
647 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
648 NULL);
649 KMP_MB();
650 }
651#endif /* BUILD_PARALLEL_ORDERED */
652}
653
654/* __kmp_parallel_dxo -- Signal the next task. */
655void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
656 int gtid = *gtid_ref;
657#ifdef BUILD_PARALLEL_ORDERED
658 int tid = __kmp_tid_from_gtid(gtid);
659 kmp_team_t *team = __kmp_team_from_gtid(gtid);
660#endif /* BUILD_PARALLEL_ORDERED */
661
662 if (__kmp_env_consistency_check) {
663 if (__kmp_threads[gtid]->th.th_root->r.r_active)
664 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
665 }
666#ifdef BUILD_PARALLEL_ORDERED
667 if (!team->t.t_serialized) {
668 KMP_MB(); /* Flush all pending memory write invalidates. */
669
670 /* use the tid of the next thread in this team */
671 /* TODO replace with general release procedure */
672 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
673
674 KMP_MB(); /* Flush all pending memory write invalidates. */
675 }
676#endif /* BUILD_PARALLEL_ORDERED */
677}
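// Illustrative sketch of the ordered handshake implemented by
// __kmp_parallel_deo/__kmp_parallel_dxo: t_ordered.dt.t_value acts as a ticket
// holding the tid whose turn it is. Simplified flow for one thread (the real
// code uses KMP_WAIT rather than an explicit spin):
//
//   // deo: wait until the ticket shows my tid
//   while (team->t.t_ordered.dt.t_value != my_tid)
//     KMP_YIELD(TRUE);
//   // ... ordered region runs here ...
//   // dxo: hand the ticket to the next thread in the team
//   team->t.t_ordered.dt.t_value = (my_tid + 1) % team->t.t_nproc;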
678
679/* ------------------------------------------------------------------------ */
680/* The BARRIER for a SINGLE process section is always explicit */
681
682int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
683 int status;
684 kmp_info_t *th;
685 kmp_team_t *team;
686
687 if (!TCR_4(__kmp_init_parallel))
688 __kmp_parallel_initialize();
689 __kmp_resume_if_soft_paused();
690
691 th = __kmp_threads[gtid];
692 team = th->th.th_team;
693 status = 0;
694
695 th->th.th_ident = id_ref;
696
697 if (team->t.t_serialized) {
698 status = 1;
699 } else {
700 kmp_int32 old_this = th->th.th_local.this_construct;
701
702 ++th->th.th_local.this_construct;
703 /* try to set team count to thread count--success means thread got the
704 single block */
705 /* TODO: Should this be acquire or release? */
706 if (team->t.t_construct == old_this) {
707 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
708 th->th.th_local.this_construct);
709 }
710#if USE_ITT_BUILD
711 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
712 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
713 team->t.t_active_level == 1) {
714 // Only report metadata by primary thread of active team at level 1
715 __kmp_itt_metadata_single(id_ref);
716 }
717#endif /* USE_ITT_BUILD */
718 }
719
720 if (__kmp_env_consistency_check) {
721 if (status && push_ws) {
722 __kmp_push_workshare(gtid, ct_psingle, id_ref);
723 } else {
724 __kmp_check_workshare(gtid, ct_psingle, id_ref);
725 }
726 }
727#if USE_ITT_BUILD
728 if (status) {
729 __kmp_itt_single_start(gtid);
730 }
731#endif /* USE_ITT_BUILD */
732 return status;
733}
734
735void __kmp_exit_single(int gtid) {
736#if USE_ITT_BUILD
737 __kmp_itt_single_end(gtid);
738#endif /* USE_ITT_BUILD */
739 if (__kmp_env_consistency_check)
740 __kmp_pop_workshare(gtid, ct_psingle, NULL);
741}
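// Illustrative sketch: the winner of a SINGLE construct is selected by one
// atomic compare-and-store on the team-wide construct counter; exactly one
// thread advances it and executes the single block. Simplified, using the
// same fields as __kmp_enter_single above:
//
//   kmp_int32 old_this = th->th.th_local.this_construct++;
//   bool i_am_single =
//       (team->t.t_construct == old_this) &&
//       __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
//                                      th->th.th_local.this_construct);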
742
743/* determine if we can go parallel or must use a serialized parallel region and
744 * how many threads we can use
745 * set_nthreads is the number of threads requested for the team
746 * returns 0 if we should serialize or only use one thread,
747 * otherwise the number of threads to use
748 * The forkjoin lock is held by the caller. */
749static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
750 int master_tid, int set_nthreads,
751 int enter_teams) {
752 int capacity;
753 int new_nthreads;
754 KMP_DEBUG_ASSERT(__kmp_init_serial);
755 KMP_DEBUG_ASSERT(root && parent_team);
756 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
757
758 // If dyn-var is set, dynamically adjust the number of desired threads,
759 // according to the method specified by dynamic_mode.
760 new_nthreads = set_nthreads;
761 if (!get__dynamic_2(parent_team, master_tid)) {
762 ;
763 }
764#ifdef USE_LOAD_BALANCE
765 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
766 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
767 if (new_nthreads == 1) {
768 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
769 "reservation to 1 thread\n",
770 master_tid));
771 return 1;
772 }
773 if (new_nthreads < set_nthreads) {
774 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
775 "reservation to %d threads\n",
776 master_tid, new_nthreads));
777 }
778 }
779#endif /* USE_LOAD_BALANCE */
780 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
781 new_nthreads = __kmp_avail_proc - __kmp_nth +
782 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
783 if (new_nthreads <= 1) {
784 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
785 "reservation to 1 thread\n",
786 master_tid));
787 return 1;
788 }
789 if (new_nthreads < set_nthreads) {
790 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
791 "reservation to %d threads\n",
792 master_tid, new_nthreads));
793 } else {
794 new_nthreads = set_nthreads;
795 }
796 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
797 if (set_nthreads > 2) {
798 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
799 new_nthreads = (new_nthreads % set_nthreads) + 1;
800 if (new_nthreads == 1) {
801 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
802 "reservation to 1 thread\n",
803 master_tid));
804 return 1;
805 }
806 if (new_nthreads < set_nthreads) {
807 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
808 "reservation to %d threads\n",
809 master_tid, new_nthreads));
810 }
811 }
812 } else {
813 KMP_ASSERT(0);
814 }
815
816 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
817 if (__kmp_nth + new_nthreads -
818 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
819 __kmp_max_nth) {
820 int tl_nthreads = __kmp_max_nth - __kmp_nth +
821 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
822 if (tl_nthreads <= 0) {
823 tl_nthreads = 1;
824 }
825
826 // If dyn-var is false, emit a 1-time warning.
827 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
828 __kmp_reserve_warn = 1;
829 __kmp_msg(kmp_ms_warning,
830 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
831 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
832 }
833 if (tl_nthreads == 1) {
834 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
835 "reduced reservation to 1 thread\n",
836 master_tid));
837 return 1;
838 }
839 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
840 "reservation to %d threads\n",
841 master_tid, tl_nthreads));
842 new_nthreads = tl_nthreads;
843 }
844
845 // Respect OMP_THREAD_LIMIT
846 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
847 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
848 if (cg_nthreads + new_nthreads -
849 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
850 max_cg_threads) {
851 int tl_nthreads = max_cg_threads - cg_nthreads +
852 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
853 if (tl_nthreads <= 0) {
854 tl_nthreads = 1;
855 }
856
857 // If dyn-var is false, emit a 1-time warning.
858 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
859 __kmp_reserve_warn = 1;
860 __kmp_msg(kmp_ms_warning,
861 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
862 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
863 }
864 if (tl_nthreads == 1) {
865 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
866 "reduced reservation to 1 thread\n",
867 master_tid));
868 return 1;
869 }
870 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
871 "reservation to %d threads\n",
872 master_tid, tl_nthreads));
873 new_nthreads = tl_nthreads;
874 }
875
876 // Check if the threads array is large enough, or needs expanding.
877 // See comment in __kmp_register_root() about the adjustment if
878 // __kmp_threads[0] == NULL.
879 capacity = __kmp_threads_capacity;
880 if (TCR_PTR(__kmp_threads[0]) == NULL) {
881 --capacity;
882 }
883 // If it is not for initializing the hidden helper team, we need to take
884 // __kmp_hidden_helper_threads_num out of the capacity because it is included
885 // in __kmp_threads_capacity.
886 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
887 capacity -= __kmp_hidden_helper_threads_num;
888 }
889 if (__kmp_nth + new_nthreads -
890 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
891 capacity) {
892 // Expand the threads array.
893 int slotsRequired = __kmp_nth + new_nthreads -
894 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
895 capacity;
896 int slotsAdded = __kmp_expand_threads(slotsRequired);
897 if (slotsAdded < slotsRequired) {
898 // The threads array was not expanded enough.
899 new_nthreads -= (slotsRequired - slotsAdded);
900 KMP_ASSERT(new_nthreads >= 1);
901
902 // If dyn-var is false, emit a 1-time warning.
903 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
904 __kmp_reserve_warn = 1;
905 if (__kmp_tp_cached) {
906 __kmp_msg(kmp_ms_warning,
907 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
908 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
909 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
910 } else {
911 __kmp_msg(kmp_ms_warning,
912 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
913 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
914 }
915 }
916 }
917 }
918
919#ifdef KMP_DEBUG
920 if (new_nthreads == 1) {
921 KC_TRACE(10,
922 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
923 "dead roots and rechecking; requested %d threads\n",
924 __kmp_get_gtid(), set_nthreads));
925 } else {
926 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
927 " %d threads\n",
928 __kmp_get_gtid(), new_nthreads, set_nthreads));
929 }
930#endif // KMP_DEBUG
931 return new_nthreads;
932}
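// Illustrative sketch (hypothetical helper, not the runtime's API):
// conceptually, __kmp_reserve_threads clamps the requested team size against a
// chain of limits -- the dynamic-adjustment result, the device thread limit
// (KMP_DEVICE_THREAD_LIMIT), the contention-group limit (OMP_THREAD_LIMIT),
// and the remaining capacity of the threads array:
//
//   static int clamp_team_size(int requested, int dyn_adjusted,
//                              int device_limit, int cg_limit, int capacity) {
//     int n = KMP_MIN(requested, dyn_adjusted);
//     n = KMP_MIN(n, device_limit);
//     n = KMP_MIN(n, cg_limit);
//     n = KMP_MIN(n, capacity);
//     return n < 1 ? 1 : n;
//   }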
933
934/* Allocate threads from the thread pool and assign them to the new team. We are
935 assured that there are enough threads available, because we checked that
936 earlier while holding the forkjoin lock. */
937static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
938 kmp_info_t *master_th, int master_gtid,
939 int fork_teams_workers) {
940 int i;
941 int use_hot_team;
942
943 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
944 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
945 KMP_MB();
946
947 /* first, let's setup the primary thread */
948 master_th->th.th_info.ds.ds_tid = 0;
949 master_th->th.th_team = team;
950 master_th->th.th_team_nproc = team->t.t_nproc;
951 master_th->th.th_team_master = master_th;
952 master_th->th.th_team_serialized = FALSE;
953 master_th->th.th_dispatch = &team->t.t_dispatch[0];
954
955/* make sure we are not the optimized hot team */
956#if KMP_NESTED_HOT_TEAMS
957 use_hot_team = 0;
958 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
959 if (hot_teams) { // hot teams array is not allocated if
960 // KMP_HOT_TEAMS_MAX_LEVEL=0
961 int level = team->t.t_active_level - 1; // index in array of hot teams
962 if (master_th->th.th_teams_microtask) { // are we inside the teams?
963 if (master_th->th.th_teams_size.nteams > 1) {
964 ++level; // level was not increased in teams construct for
965 // team_of_masters
966 }
967 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
968 master_th->th.th_teams_level == team->t.t_level) {
969 ++level; // level was not increased in teams construct for
970 // team_of_workers before the parallel
971 } // team->t.t_level will be increased inside parallel
972 }
973 if (level < __kmp_hot_teams_max_level) {
974 if (hot_teams[level].hot_team) {
975 // hot team has already been allocated for given level
976 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
977 use_hot_team = 1; // the team is ready to use
978 } else {
979 use_hot_team = 0; // AC: threads are not allocated yet
980 hot_teams[level].hot_team = team; // remember new hot team
981 hot_teams[level].hot_team_nth = team->t.t_nproc;
982 }
983 } else {
984 use_hot_team = 0;
985 }
986 }
987#else
988 use_hot_team = team == root->r.r_hot_team;
989#endif
990 if (!use_hot_team) {
991
992 /* install the primary thread */
993 team->t.t_threads[0] = master_th;
994 __kmp_initialize_info(master_th, team, 0, master_gtid);
995
996 /* now, install the worker threads */
997 for (i = 1; i < team->t.t_nproc; i++) {
998
999 /* fork or reallocate a new thread and install it in team */
1000 kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1001 team->t.t_threads[i] = thr;
1002 KMP_DEBUG_ASSERT(thr);
1003 KMP_DEBUG_ASSERT(thr->th.th_team == team);
1004 /* align team and thread arrived states */
1005 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1006 "T#%d(%d:%d) join =%llu, plain=%llu\n",
1007 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1008 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1009 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1010 team->t.t_bar[bs_plain_barrier].b_arrived));
1011 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1012 thr->th.th_teams_level = master_th->th.th_teams_level;
1013 thr->th.th_teams_size = master_th->th.th_teams_size;
1014 { // Initialize threads' barrier data.
1015 int b;
1016 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1017 for (b = 0; b < bs_last_barrier; ++b) {
1018 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1019 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1020#if USE_DEBUGGER
1021 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1022#endif
1023 }
1024 }
1025 }
1026
1027#if KMP_AFFINITY_SUPPORTED
1028 // Do not partition the places list for teams construct workers who
1029 // haven't actually been forked to do real work yet. This partitioning
1030 // will take place in the parallel region nested within the teams construct.
1031 if (!fork_teams_workers) {
1032 __kmp_partition_places(team);
1033 }
1034#endif
1035
1036 if (team->t.t_nproc > 1 &&
1037 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1038 team->t.b->update_num_threads(team->t.t_nproc);
1039 __kmp_add_threads_to_team(team, team->t.t_nproc);
1040 }
1041 }
1042
1043 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1044 for (i = 0; i < team->t.t_nproc; i++) {
1045 kmp_info_t *thr = team->t.t_threads[i];
1046 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1047 thr->th.th_prev_level != team->t.t_level) {
1048 team->t.t_display_affinity = 1;
1049 break;
1050 }
1051 }
1052 }
1053
1054 KMP_MB();
1055}
1056
1057#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1058// Propagate any changes to the floating point control registers out to the team
1059// We try to avoid unnecessary writes to the relevant cache line in the team
1060// structure, so we don't make changes unless they are needed.
1061inline static void propagateFPControl(kmp_team_t *team) {
1062 if (__kmp_inherit_fp_control) {
1063 kmp_int16 x87_fpu_control_word;
1064 kmp_uint32 mxcsr;
1065
1066 // Get primary thread's values of FPU control flags (both X87 and vector)
1067 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1068 __kmp_store_mxcsr(&mxcsr);
1069 mxcsr &= KMP_X86_MXCSR_MASK;
1070
1071 // There is no point looking at t_fp_control_saved here.
1072 // If it is TRUE, we still have to update the values if they are different
1073 // from those we now have. If it is FALSE we didn't save anything yet, but
1074 // our objective is the same. We have to ensure that the values in the team
1075 // are the same as those we have.
1076 // So, this code achieves what we need whether or not t_fp_control_saved is
1077 // true. By checking whether the value needs updating we avoid unnecessary
1078 // writes that would put the cache-line into a written state, causing all
1079 // threads in the team to have to read it again.
1080 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1081 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1082 // Although we don't use this value, other code in the runtime wants to know
1083 // whether it should restore them. So we must ensure it is correct.
1084 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1085 } else {
1086 // Similarly here. Don't write to this cache-line in the team structure
1087 // unless we have to.
1088 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1089 }
1090}
1091
1092// Do the opposite, setting the hardware registers to the updated values from
1093// the team.
1094inline static void updateHWFPControl(kmp_team_t *team) {
1095 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1096 // Only reset the fp control regs if they have been changed in the team by
1097 // the parallel region that we are exiting.
1098 kmp_int16 x87_fpu_control_word;
1099 kmp_uint32 mxcsr;
1100 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1101 __kmp_store_mxcsr(&mxcsr);
1102 mxcsr &= KMP_X86_MXCSR_MASK;
1103
1104 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1105 __kmp_clear_x87_fpu_status_word();
1106 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1107 }
1108
1109 if (team->t.t_mxcsr != mxcsr) {
1110 __kmp_load_mxcsr(&team->t.t_mxcsr);
1111 }
1112 }
1113}
1114#else
1115#define propagateFPControl(x) ((void)0)
1116#define updateHWFPControl(x) ((void)0)
1117#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
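// Illustrative sketch: KMP_CHECK_UPDATE, used above, is a check-before-write
// pattern -- the shared team cache line is written only when the value really
// changes, so unchanged lines are not invalidated for the other threads in
// the team. The open-coded equivalent of one such update:
//
//   if (team->t.t_mxcsr != mxcsr)
//     team->t.t_mxcsr = mxcsr;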
1118
1119static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1120 int realloc); // forward declaration
1121
1122/* Run a parallel region that has been serialized, so it runs only in a team of
1123 the single primary thread. */
1124void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1125 kmp_info_t *this_thr;
1126 kmp_team_t *serial_team;
1127
1128 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1129
1130 /* Skip all this code for autopar serialized loops since it results in
1131 unacceptable overhead */
1132 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1133 return;
1134
1135 if (!TCR_4(__kmp_init_parallel))
1136 __kmp_parallel_initialize();
1137 __kmp_resume_if_soft_paused();
1138
1139 this_thr = __kmp_threads[global_tid];
1140 serial_team = this_thr->th.th_serial_team;
1141
1142 /* utilize the serialized team held by this thread */
1143 KMP_DEBUG_ASSERT(serial_team);
1144 KMP_MB();
1145
1146 if (__kmp_tasking_mode != tskm_immediate_exec) {
1147 KMP_DEBUG_ASSERT(
1148 this_thr->th.th_task_team ==
1149 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1150 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1151 NULL);
1152 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1153 "team %p, new task_team = NULL\n",
1154 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1155 this_thr->th.th_task_team = NULL;
1156 }
1157
1158 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1159 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1160 proc_bind = proc_bind_false;
1161 } else if (proc_bind == proc_bind_default) {
1162 // No proc_bind clause was specified, so use the current value
1163 // of proc-bind-var for this parallel region.
1164 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1165 }
1166 // Reset for next parallel region
1167 this_thr->th.th_set_proc_bind = proc_bind_default;
1168
1169 // Reset num_threads for next parallel region
1170 this_thr->th.th_set_nproc = 0;
1171
1172#if OMPT_SUPPORT
1173 ompt_data_t ompt_parallel_data = ompt_data_none;
1174 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1175 if (ompt_enabled.enabled &&
1176 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1177
1178 ompt_task_info_t *parent_task_info;
1179 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1180
1181 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1182 if (ompt_enabled.ompt_callback_parallel_begin) {
1183 int team_size = 1;
1184
1185 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1186 &(parent_task_info->task_data), &(parent_task_info->frame),
1187 &ompt_parallel_data, team_size,
1188 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1189 }
1190 }
1191#endif // OMPT_SUPPORT
1192
1193 if (this_thr->th.th_team != serial_team) {
1194 // Nested level will be an index in the nested nthreads array
1195 int level = this_thr->th.th_team->t.t_level;
1196
1197 if (serial_team->t.t_serialized) {
1198 /* this serial team was already used
1199 TODO increase performance by making these locks more specific */
1200 kmp_team_t *new_team;
1201
1202 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1203
1204 new_team =
1205 __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1206#if OMPT_SUPPORT
1207 ompt_parallel_data,
1208#endif
1209 proc_bind, &this_thr->th.th_current_task->td_icvs,
1210 0 USE_NESTED_HOT_ARG(NULL));
1211 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1212 KMP_ASSERT(new_team);
1213
1214 /* setup new serialized team and install it */
1215 new_team->t.t_threads[0] = this_thr;
1216 new_team->t.t_parent = this_thr->th.th_team;
1217 serial_team = new_team;
1218 this_thr->th.th_serial_team = serial_team;
1219
1220 KF_TRACE(
1221 10,
1222 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1223 global_tid, serial_team));
1224
1225 /* TODO the above breaks the requirement that if we run out of resources,
1226 then we can still guarantee that serialized teams are ok, since we may
1227 need to allocate a new one */
1228 } else {
1229 KF_TRACE(
1230 10,
1231 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1232 global_tid, serial_team));
1233 }
1234
1235 /* we have to initialize this serial team */
1236 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1237 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1238 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1239 serial_team->t.t_ident = loc;
1240 serial_team->t.t_serialized = 1;
1241 serial_team->t.t_nproc = 1;
1242 serial_team->t.t_parent = this_thr->th.th_team;
1243 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1244 this_thr->th.th_team = serial_team;
1245 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1246
1247 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1248 this_thr->th.th_current_task));
1249 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1250 this_thr->th.th_current_task->td_flags.executing = 0;
1251
1252 __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1253
1254 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1255 implicit task for each serialized task represented by
1256 team->t.t_serialized? */
1257 copy_icvs(&this_thr->th.th_current_task->td_icvs,
1258 &this_thr->th.th_current_task->td_parent->td_icvs);
1259
1260 // Thread value exists in the nested nthreads array for the next nested
1261 // level
1262 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1263 this_thr->th.th_current_task->td_icvs.nproc =
1264 __kmp_nested_nth.nth[level + 1];
1265 }
1266
1267 if (__kmp_nested_proc_bind.used &&
1268 (level + 1 < __kmp_nested_proc_bind.used)) {
1269 this_thr->th.th_current_task->td_icvs.proc_bind =
1270 __kmp_nested_proc_bind.bind_types[level + 1];
1271 }
1272
1273#if USE_DEBUGGER
1274 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1275#endif
1276 this_thr->th.th_info.ds.ds_tid = 0;
1277
1278 /* set thread cache values */
1279 this_thr->th.th_team_nproc = 1;
1280 this_thr->th.th_team_master = this_thr;
1281 this_thr->th.th_team_serialized = 1;
1282
1283 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1284 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1285 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1286
1287 propagateFPControl(serial_team);
1288
1289 /* check if we need to allocate dispatch buffers stack */
1290 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1291 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1292 serial_team->t.t_dispatch->th_disp_buffer =
1293 (dispatch_private_info_t *)__kmp_allocate(
1294 sizeof(dispatch_private_info_t));
1295 }
1296 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1297
1298 KMP_MB();
1299
1300 } else {
1301 /* this serialized team is already being used,
1302 * that's fine, just add another nested level */
1303 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1304 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1305 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1306 ++serial_team->t.t_serialized;
1307 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1308
1309 // Nested level will be an index in the nested nthreads array
1310 int level = this_thr->th.th_team->t.t_level;
1311 // Thread value exists in the nested nthreads array for the next nested
1312 // level
1313 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1314 this_thr->th.th_current_task->td_icvs.nproc =
1315 __kmp_nested_nth.nth[level + 1];
1316 }
1317 serial_team->t.t_level++;
1318 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1319 "of serial team %p to %d\n",
1320 global_tid, serial_team, serial_team->t.t_level));
1321
1322 /* allocate/push dispatch buffers stack */
1323 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1324 {
1325 dispatch_private_info_t *disp_buffer =
1326 (dispatch_private_info_t *)__kmp_allocate(
1327 sizeof(dispatch_private_info_t));
1328 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1329 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1330 }
1331 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1332
1333 KMP_MB();
1334 }
1335 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1336
1337 // Perform the display affinity functionality for
1338 // serialized parallel regions
1339 if (__kmp_display_affinity) {
1340 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1341 this_thr->th.th_prev_num_threads != 1) {
1342 // NULL means use the affinity-format-var ICV
1343 __kmp_aux_display_affinity(global_tid, NULL);
1344 this_thr->th.th_prev_level = serial_team->t.t_level;
1345 this_thr->th.th_prev_num_threads = 1;
1346 }
1347 }
1348
1349 if (__kmp_env_consistency_check)
1350 __kmp_push_parallel(global_tid, NULL);
1351#if OMPT_SUPPORT
1352 serial_team->t.ompt_team_info.master_return_address = codeptr;
1353 if (ompt_enabled.enabled &&
1354 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1355 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1356 OMPT_GET_FRAME_ADDRESS(0);
1357
1358 ompt_lw_taskteam_t lw_taskteam;
1359 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1360 &ompt_parallel_data, codeptr);
1361
1362 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1363 // Don't use lw_taskteam after linking. Content was swapped.
1364
1365 /* OMPT implicit task begin */
1366 if (ompt_enabled.ompt_callback_implicit_task) {
1367 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1368 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1369 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1370 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1371 OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1372 __kmp_tid_from_gtid(global_tid);
1373 }
1374
1375 /* OMPT state */
1376 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1377 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1378 OMPT_GET_FRAME_ADDRESS(0);
1379 }
1380#endif
1381}
1382
1383// Test if this fork is for a team closely nested in a teams construct
1384static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1385 microtask_t microtask, int level,
1386 int teams_level, kmp_va_list ap) {
1387 return (master_th->th.th_teams_microtask && ap &&
1388 microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1389}
1390
1391// Test if this fork is for the teams construct, i.e. to form the outer league
1392// of teams
1393static inline bool __kmp_is_entering_teams(int active_level, int level,
1394 int teams_level, kmp_va_list ap) {
1395 return ((ap == NULL && active_level == 0) ||
1396 (ap && teams_level > 0 && teams_level == level));
1397}
1398
1399// AC: This is start of parallel that is nested inside teams construct.
1400// The team is actual (hot), all workers are ready at the fork barrier.
1401// No lock needed to initialize the team a bit, then free workers.
1402static inline int
1403__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1404 kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1405 enum fork_context_e call_context, microtask_t microtask,
1406 launch_t invoker, int master_set_numthreads, int level,
1407#if OMPT_SUPPORT
1408 ompt_data_t ompt_parallel_data, void *return_address,
1409#endif
1410 kmp_va_list ap) {
1411 void **argv;
1412 int i;
1413
1414 parent_team->t.t_ident = loc;
1415 __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1416 parent_team->t.t_argc = argc;
1417 argv = (void **)parent_team->t.t_argv;
1418 for (i = argc - 1; i >= 0; --i) {
1419 *argv++ = va_arg(kmp_va_deref(ap), void *);
1420 }
1421 // Increment our nested depth levels, but do not increase the serialization
1422 if (parent_team == master_th->th.th_serial_team) {
1423 // AC: we are in serialized parallel
1424 __kmpc_serialized_parallel(loc, gtid);
1425 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1426
1427 if (call_context == fork_context_gnu) {
1428 // AC: need to decrement t_serialized for enquiry functions to work
1429 // correctly, will restore at join time
1430 parent_team->t.t_serialized--;
1431 return TRUE;
1432 }
1433
1434#if OMPD_SUPPORT
1435 parent_team->t.t_pkfn = microtask;
1436#endif
1437
1438#if OMPT_SUPPORT
1439 void *dummy;
1440 void **exit_frame_p;
1441 ompt_data_t *implicit_task_data;
1442 ompt_lw_taskteam_t lw_taskteam;
1443
1444 if (ompt_enabled.enabled) {
1445 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1446 &ompt_parallel_data, return_address);
1447 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1448
1449 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1450 // Don't use lw_taskteam after linking. Content was swapped.
1451
1452 /* OMPT implicit task begin */
1453 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1454 if (ompt_enabled.ompt_callback_implicit_task) {
1455 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1456 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1457 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1458 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1459 }
1460
1461 /* OMPT state */
1462 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1463 } else {
1464 exit_frame_p = &dummy;
1465 }
1466#endif
1467
1468 // AC: need to decrement t_serialized for enquiry functions to work
1469 // correctly, will restore at join time
1470 parent_team->t.t_serialized--;
1471
1472 {
1473 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1474 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1475 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1476#if OMPT_SUPPORT
1477 ,
1478 exit_frame_p
1479#endif
1480 );
1481 }
1482
1483#if OMPT_SUPPORT
1484 if (ompt_enabled.enabled) {
1485 *exit_frame_p = NULL;
1486 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1487 if (ompt_enabled.ompt_callback_implicit_task) {
1488 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1489 ompt_scope_end, NULL, implicit_task_data, 1,
1490 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1491 }
1492 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1493 __ompt_lw_taskteam_unlink(master_th);
1494 if (ompt_enabled.ompt_callback_parallel_end) {
1495 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1496 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1497 OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1498 }
1499 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1500 }
1501#endif
1502 return TRUE;
1503 }
1504
1505 parent_team->t.t_pkfn = microtask;
1506 parent_team->t.t_invoke = invoker;
1507 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1508 parent_team->t.t_active_level++;
1509 parent_team->t.t_level++;
1510 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1511
1512 // If the threads allocated to the team are less than the thread limit, update
1513 // the thread limit here. th_teams_size.nth is specific to this team nested
1514 // in a teams construct, the team is fully created, and we're about to do
1515 // the actual fork. Best to do this here so that the subsequent uses below
1516 // and in the join have the correct value.
1517 master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1518
1519#if OMPT_SUPPORT
1520 if (ompt_enabled.enabled) {
1521 ompt_lw_taskteam_t lw_taskteam;
1522 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1523 return_address);
1524 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1525 }
1526#endif
1527
1528 /* Change number of threads in the team if requested */
1529 if (master_set_numthreads) { // The parallel has num_threads clause
1530 if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1531 // AC: we can only reduce the number of threads dynamically, not increase it
1532 kmp_info_t **other_threads = parent_team->t.t_threads;
1533 // NOTE: if using distributed barrier, we need to run this code block
1534 // even when the team size appears not to have changed from the max.
1535 int old_proc = master_th->th.th_teams_size.nth;
1536 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1537 __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1538 __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1539 }
1540 parent_team->t.t_nproc = master_set_numthreads;
1541 for (i = 0; i < master_set_numthreads; ++i) {
1542 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1543 }
1544 }
1545 // Keep extra threads hot in the team for possible next parallels
1546 master_th->th.th_set_nproc = 0;
1547 }
1548
1549#if USE_DEBUGGER
1550 if (__kmp_debugging) { // Let debugger override number of threads.
1551 int nth = __kmp_omp_num_threads(loc);
1552 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1553 master_set_numthreads = nth;
1554 }
1555 }
1556#endif
1557
1558 // Figure out the proc_bind policy for the nested parallel within teams
1559 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1560 // proc_bind_default means don't update
1561 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1562 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1563 proc_bind = proc_bind_false;
1564 } else {
1565 // No proc_bind clause specified; use current proc-bind-var
1566 if (proc_bind == proc_bind_default) {
1567 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1568 }
1569 /* else: The proc_bind policy was specified explicitly on parallel clause.
1570 This overrides proc-bind-var for this parallel region, but does not
1571 change proc-bind-var. */
1572 // Figure the value of proc-bind-var for the child threads.
1573 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1574 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1575 master_th->th.th_current_task->td_icvs.proc_bind)) {
1576 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1577 }
1578 }
1579 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1580 // Need to change the bind-var ICV to correct value for each implicit task
1581 if (proc_bind_icv != proc_bind_default &&
1582 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1583 kmp_info_t **other_threads = parent_team->t.t_threads;
1584 for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1585 other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1586 }
1587 }
1588 // Reset for next parallel region
1589 master_th->th.th_set_proc_bind = proc_bind_default;
1590
1591#if USE_ITT_BUILD && USE_ITT_NOTIFY
1592 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1593 KMP_ITT_DEBUG) &&
1594 __kmp_forkjoin_frames_mode == 3 &&
1595 parent_team->t.t_active_level == 1 // only report frames at level 1
1596 && master_th->th.th_teams_size.nteams == 1) {
1597 kmp_uint64 tmp_time = __itt_get_timestamp();
1598 master_th->th.th_frame_time = tmp_time;
1599 parent_team->t.t_region_time = tmp_time;
1600 }
1601 if (__itt_stack_caller_create_ptr) {
1602 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1603 // create new stack stitching id before entering fork barrier
1604 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1605 }
1606#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1607#if KMP_AFFINITY_SUPPORTED
1608 __kmp_partition_places(parent_team);
1609#endif
1610
1611 KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1612 "master_th=%p, gtid=%d\n",
1613 root, parent_team, master_th, gtid));
1614 __kmp_internal_fork(loc, gtid, parent_team);
1615 KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1616 "master_th=%p, gtid=%d\n",
1617 root, parent_team, master_th, gtid));
1618
1619 if (call_context == fork_context_gnu)
1620 return TRUE;
1621
1622 /* Invoke microtask for PRIMARY thread */
1623 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1624 parent_team->t.t_id, parent_team->t.t_pkfn));
1625
1626 if (!parent_team->t.t_invoke(gtid)) {
1627 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1628 }
1629 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1630 parent_team->t.t_id, parent_team->t.t_pkfn));
1631 KMP_MB(); /* Flush all pending memory write invalidates. */
1632
1633 KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1634
1635 return TRUE;
1636}
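// Illustrative sketch (not part of the runtime logic): __kmp_fork_in_teams
// handles a parallel closely nested inside a teams construct, e.g.
//
//   #pragma omp teams num_teams(2) thread_limit(8)
//   #pragma omp parallel num_threads(4)
//   { /* work for the threads of each team */ }
//
// As the num_threads handling above shows, the clause can only shrink the
// team relative to th_teams_size.nth; larger requests are ignored.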
1637
1638// Create a serialized parallel region
1639static inline int
1640__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1641 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1642 kmp_info_t *master_th, kmp_team_t *parent_team,
1643#if OMPT_SUPPORT
1644 ompt_data_t *ompt_parallel_data, void **return_address,
1645 ompt_data_t **parent_task_data,
1646#endif
1647 kmp_va_list ap) {
1648 kmp_team_t *team;
1649 int i;
1650 void **argv;
1651
1652/* josh todo: hypothetical question: what do we do for OS X*? */
1653#if KMP_OS_LINUX && \
1654 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1655 void *args[argc];
1656#else
1657 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1658#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1659 KMP_ARCH_AARCH64) */
1660
1661 KA_TRACE(
1662 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1663
1664 __kmpc_serialized_parallel(loc, gtid);
1665
1666#if OMPD_SUPPORT
1667 master_th->th.th_serial_team->t.t_pkfn = microtask;
1668#endif
1669
1670 if (call_context == fork_context_intel) {
1671 /* TODO this sucks, use the compiler itself to pass args! :) */
1672 master_th->th.th_serial_team->t.t_ident = loc;
1673 if (!ap) {
1674 // revert change made in __kmpc_serialized_parallel()
1675 master_th->th.th_serial_team->t.t_level--;
1676// Get args from parent team for teams construct
1677
1678#if OMPT_SUPPORT
1679 void *dummy;
1680 void **exit_frame_p;
1681 ompt_task_info_t *task_info;
1682 ompt_lw_taskteam_t lw_taskteam;
1683
1684 if (ompt_enabled.enabled) {
1685 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1686 ompt_parallel_data, *return_address);
1687
1688 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
 1689 // don't use lw_taskteam after linking. content was swapped
1690 task_info = OMPT_CUR_TASK_INFO(master_th);
1691 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1692 if (ompt_enabled.ompt_callback_implicit_task) {
1693 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1694 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1695 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1696 &(task_info->task_data), 1,
1697 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1698 }
1699
1700 /* OMPT state */
1701 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1702 } else {
1703 exit_frame_p = &dummy;
1704 }
1705#endif
1706
1707 {
1708 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1709 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1710 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1711#if OMPT_SUPPORT
1712 ,
1713 exit_frame_p
1714#endif
1715 );
1716 }
1717
1718#if OMPT_SUPPORT
1719 if (ompt_enabled.enabled) {
1720 *exit_frame_p = NULL;
1721 if (ompt_enabled.ompt_callback_implicit_task) {
1722 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1723 ompt_scope_end, NULL, &(task_info->task_data), 1,
1724 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1725 }
1726 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1727 __ompt_lw_taskteam_unlink(master_th);
1728 if (ompt_enabled.ompt_callback_parallel_end) {
1729 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1730 ompt_parallel_data, *parent_task_data,
1731 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1732 }
1733 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1734 }
1735#endif
1736 } else if (microtask == (microtask_t)__kmp_teams_master) {
1737 KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1738 team = master_th->th.th_team;
1739 // team->t.t_pkfn = microtask;
1740 team->t.t_invoke = invoker;
1741 __kmp_alloc_argv_entries(argc, team, TRUE);
1742 team->t.t_argc = argc;
1743 argv = (void **)team->t.t_argv;
1744 if (ap) {
1745 for (i = argc - 1; i >= 0; --i)
1746 *argv++ = va_arg(kmp_va_deref(ap), void *);
1747 } else {
1748 for (i = 0; i < argc; ++i)
1749 // Get args from parent team for teams construct
1750 argv[i] = parent_team->t.t_argv[i];
1751 }
1752 // AC: revert change made in __kmpc_serialized_parallel()
1753 // because initial code in teams should have level=0
1754 team->t.t_level--;
1755 // AC: call special invoker for outer "parallel" of teams construct
1756 invoker(gtid);
1757#if OMPT_SUPPORT
1758 if (ompt_enabled.enabled) {
1759 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1760 if (ompt_enabled.ompt_callback_implicit_task) {
1761 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1762 ompt_scope_end, NULL, &(task_info->task_data), 0,
1763 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1764 }
1765 if (ompt_enabled.ompt_callback_parallel_end) {
1766 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1767 ompt_parallel_data, *parent_task_data,
1768 OMPT_INVOKER(call_context) | ompt_parallel_league,
1769 *return_address);
1770 }
1771 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1772 }
1773#endif
1774 } else {
1775 argv = args;
1776 for (i = argc - 1; i >= 0; --i)
1777 *argv++ = va_arg(kmp_va_deref(ap), void *);
1778 KMP_MB();
1779
1780#if OMPT_SUPPORT
1781 void *dummy;
1782 void **exit_frame_p;
1783 ompt_task_info_t *task_info;
1784 ompt_lw_taskteam_t lw_taskteam;
1785 ompt_data_t *implicit_task_data;
1786
1787 if (ompt_enabled.enabled) {
1788 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1789 ompt_parallel_data, *return_address);
1790 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
 1791 // don't use lw_taskteam after linking. content was swapped
1792 task_info = OMPT_CUR_TASK_INFO(master_th);
1793 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1794
1795 /* OMPT implicit task begin */
1796 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1797 if (ompt_enabled.ompt_callback_implicit_task) {
1798 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1799 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1800 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1801 ompt_task_implicit);
1802 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1803 }
1804
1805 /* OMPT state */
1806 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1807 } else {
1808 exit_frame_p = &dummy;
1809 }
1810#endif
1811
1812 {
1813 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1814 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1815 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1816#if OMPT_SUPPORT
1817 ,
1818 exit_frame_p
1819#endif
1820 );
1821 }
1822
1823#if OMPT_SUPPORT
1824 if (ompt_enabled.enabled) {
1825 *exit_frame_p = NULL;
1826 if (ompt_enabled.ompt_callback_implicit_task) {
1827 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1828 ompt_scope_end, NULL, &(task_info->task_data), 1,
1829 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1830 }
1831
1832 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1833 __ompt_lw_taskteam_unlink(master_th);
1834 if (ompt_enabled.ompt_callback_parallel_end) {
1835 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1836 ompt_parallel_data, *parent_task_data,
1837 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1838 }
1839 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1840 }
1841#endif
1842 }
1843 } else if (call_context == fork_context_gnu) {
1844#if OMPT_SUPPORT
1845 if (ompt_enabled.enabled) {
1846 ompt_lw_taskteam_t lwt;
1847 __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1848 *return_address);
1849
1850 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1851 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1852 }
 1853// don't use lw_taskteam after linking. content was swapped
1854#endif
1855
1856 // we were called from GNU native code
1857 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1858 return FALSE;
1859 } else {
1860 KMP_ASSERT2(call_context < fork_context_last,
1861 "__kmp_serial_fork_call: unknown fork_context parameter");
1862 }
1863
1864 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1865 KMP_MB();
1866 return FALSE;
1867}
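// For illustration (a sketch, not normative): __kmp_serial_fork_call is the
// nthreads == 1 path of __kmp_fork_call, reached e.g. when the active-levels
// limit already blocks further nesting:
//
//   omp_set_max_active_levels(1);
//   #pragma omp parallel          // outer region may go parallel
//   #pragma omp parallel          // inner region is serialized here
//   { ... }
//
// The primary thread then runs the outlined microtask itself via
// __kmp_invoke_microtask in the context of its serial team.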
1868
1869/* most of the work for a fork */
1870/* return true if we really went parallel, false if serialized */
1871int __kmp_fork_call(ident_t *loc, int gtid,
1872 enum fork_context_e call_context, // Intel, GNU, ...
1873 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1874 kmp_va_list ap) {
1875 void **argv;
1876 int i;
1877 int master_tid;
1878 int master_this_cons;
1879 kmp_team_t *team;
1880 kmp_team_t *parent_team;
1881 kmp_info_t *master_th;
1882 kmp_root_t *root;
1883 int nthreads;
1884 int master_active;
1885 int master_set_numthreads;
1886 int task_thread_limit = 0;
1887 int level;
1888 int active_level;
1889 int teams_level;
1890#if KMP_NESTED_HOT_TEAMS
1891 kmp_hot_team_ptr_t **p_hot_teams;
1892#endif
1893 { // KMP_TIME_BLOCK
1894 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1895 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1896
1897 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1898 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1899 /* Some systems prefer the stack for the root thread(s) to start with */
1900 /* some gap from the parent stack to prevent false sharing. */
1901 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1902 /* These 2 lines below are so this does not get optimized out */
1903 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1904 __kmp_stkpadding += (short)((kmp_int64)dummy);
1905 }
1906
1907 /* initialize if needed */
1908 KMP_DEBUG_ASSERT(
1909 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1910 if (!TCR_4(__kmp_init_parallel))
1911 __kmp_parallel_initialize();
1912 __kmp_resume_if_soft_paused();
1913
1914 /* setup current data */
1915 // AC: potentially unsafe, not in sync with library shutdown,
1916 // __kmp_threads can be freed
1917 master_th = __kmp_threads[gtid];
1918
1919 parent_team = master_th->th.th_team;
1920 master_tid = master_th->th.th_info.ds.ds_tid;
1921 master_this_cons = master_th->th.th_local.this_construct;
1922 root = master_th->th.th_root;
1923 master_active = root->r.r_active;
1924 master_set_numthreads = master_th->th.th_set_nproc;
1925 task_thread_limit =
1926 master_th->th.th_current_task->td_icvs.task_thread_limit;
1927
1928#if OMPT_SUPPORT
1929 ompt_data_t ompt_parallel_data = ompt_data_none;
1930 ompt_data_t *parent_task_data;
1931 ompt_frame_t *ompt_frame;
1932 void *return_address = NULL;
1933
1934 if (ompt_enabled.enabled) {
1935 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1936 NULL, NULL);
1937 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1938 }
1939#endif
1940
1941 // Assign affinity to root thread if it hasn't happened yet
1942 __kmp_assign_root_init_mask();
1943
1944 // Nested level will be an index in the nested nthreads array
1945 level = parent_team->t.t_level;
1946 // used to launch non-serial teams even if nested is not allowed
1947 active_level = parent_team->t.t_active_level;
1948 // needed to check nesting inside the teams
1949 teams_level = master_th->th.th_teams_level;
1950#if KMP_NESTED_HOT_TEAMS
1951 p_hot_teams = &master_th->th.th_hot_teams;
1952 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1953 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1954 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1955 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1956 // it is either actual or not needed (when active_level > 0)
1957 (*p_hot_teams)[0].hot_team_nth = 1;
1958 }
1959#endif
1960
1961#if OMPT_SUPPORT
1962 if (ompt_enabled.enabled) {
1963 if (ompt_enabled.ompt_callback_parallel_begin) {
1964 int team_size = master_set_numthreads
1965 ? master_set_numthreads
1966 : get__nproc_2(parent_team, master_tid);
1967 int flags = OMPT_INVOKER(call_context) |
1968 ((microtask == (microtask_t)__kmp_teams_master)
1969 ? ompt_parallel_league
1970 : ompt_parallel_team);
1971 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1972 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1973 return_address);
1974 }
1975 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1976 }
1977#endif
1978
1979 master_th->th.th_ident = loc;
1980
1981 // Parallel closely nested in teams construct:
1982 if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1983 return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1984 call_context, microtask, invoker,
1985 master_set_numthreads, level,
1986#if OMPT_SUPPORT
1987 ompt_parallel_data, return_address,
1988#endif
1989 ap);
1990 } // End parallel closely nested in teams construct
1991
1992#if KMP_DEBUG
1993 if (__kmp_tasking_mode != tskm_immediate_exec) {
1994 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1995 parent_team->t.t_task_team[master_th->th.th_task_state]);
1996 }
1997#endif
1998
1999 // Need this to happen before we determine the number of threads, not while
2000 // we are allocating the team
2001 //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2002
2003 // Determine the number of threads
2004 int enter_teams =
2005 __kmp_is_entering_teams(active_level, level, teams_level, ap);
2006 if ((!enter_teams &&
2007 (parent_team->t.t_active_level >=
2008 master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2009 (__kmp_library == library_serial)) {
2010 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2011 nthreads = 1;
2012 } else {
2013 nthreads = master_set_numthreads
2014 ? master_set_numthreads
2015 // TODO: get nproc directly from current task
2016 : get__nproc_2(parent_team, master_tid);
 2017 // Use the thread_limit set for the current target task if it exists;
 2018 // otherwise go with the deduced nthreads
2019 nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2020 ? task_thread_limit
2021 : nthreads;
 2022 // Check whether we need to take the forkjoin lock (no need for a
 2023 // serialized parallel outside of a teams construct).
2024 if (nthreads > 1) {
2025 /* determine how many new threads we can use */
2026 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2027 /* AC: If we execute teams from parallel region (on host), then teams
2028 should be created but each can only have 1 thread if nesting is
2029 disabled. If teams called from serial region, then teams and their
2030 threads should be created regardless of the nesting setting. */
2031 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2032 nthreads, enter_teams);
2033 if (nthreads == 1) {
2034 // Free lock for single thread execution here; for multi-thread
 2035 // execution it will be freed later, after the team of threads is
 2036 // created and initialized
2037 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2038 }
2039 }
2040 }
2041 KMP_DEBUG_ASSERT(nthreads > 0);
2042
2043 // If we temporarily changed the set number of threads then restore it now
2044 master_th->th.th_set_nproc = 0;
2045
2046 if (nthreads == 1) {
2047 return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2048 invoker, master_th, parent_team,
2049#if OMPT_SUPPORT
2050 &ompt_parallel_data, &return_address,
2051 &parent_task_data,
2052#endif
2053 ap);
2054 } // if (nthreads == 1)
2055
 2056 // GEH: only modify the executing flag in the non-serialized case; the
 2057 // serialized case is handled in __kmpc_serialized_parallel
2058 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2059 "curtask=%p, curtask_max_aclevel=%d\n",
2060 parent_team->t.t_active_level, master_th,
2061 master_th->th.th_current_task,
2062 master_th->th.th_current_task->td_icvs.max_active_levels));
2063 // TODO: GEH - cannot do this assertion because root thread not set up as
2064 // executing
2065 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2066 master_th->th.th_current_task->td_flags.executing = 0;
2067
2068 if (!master_th->th.th_teams_microtask || level > teams_level) {
2069 /* Increment our nested depth level */
2070 KMP_ATOMIC_INC(&root->r.r_in_parallel);
2071 }
2072
2073 // See if we need to make a copy of the ICVs.
2074 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2075 if ((level + 1 < __kmp_nested_nth.used) &&
2076 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2077 nthreads_icv = __kmp_nested_nth.nth[level + 1];
2078 } else {
2079 nthreads_icv = 0; // don't update
2080 }
2081
2082 // Figure out the proc_bind_policy for the new team.
2083 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2084 // proc_bind_default means don't update
2085 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2086 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2087 proc_bind = proc_bind_false;
2088 } else {
2089 // No proc_bind clause specified; use current proc-bind-var for this
2090 // parallel region
2091 if (proc_bind == proc_bind_default) {
2092 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2093 }
2094 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2095 if (master_th->th.th_teams_microtask &&
2096 microtask == (microtask_t)__kmp_teams_master) {
2097 proc_bind = __kmp_teams_proc_bind;
2098 }
2099 /* else: The proc_bind policy was specified explicitly on parallel clause.
2100 This overrides proc-bind-var for this parallel region, but does not
2101 change proc-bind-var. */
2102 // Figure the value of proc-bind-var for the child threads.
2103 if ((level + 1 < __kmp_nested_proc_bind.used) &&
2104 (__kmp_nested_proc_bind.bind_types[level + 1] !=
2105 master_th->th.th_current_task->td_icvs.proc_bind)) {
2106 // Do not modify the proc bind icv for the two teams construct forks
2107 // They just let the proc bind icv pass through
2108 if (!master_th->th.th_teams_microtask ||
2109 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2110 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2111 }
2112 }
2113
2114 // Reset for next parallel region
2115 master_th->th.th_set_proc_bind = proc_bind_default;
2116
2117 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2118 kmp_internal_control_t new_icvs;
2119 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2120 new_icvs.next = NULL;
2121 if (nthreads_icv > 0) {
2122 new_icvs.nproc = nthreads_icv;
2123 }
2124 if (proc_bind_icv != proc_bind_default) {
2125 new_icvs.proc_bind = proc_bind_icv;
2126 }
2127
2128 /* allocate a new parallel team */
2129 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2130 team = __kmp_allocate_team(root, nthreads, nthreads,
2131#if OMPT_SUPPORT
2132 ompt_parallel_data,
2133#endif
2134 proc_bind, &new_icvs,
2135 argc USE_NESTED_HOT_ARG(master_th));
2136 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2137 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2138 } else {
2139 /* allocate a new parallel team */
2140 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2141 team = __kmp_allocate_team(root, nthreads, nthreads,
2142#if OMPT_SUPPORT
2143 ompt_parallel_data,
2144#endif
2145 proc_bind,
2146 &master_th->th.th_current_task->td_icvs,
2147 argc USE_NESTED_HOT_ARG(master_th));
2148 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2149 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2150 &master_th->th.th_current_task->td_icvs);
2151 }
2152 KF_TRACE(
2153 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2154
2155 /* setup the new team */
2156 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2157 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2158 KMP_CHECK_UPDATE(team->t.t_ident, loc);
2159 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2160 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2161#if OMPT_SUPPORT
2162 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2163 return_address);
2164#endif
2165 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2166 // TODO: parent_team->t.t_level == INT_MAX ???
2167 if (!master_th->th.th_teams_microtask || level > teams_level) {
2168 int new_level = parent_team->t.t_level + 1;
2169 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2170 new_level = parent_team->t.t_active_level + 1;
2171 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2172 } else {
2173 // AC: Do not increase parallel level at start of the teams construct
2174 int new_level = parent_team->t.t_level;
2175 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2176 new_level = parent_team->t.t_active_level;
2177 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2178 }
2179 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2180 // set primary thread's schedule as new run-time schedule
2181 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2182
2183 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2184 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2185
2186 // Update the floating point rounding in the team if required.
2187 propagateFPControl(team);
2188#if OMPD_SUPPORT
2189 if (ompd_state & OMPD_ENABLE_BP)
2190 ompd_bp_parallel_begin();
2191#endif
2192
2193 if (__kmp_tasking_mode != tskm_immediate_exec) {
2194 // Set primary thread's task team to team's task team. Unless this is hot
2195 // team, it should be NULL.
2196 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2197 parent_team->t.t_task_team[master_th->th.th_task_state]);
2198 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2199 "%p, new task_team %p / team %p\n",
2200 __kmp_gtid_from_thread(master_th),
2201 master_th->th.th_task_team, parent_team,
2202 team->t.t_task_team[master_th->th.th_task_state], team));
2203
2204 if (active_level || master_th->th.th_task_team) {
2205 // Take a memo of primary thread's task_state
2206 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2207 if (master_th->th.th_task_state_top >=
2208 master_th->th.th_task_state_stack_sz) { // increase size
2209 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2210 kmp_uint8 *old_stack, *new_stack;
2211 kmp_uint32 i;
2212 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2213 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2214 new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2215 }
2216 for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2217 ++i) { // zero-init rest of stack
2218 new_stack[i] = 0;
2219 }
2220 old_stack = master_th->th.th_task_state_memo_stack;
2221 master_th->th.th_task_state_memo_stack = new_stack;
2222 master_th->th.th_task_state_stack_sz = new_size;
2223 __kmp_free(old_stack);
2224 }
2225 // Store primary thread's task_state on stack
2226 master_th->th
2227 .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2228 master_th->th.th_task_state;
2229 master_th->th.th_task_state_top++;
2230#if KMP_NESTED_HOT_TEAMS
2231 if (master_th->th.th_hot_teams &&
2232 active_level < __kmp_hot_teams_max_level &&
2233 team == master_th->th.th_hot_teams[active_level].hot_team) {
2234 // Restore primary thread's nested state if nested hot team
2235 master_th->th.th_task_state =
2236 master_th->th
2237 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2238 } else {
2239#endif
2240 master_th->th.th_task_state = 0;
2241#if KMP_NESTED_HOT_TEAMS
2242 }
2243#endif
2244 }
2245#if !KMP_NESTED_HOT_TEAMS
2246 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2247 (team == root->r.r_hot_team));
2248#endif
2249 }
2250
2251 KA_TRACE(
2252 20,
2253 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2254 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2255 team->t.t_nproc));
2256 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2257 (team->t.t_master_tid == 0 &&
2258 (team->t.t_parent == root->r.r_root_team ||
2259 team->t.t_parent->t.t_serialized)));
2260 KMP_MB();
2261
2262 /* now, setup the arguments */
2263 argv = (void **)team->t.t_argv;
2264 if (ap) {
2265 for (i = argc - 1; i >= 0; --i) {
2266 void *new_argv = va_arg(kmp_va_deref(ap), void *);
2267 KMP_CHECK_UPDATE(*argv, new_argv);
2268 argv++;
2269 }
2270 } else {
2271 for (i = 0; i < argc; ++i) {
2272 // Get args from parent team for teams construct
2273 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2274 }
2275 }
2276
2277 /* now actually fork the threads */
2278 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2279 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2280 root->r.r_active = TRUE;
2281
2282 __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2283 __kmp_setup_icv_copy(team, nthreads,
2284 &master_th->th.th_current_task->td_icvs, loc);
2285
2286#if OMPT_SUPPORT
2287 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2288#endif
2289
2290 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2291
2292#if USE_ITT_BUILD
2293 if (team->t.t_active_level == 1 // only report frames at level 1
2294 && !master_th->th.th_teams_microtask) { // not in teams construct
2295#if USE_ITT_NOTIFY
2296 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2297 (__kmp_forkjoin_frames_mode == 3 ||
2298 __kmp_forkjoin_frames_mode == 1)) {
2299 kmp_uint64 tmp_time = 0;
2300 if (__itt_get_timestamp_ptr)
2301 tmp_time = __itt_get_timestamp();
2302 // Internal fork - report frame begin
2303 master_th->th.th_frame_time = tmp_time;
2304 if (__kmp_forkjoin_frames_mode == 3)
2305 team->t.t_region_time = tmp_time;
2306 } else
2307// only one notification scheme (either "submit" or "forking/joined", not both)
2308#endif /* USE_ITT_NOTIFY */
2309 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2310 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2311 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2312 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2313 }
2314 }
2315#endif /* USE_ITT_BUILD */
2316
2317 /* now go on and do the work */
2318 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2319 KMP_MB();
2320 KF_TRACE(10,
2321 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2322 root, team, master_th, gtid));
2323
2324#if USE_ITT_BUILD
2325 if (__itt_stack_caller_create_ptr) {
2326 // create new stack stitching id before entering fork barrier
2327 if (!enter_teams) {
2328 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2329 team->t.t_stack_id = __kmp_itt_stack_caller_create();
2330 } else if (parent_team->t.t_serialized) {
2331 // keep stack stitching id in the serialized parent_team;
2332 // current team will be used for parallel inside the teams;
2333 // if parent_team is active, then it already keeps stack stitching id
2334 // for the league of teams
2335 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2336 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2337 }
2338 }
2339#endif /* USE_ITT_BUILD */
2340
2341 // AC: skip __kmp_internal_fork at teams construct, let only primary
2342 // threads execute
2343 if (ap) {
2344 __kmp_internal_fork(loc, gtid, team);
2345 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2346 "master_th=%p, gtid=%d\n",
2347 root, team, master_th, gtid));
2348 }
2349
2350 if (call_context == fork_context_gnu) {
2351 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2352 return TRUE;
2353 }
2354
2355 /* Invoke microtask for PRIMARY thread */
2356 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2357 team->t.t_id, team->t.t_pkfn));
2358 } // END of timer KMP_fork_call block
2359
2360#if KMP_STATS_ENABLED
2361 // If beginning a teams construct, then change thread state
2362 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2363 if (!ap) {
2364 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2365 }
2366#endif
2367
2368 if (!team->t.t_invoke(gtid)) {
2369 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2370 }
2371
2372#if KMP_STATS_ENABLED
2373 // If was beginning of a teams construct, then reset thread state
2374 if (!ap) {
2375 KMP_SET_THREAD_STATE(previous_state);
2376 }
2377#endif
2378
2379 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2380 team->t.t_id, team->t.t_pkfn));
2381 KMP_MB(); /* Flush all pending memory write invalidates. */
2382
2383 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2384#if OMPT_SUPPORT
2385 if (ompt_enabled.enabled) {
2386 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2387 }
2388#endif
2389
2390 return TRUE;
2391}
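// For illustration only (a sketch of the usual call path; other entry points
// exist): the compiler lowers
//
//   #pragma omp parallel
//   { body(); }
//
// to roughly "__kmpc_fork_call(&loc, argc, outlined_microtask, args...)",
// which forwards to __kmp_fork_call above and, once the region completes, to
// __kmp_join_call below. The return value distinguishes a real fork (TRUE)
// from a serialized region (FALSE).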
2392
2393#if OMPT_SUPPORT
2394static inline void __kmp_join_restore_state(kmp_info_t *thread,
2395 kmp_team_t *team) {
2396 // restore state outside the region
2397 thread->th.ompt_thread_info.state =
2398 ((team->t.t_serialized) ? ompt_state_work_serial
2399 : ompt_state_work_parallel);
2400}
2401
2402static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2403 kmp_team_t *team, ompt_data_t *parallel_data,
2404 int flags, void *codeptr) {
2405 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2406 if (ompt_enabled.ompt_callback_parallel_end) {
2407 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2408 parallel_data, &(task_info->task_data), flags, codeptr);
2409 }
2410
2411 task_info->frame.enter_frame = ompt_data_none;
2412 __kmp_join_restore_state(thread, team);
2413}
2414#endif
2415
2416void __kmp_join_call(ident_t *loc, int gtid
2417#if OMPT_SUPPORT
2418 ,
2419 enum fork_context_e fork_context
2420#endif
2421 ,
2422 int exit_teams) {
2423 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2424 kmp_team_t *team;
2425 kmp_team_t *parent_team;
2426 kmp_info_t *master_th;
2427 kmp_root_t *root;
2428 int master_active;
2429
2430 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2431
2432 /* setup current data */
2433 master_th = __kmp_threads[gtid];
2434 root = master_th->th.th_root;
2435 team = master_th->th.th_team;
2436 parent_team = team->t.t_parent;
2437
2438 master_th->th.th_ident = loc;
2439
2440#if OMPT_SUPPORT
2441 void *team_microtask = (void *)team->t.t_pkfn;
2442 // For GOMP interface with serialized parallel, need the
2443 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2444 // and end-parallel events.
2445 if (ompt_enabled.enabled &&
2446 !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2447 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2448 }
2449#endif
2450
2451#if KMP_DEBUG
2452 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2453 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2454 "th_task_team = %p\n",
2455 __kmp_gtid_from_thread(master_th), team,
2456 team->t.t_task_team[master_th->th.th_task_state],
2457 master_th->th.th_task_team));
2458 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2459 team->t.t_task_team[master_th->th.th_task_state]);
2460 }
2461#endif
2462
2463 if (team->t.t_serialized) {
2464 if (master_th->th.th_teams_microtask) {
2465 // We are in teams construct
2466 int level = team->t.t_level;
2467 int tlevel = master_th->th.th_teams_level;
2468 if (level == tlevel) {
2469 // AC: we haven't incremented it earlier at start of teams construct,
2470 // so do it here - at the end of teams construct
2471 team->t.t_level++;
2472 } else if (level == tlevel + 1) {
2473 // AC: we are exiting parallel inside teams, need to increment
2474 // serialization in order to restore it in the next call to
2475 // __kmpc_end_serialized_parallel
2476 team->t.t_serialized++;
2477 }
2478 }
 2479 __kmpc_end_serialized_parallel(loc, gtid);
 2480
2481#if OMPT_SUPPORT
2482 if (ompt_enabled.enabled) {
2483 if (fork_context == fork_context_gnu) {
2484 __ompt_lw_taskteam_unlink(master_th);
2485 }
2486 __kmp_join_restore_state(master_th, parent_team);
2487 }
2488#endif
2489
2490 return;
2491 }
2492
2493 master_active = team->t.t_master_active;
2494
2495 if (!exit_teams) {
2496 // AC: No barrier for internal teams at exit from teams construct.
2497 // But there is barrier for external team (league).
2498 __kmp_internal_join(loc, gtid, team);
2499#if USE_ITT_BUILD
2500 if (__itt_stack_caller_create_ptr) {
2501 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2502 // destroy the stack stitching id after join barrier
2503 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2504 team->t.t_stack_id = NULL;
2505 }
2506#endif
2507 } else {
2508 master_th->th.th_task_state =
2509 0; // AC: no tasking in teams (out of any parallel)
2510#if USE_ITT_BUILD
2511 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2512 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2513 // destroy the stack stitching id on exit from the teams construct
2514 // if parent_team is active, then the id will be destroyed later on
2515 // by master of the league of teams
2516 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2517 parent_team->t.t_stack_id = NULL;
2518 }
2519#endif
2520 }
2521
2522 KMP_MB();
2523
2524#if OMPT_SUPPORT
2525 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2526 void *codeptr = team->t.ompt_team_info.master_return_address;
2527#endif
2528
2529#if USE_ITT_BUILD
2530 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2531 if (team->t.t_active_level == 1 &&
2532 (!master_th->th.th_teams_microtask || /* not in teams construct */
2533 master_th->th.th_teams_size.nteams == 1)) {
2534 master_th->th.th_ident = loc;
2535 // only one notification scheme (either "submit" or "forking/joined", not
2536 // both)
2537 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2538 __kmp_forkjoin_frames_mode == 3)
2539 __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2540 master_th->th.th_frame_time, 0, loc,
2541 master_th->th.th_team_nproc, 1);
2542 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2543 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2544 __kmp_itt_region_joined(gtid);
2545 } // active_level == 1
2546#endif /* USE_ITT_BUILD */
2547
2548#if KMP_AFFINITY_SUPPORTED
2549 if (!exit_teams) {
2550 // Restore master thread's partition.
2551 master_th->th.th_first_place = team->t.t_first_place;
2552 master_th->th.th_last_place = team->t.t_last_place;
2553 }
2554#endif // KMP_AFFINITY_SUPPORTED
2555
2556 if (master_th->th.th_teams_microtask && !exit_teams &&
2557 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2558 team->t.t_level == master_th->th.th_teams_level + 1) {
2559// AC: We need to leave the team structure intact at the end of parallel
2560// inside the teams construct, so that at the next parallel same (hot) team
2561// works, only adjust nesting levels
2562#if OMPT_SUPPORT
2563 ompt_data_t ompt_parallel_data = ompt_data_none;
2564 if (ompt_enabled.enabled) {
2565 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2566 if (ompt_enabled.ompt_callback_implicit_task) {
2567 int ompt_team_size = team->t.t_nproc;
2568 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2569 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2570 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2571 }
2572 task_info->frame.exit_frame = ompt_data_none;
2573 task_info->task_data = ompt_data_none;
2574 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2575 __ompt_lw_taskteam_unlink(master_th);
2576 }
2577#endif
2578 /* Decrement our nested depth level */
2579 team->t.t_level--;
2580 team->t.t_active_level--;
2581 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2582
2583 // Restore number of threads in the team if needed. This code relies on
2584 // the proper adjustment of th_teams_size.nth after the fork in
2585 // __kmp_teams_master on each teams primary thread in the case that
2586 // __kmp_reserve_threads reduced it.
2587 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2588 int old_num = master_th->th.th_team_nproc;
2589 int new_num = master_th->th.th_teams_size.nth;
2590 kmp_info_t **other_threads = team->t.t_threads;
2591 team->t.t_nproc = new_num;
2592 for (int i = 0; i < old_num; ++i) {
2593 other_threads[i]->th.th_team_nproc = new_num;
2594 }
2595 // Adjust states of non-used threads of the team
2596 for (int i = old_num; i < new_num; ++i) {
2597 // Re-initialize thread's barrier data.
2598 KMP_DEBUG_ASSERT(other_threads[i]);
2599 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2600 for (int b = 0; b < bs_last_barrier; ++b) {
2601 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2602 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2603#if USE_DEBUGGER
2604 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2605#endif
2606 }
2607 if (__kmp_tasking_mode != tskm_immediate_exec) {
2608 // Synchronize thread's task state
2609 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2610 }
2611 }
2612 }
2613
2614#if OMPT_SUPPORT
2615 if (ompt_enabled.enabled) {
2616 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2617 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2618 }
2619#endif
2620
2621 return;
2622 }
2623
2624 /* do cleanup and restore the parent team */
2625 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2626 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2627
2628 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2629
2630 /* jc: The following lock has instructions with REL and ACQ semantics,
2631 separating the parallel user code called in this parallel region
2632 from the serial user code called after this function returns. */
2633 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2634
2635 if (!master_th->th.th_teams_microtask ||
2636 team->t.t_level > master_th->th.th_teams_level) {
2637 /* Decrement our nested depth level */
2638 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2639 }
2640 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2641
2642#if OMPT_SUPPORT
2643 if (ompt_enabled.enabled) {
2644 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2645 if (ompt_enabled.ompt_callback_implicit_task) {
2646 int flags = (team_microtask == (void *)__kmp_teams_master)
2647 ? ompt_task_initial
2648 : ompt_task_implicit;
2649 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2650 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2651 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2652 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2653 }
2654 task_info->frame.exit_frame = ompt_data_none;
2655 task_info->task_data = ompt_data_none;
2656 }
2657#endif
2658
2659 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2660 master_th, team));
2661 __kmp_pop_current_task_from_thread(master_th);
2662
2663 master_th->th.th_def_allocator = team->t.t_def_allocator;
2664
2665#if OMPD_SUPPORT
2666 if (ompd_state & OMPD_ENABLE_BP)
2667 ompd_bp_parallel_end();
2668#endif
2669 updateHWFPControl(team);
2670
2671 if (root->r.r_active != master_active)
2672 root->r.r_active = master_active;
2673
2674 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2675 master_th)); // this will free worker threads
2676
2677 /* this race was fun to find. make sure the following is in the critical
2678 region otherwise assertions may fail occasionally since the old team may be
2679 reallocated and the hierarchy appears inconsistent. it is actually safe to
2680 run and won't cause any bugs, but will cause those assertion failures. it's
2681 only one deref&assign so might as well put this in the critical region */
2682 master_th->th.th_team = parent_team;
2683 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2684 master_th->th.th_team_master = parent_team->t.t_threads[0];
2685 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2686
2687 /* restore serialized team, if need be */
2688 if (parent_team->t.t_serialized &&
2689 parent_team != master_th->th.th_serial_team &&
2690 parent_team != root->r.r_root_team) {
2691 __kmp_free_team(root,
2692 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2693 master_th->th.th_serial_team = parent_team;
2694 }
2695
2696 if (__kmp_tasking_mode != tskm_immediate_exec) {
2697 if (master_th->th.th_task_state_top >
2698 0) { // Restore task state from memo stack
2699 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2700 // Remember primary thread's state if we re-use this nested hot team
2701 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2702 master_th->th.th_task_state;
2703 --master_th->th.th_task_state_top; // pop
2704 // Now restore state at this level
2705 master_th->th.th_task_state =
2706 master_th->th
2707 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2708 } else if (team != root->r.r_hot_team) {
 2709 // Reset the primary thread's task state if this is not the hot team,
 2710 // because in that case all the worker threads will be freed and their
 2711 // task state reset. If the primary's is not reset as well, the task
 2712 // state becomes inconsistent.
2713 master_th->th.th_task_state = 0;
2714 }
2715 // Copy the task team from the parent team to the primary thread
2716 master_th->th.th_task_team =
2717 parent_team->t.t_task_team[master_th->th.th_task_state];
2718 KA_TRACE(20,
2719 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2720 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2721 parent_team));
2722 }
2723
2724 // TODO: GEH - cannot do this assertion because root thread not set up as
2725 // executing
2726 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2727 master_th->th.th_current_task->td_flags.executing = 1;
2728
2729 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2730
2731#if KMP_AFFINITY_SUPPORTED
2732 if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2733 __kmp_reset_root_init_mask(gtid);
2734 }
2735#endif
2736#if OMPT_SUPPORT
2737 int flags =
2738 OMPT_INVOKER(fork_context) |
2739 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2740 : ompt_parallel_team);
2741 if (ompt_enabled.enabled) {
2742 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2743 codeptr);
2744 }
2745#endif
2746
2747 KMP_MB();
2748 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2749}
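// Illustrative note (hedged, not normative): __kmp_join_call implements the
// implicit join at the closing brace of a parallel region, e.g.
//
//   #pragma omp parallel
//   { ... }   // workers rejoin and the team is released back here
//
// On the primary thread it restores the parent team's view (team pointer,
// th_team_nproc, task team, dispatch buffer) before serial execution resumes.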
2750
2751/* Check whether we should push an internal control record onto the
2752 serial team stack. If so, do it. */
2753void __kmp_save_internal_controls(kmp_info_t *thread) {
2754
2755 if (thread->th.th_team != thread->th.th_serial_team) {
2756 return;
2757 }
2758 if (thread->th.th_team->t.t_serialized > 1) {
2759 int push = 0;
2760
2761 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2762 push = 1;
2763 } else {
2764 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2765 thread->th.th_team->t.t_serialized) {
2766 push = 1;
2767 }
2768 }
2769 if (push) { /* push a record on the serial team's stack */
2770 kmp_internal_control_t *control =
2771 (kmp_internal_control_t *)__kmp_allocate(
2772 sizeof(kmp_internal_control_t));
2773
2774 copy_icvs(control, &thread->th.th_current_task->td_icvs);
2775
2776 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2777
2778 control->next = thread->th.th_team->t.t_control_stack_top;
2779 thread->th.th_team->t.t_control_stack_top = control;
2780 }
2781 }
2782}
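// For illustration (a sketch under the assumption that the serialized regions
// are created via the usual serialized-parallel path): the control-record
// stack matters when ICVs are changed inside nested serialized regions, e.g.
//
//   #pragma omp parallel if(0)     // serialized region
//   #pragma omp parallel if(0)     // nested serialized region
//   omp_set_num_threads(2);        // pushes a record keyed by nesting level
//
// so the outer level's ICV values can be restored when the inner serialized
// region ends.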
2783
2784/* Changes set_nproc */
2785void __kmp_set_num_threads(int new_nth, int gtid) {
2786 kmp_info_t *thread;
2787 kmp_root_t *root;
2788
2789 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2790 KMP_DEBUG_ASSERT(__kmp_init_serial);
2791
2792 if (new_nth < 1)
2793 new_nth = 1;
2794 else if (new_nth > __kmp_max_nth)
2795 new_nth = __kmp_max_nth;
2796
2797 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2798 thread = __kmp_threads[gtid];
2799 if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2800 return; // nothing to do
2801
2802 __kmp_save_internal_controls(thread);
2803
2804 set__nproc(thread, new_nth);
2805
2806 // If this omp_set_num_threads() call will cause the hot team size to be
2807 // reduced (in the absence of a num_threads clause), then reduce it now,
2808 // rather than waiting for the next parallel region.
2809 root = thread->th.th_root;
2810 if (__kmp_init_parallel && (!root->r.r_active) &&
2811 (root->r.r_hot_team->t.t_nproc > new_nth)
2812#if KMP_NESTED_HOT_TEAMS
2813 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2814#endif
2815 ) {
2816 kmp_team_t *hot_team = root->r.r_hot_team;
2817 int f;
2818
2819 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2820
2821 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2822 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2823 }
2824 // Release the extra threads we don't need any more.
2825 for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2826 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2827 if (__kmp_tasking_mode != tskm_immediate_exec) {
2828 // When decreasing team size, threads no longer in the team should unref
2829 // task team.
2830 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2831 }
2832 __kmp_free_thread(hot_team->t.t_threads[f]);
2833 hot_team->t.t_threads[f] = NULL;
2834 }
2835 hot_team->t.t_nproc = new_nth;
2836#if KMP_NESTED_HOT_TEAMS
2837 if (thread->th.th_hot_teams) {
2838 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2839 thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2840 }
2841#endif
2842
2843 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2844 hot_team->t.b->update_num_threads(new_nth);
2845 __kmp_add_threads_to_team(hot_team, new_nth);
2846 }
2847
2848 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2849
2850 // Update the t_nproc field in the threads that are still active.
2851 for (f = 0; f < new_nth; f++) {
2852 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2853 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2854 }
 2855 // Special flag indicating an omp_set_num_threads() call
2856 hot_team->t.t_size_changed = -1;
2857 }
2858}
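// For illustration (a sketch, not normative): this is the backing
// implementation typically reached from the user-level API, e.g.
//
//   omp_set_num_threads(4);
//   #pragma omp parallel     // team of up to 4 threads, absent other limits
//   { ... }
//
// Note the eager shrink of the root's hot team above when the new value is
// smaller and no parallel region is active, so surplus workers are released
// now rather than at the next fork.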
2859
2860/* Changes max_active_levels */
2861void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2862 kmp_info_t *thread;
2863
2864 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2865 "%d = (%d)\n",
2866 gtid, max_active_levels));
2867 KMP_DEBUG_ASSERT(__kmp_init_serial);
2868
2869 // validate max_active_levels
2870 if (max_active_levels < 0) {
2871 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2872 // We ignore this call if the user has specified a negative value.
2873 // The current setting won't be changed. The last valid setting will be
2874 // used. A warning will be issued (if warnings are allowed as controlled by
2875 // the KMP_WARNINGS env var).
2876 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2877 "max_active_levels for thread %d = (%d)\n",
2878 gtid, max_active_levels));
2879 return;
2880 }
2881 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2882 // it's OK, the max_active_levels is within the valid range: [ 0;
2883 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2884 // We allow a zero value. (implementation defined behavior)
2885 } else {
2886 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2887 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2888 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2889 // Current upper limit is MAX_INT. (implementation defined behavior)
2890 // If the input exceeds the upper limit, we correct the input to be the
2891 // upper limit. (implementation defined behavior)
 2892 // In practice, the flow should never get here as long as the limit is MAX_INT.
2893 }
2894 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2895 "max_active_levels for thread %d = (%d)\n",
2896 gtid, max_active_levels));
2897
2898 thread = __kmp_threads[gtid];
2899
2900 __kmp_save_internal_controls(thread);
2901
2902 set__max_active_levels(thread, max_active_levels);
2903}
2904
2905/* Gets max_active_levels */
2906int __kmp_get_max_active_levels(int gtid) {
2907 kmp_info_t *thread;
2908
2909 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2910 KMP_DEBUG_ASSERT(__kmp_init_serial);
2911
2912 thread = __kmp_threads[gtid];
2913 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2914 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2915 "curtask_maxaclevel=%d\n",
2916 gtid, thread->th.th_current_task,
2917 thread->th.th_current_task->td_icvs.max_active_levels));
2918 return thread->th.th_current_task->td_icvs.max_active_levels;
2919}
2920
2921// nteams-var per-device ICV
2922void __kmp_set_num_teams(int num_teams) {
2923 if (num_teams > 0)
2924 __kmp_nteams = num_teams;
2925}
2926int __kmp_get_max_teams(void) { return __kmp_nteams; }
2927// teams-thread-limit-var per-device ICV
2928void __kmp_set_teams_thread_limit(int limit) {
2929 if (limit > 0)
2930 __kmp_teams_thread_limit = limit;
2931}
2932int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
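// For illustration (a sketch; the routine names below are the OpenMP 5.1 API
// that these per-device ICVs appear to back, not something defined here):
//
//   omp_set_num_teams(8);
//   omp_set_teams_thread_limit(4);
//   #pragma omp teams        // up to 8 teams of at most 4 threads each
//   { ... }
//
// Non-positive arguments are ignored above, leaving the previous values
// intact.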
2933
2934KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2935KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2936
2937/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2938void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2939 kmp_info_t *thread;
2940 kmp_sched_t orig_kind;
2941 // kmp_team_t *team;
2942
2943 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2944 gtid, (int)kind, chunk));
2945 KMP_DEBUG_ASSERT(__kmp_init_serial);
2946
2947 // Check if the kind parameter is valid, correct if needed.
2948 // Valid parameters should fit in one of two intervals - standard or extended:
2949 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2950 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2951 orig_kind = kind;
2952 kind = __kmp_sched_without_mods(kind);
2953
2954 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2955 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2956 // TODO: Hint needs attention in case we change the default schedule.
2957 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2958 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2959 __kmp_msg_null);
2960 kind = kmp_sched_default;
2961 chunk = 0; // ignore chunk value in case of bad kind
2962 }
2963
2964 thread = __kmp_threads[gtid];
2965
2966 __kmp_save_internal_controls(thread);
2967
2968 if (kind < kmp_sched_upper_std) {
2969 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
 2970 // differentiate static chunked vs. unchunked: chunk should be invalid to
 2971 // indicate an unchunked schedule (which is the default)
2972 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2973 } else {
2974 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2975 __kmp_sch_map[kind - kmp_sched_lower - 1];
2976 }
2977 } else {
2978 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2979 // kmp_sched_lower - 2 ];
2980 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2981 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2982 kmp_sched_lower - 2];
2983 }
2984 __kmp_sched_apply_mods_intkind(
2985 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2986 if (kind == kmp_sched_auto || chunk < 1) {
2987 // ignore parameter chunk for schedule auto
2988 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2989 } else {
2990 thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2991 }
2992}
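// For illustration (a sketch, not normative): the usual way to reach this is
// the user-level routine, e.g.
//
//   omp_set_schedule(omp_sched_dynamic, 4);   // run-sched-var = dynamic,4
//   #pragma omp parallel for schedule(runtime)
//   for (int i = 0; i < n; ++i) work(i);
//
// The KMP_BUILD_ASSERTs above suggest the public kind is passed through as a
// plain int and mapped onto the internal sched_type via __kmp_sch_map.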
2993
2994/* Gets def_sched_var ICV values */
2995void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2996 kmp_info_t *thread;
2997 enum sched_type th_type;
2998
2999 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3000 KMP_DEBUG_ASSERT(__kmp_init_serial);
3001
3002 thread = __kmp_threads[gtid];
3003
3004 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3005 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3006 case kmp_sch_static:
3007 case kmp_sch_static_greedy:
3008 case kmp_sch_static_balanced:
3009 *kind = kmp_sched_static;
3010 __kmp_sched_apply_mods_stdkind(kind, th_type);
 3011 *chunk = 0; // chunk was not set; indicate this with a zero value
3012 return;
3013 case kmp_sch_static_chunked:
3014 *kind = kmp_sched_static;
3015 break;
3016 case kmp_sch_dynamic_chunked:
3017 *kind = kmp_sched_dynamic;
3018 break;
 3019 case kmp_sch_guided_chunked:
 3020 case kmp_sch_guided_iterative_chunked:
3021 case kmp_sch_guided_analytical_chunked:
3022 *kind = kmp_sched_guided;
3023 break;
3024 case kmp_sch_auto:
3025 *kind = kmp_sched_auto;
3026 break;
3027 case kmp_sch_trapezoidal:
3028 *kind = kmp_sched_trapezoidal;
3029 break;
3030#if KMP_STATIC_STEAL_ENABLED
3031 case kmp_sch_static_steal:
3032 *kind = kmp_sched_static_steal;
3033 break;
3034#endif
3035 default:
3036 KMP_FATAL(UnknownSchedulingType, th_type);
3037 }
3038
3039 __kmp_sched_apply_mods_stdkind(kind, th_type);
3040 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3041}
3042
3043int __kmp_get_ancestor_thread_num(int gtid, int level) {
3044
3045 int ii, dd;
3046 kmp_team_t *team;
3047 kmp_info_t *thr;
3048
3049 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3050 KMP_DEBUG_ASSERT(__kmp_init_serial);
3051
3052 // validate level
3053 if (level == 0)
3054 return 0;
3055 if (level < 0)
3056 return -1;
3057 thr = __kmp_threads[gtid];
3058 team = thr->th.th_team;
3059 ii = team->t.t_level;
3060 if (level > ii)
3061 return -1;
3062
3063 if (thr->th.th_teams_microtask) {
3064 // AC: we are in teams region where multiple nested teams have same level
3065 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3066 if (level <=
3067 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3068 KMP_DEBUG_ASSERT(ii >= tlevel);
3069 // AC: As we need to pass by the teams league, we need to artificially
3070 // increase ii
3071 if (ii == tlevel) {
3072 ii += 2; // three teams have same level
3073 } else {
3074 ii++; // two teams have same level
3075 }
3076 }
3077 }
3078
3079 if (ii == level)
3080 return __kmp_tid_from_gtid(gtid);
3081
3082 dd = team->t.t_serialized;
3083 level++;
3084 while (ii > level) {
3085 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3086 }
3087 if ((team->t.t_serialized) && (!dd)) {
3088 team = team->t.t_parent;
3089 continue;
3090 }
3091 if (ii > level) {
3092 team = team->t.t_parent;
3093 dd = team->t.t_serialized;
3094 ii--;
3095 }
3096 }
3097
3098 return (dd > 1) ? (0) : (team->t.t_master_tid);
3099}
3100
3101int __kmp_get_team_size(int gtid, int level) {
3102
3103 int ii, dd;
3104 kmp_team_t *team;
3105 kmp_info_t *thr;
3106
3107 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3108 KMP_DEBUG_ASSERT(__kmp_init_serial);
3109
3110 // validate level
3111 if (level == 0)
3112 return 1;
3113 if (level < 0)
3114 return -1;
3115 thr = __kmp_threads[gtid];
3116 team = thr->th.th_team;
3117 ii = team->t.t_level;
3118 if (level > ii)
3119 return -1;
3120
3121 if (thr->th.th_teams_microtask) {
3122 // AC: in a teams region, multiple nested teams share the same level
3123 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3124 if (level <=
3125 tlevel) { // otherwise the usual algorithm works (will not touch the teams)
3126 KMP_DEBUG_ASSERT(ii >= tlevel);
3127 // AC: as we need to walk past the teams league, we need to artificially
3128 // increase ii
3129 if (ii == tlevel) {
3130 ii += 2; // three teams have same level
3131 } else {
3132 ii++; // two teams have same level
3133 }
3134 }
3135 }
3136
3137 while (ii > level) {
3138 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3139 }
3140 if (team->t.t_serialized && (!dd)) {
3141 team = team->t.t_parent;
3142 continue;
3143 }
3144 if (ii > level) {
3145 team = team->t.t_parent;
3146 ii--;
3147 }
3148 }
3149
3150 return team->t.t_nproc;
3151}
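// The two routines above implement the lookups behind
// omp_get_ancestor_thread_num() and omp_get_team_size(). A small
// nested-parallelism sketch of the public API (illustrative only; assumes at
// least two active parallel levels are allowed):
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_set_max_active_levels(2);
#pragma omp parallel num_threads(2)
#pragma omp parallel num_threads(3)
  {
    int lvl = omp_get_level();
    // Ancestor at level 1 is this thread's id in the outer team (0 or 1);
    // the team size at level 1 is 2, and at the innermost level it is 3.
    printf("level=%d ancestor@1=%d size@1=%d size@%d=%d\n", lvl,
           omp_get_ancestor_thread_num(1), omp_get_team_size(1), lvl,
           omp_get_team_size(lvl));
  }
  return 0;
}
#endif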
3152
3153kmp_r_sched_t __kmp_get_schedule_global() {
3154 // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3155 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3156 // independently, so the updated schedule can be obtained here.
3157
3158 kmp_r_sched_t r_sched;
3159
3160 // create the schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3161 // __kmp_guided. __kmp_sched should keep its original value, so that the user
3162 // can set KMP_SCHEDULE multiple times and thus have different run-time
3163 // schedules in different roots (even in OMP 2.5)
3164 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3165 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3166 if (s == kmp_sch_static) {
3167 // replace STATIC with more detailed schedule (balanced or greedy)
3168 r_sched.r_sched_type = __kmp_static;
3169 } else if (s == kmp_sch_guided_chunked) {
3170 // replace GUIDED with more detailed schedule (iterative or analytical)
3171 r_sched.r_sched_type = __kmp_guided;
3172 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3173 r_sched.r_sched_type = __kmp_sched;
3174 }
3175 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3176
3177 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3178 // __kmp_chunk may be wrong here (if it was not ever set)
3179 r_sched.chunk = KMP_DEFAULT_CHUNK;
3180 } else {
3181 r_sched.chunk = __kmp_chunk;
3182 }
3183
3184 return r_sched;
3185}
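// The SCHEDULE_*_MODIFIERS macros used above treat a sched_type as a base
// kind plus high-order modifier bits (e.g. monotonic/nonmonotonic). A generic,
// self-contained sketch of that encoding follows; the demo_* names and mask
// values are illustrative, not the actual kmp.h definitions:
#if 0
#include <stdio.h>

enum { demo_kind_mask = 0x00ff, demo_mod_monotonic = 0x0100 };

static int demo_without_mods(int s) { return s & demo_kind_mask; }
static int demo_get_mods(int s) { return s & ~demo_kind_mask; }
static void demo_set_mods(int *s, int mods) { *s |= mods; }

int main(void) {
  int sched = 2 /* "dynamic" */ | demo_mod_monotonic;
  int kind = demo_without_mods(sched); // compare on the base kind only
  int mods = demo_get_mods(sched);     // remember the modifiers ...
  int rebuilt = kind;
  demo_set_mods(&rebuilt, mods);       // ... and re-apply them afterwards
  printf("kind=%d mods=0x%x rebuilt=0x%x\n", kind, mods, rebuilt);
  return 0;
}
#endif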
3186
3187 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3188 at least argc *t_argv entries for the requested team. */
3189static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3190
3191 KMP_DEBUG_ASSERT(team);
3192 if (!realloc || argc > team->t.t_max_argc) {
3193
3194 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3195 "current entries=%d\n",
3196 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3197 /* if previously allocated heap space for args, free them */
3198 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3199 __kmp_free((void *)team->t.t_argv);
3200
3201 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3202 /* use unused space in the cache line for arguments */
3203 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3204 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3205 "argv entries\n",
3206 team->t.t_id, team->t.t_max_argc));
3207 team->t.t_argv = &team->t.t_inline_argv[0];
3208 if (__kmp_storage_map) {
3209 __kmp_print_storage_map_gtid(
3210 -1, &team->t.t_inline_argv[0],
3211 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3212 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3213 team->t.t_id);
3214 }
3215 } else {
3216 /* allocate space for arguments in the heap */
3217 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3218 ? KMP_MIN_MALLOC_ARGV_ENTRIES
3219 : 2 * argc;
3220 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3221 "argv entries\n",
3222 team->t.t_id, team->t.t_max_argc));
3223 team->t.t_argv =
3224 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3225 if (__kmp_storage_map) {
3226 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3227 &team->t.t_argv[team->t.t_max_argc],
3228 sizeof(void *) * team->t.t_max_argc,
3229 "team_%d.t_argv", team->t.t_id);
3230 }
3231 }
3232 }
3233}
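// __kmp_alloc_argv_entries above is a small-buffer optimization: a handful of
// argv slots live inline in the team structure, and the heap is used only when
// argc outgrows them (with 2x headroom). A generic, self-contained sketch of
// the same pattern; the demo_* names are illustrative, not runtime internals:
#if 0
#include <stdlib.h>
#include <string.h>

#define DEMO_INLINE_ENTRIES 4

typedef struct {
  void *inline_argv[DEMO_INLINE_ENTRIES];
  void **argv; // points at inline_argv or at a heap block
  int max_argc;
} demo_args_t;

static void demo_alloc_argv(demo_args_t *a, int argc) {
  if (argc <= a->max_argc)
    return; // current storage is already big enough
  if (a->argv != a->inline_argv)
    free(a->argv); // previously spilled to the heap
  if (argc <= DEMO_INLINE_ENTRIES) {
    a->argv = a->inline_argv;
    a->max_argc = DEMO_INLINE_ENTRIES;
  } else {
    a->max_argc = 2 * argc; // grow with headroom, like the runtime does
    a->argv = (void **)malloc(sizeof(void *) * a->max_argc);
  }
}

int main(void) {
  demo_args_t a;
  memset(&a, 0, sizeof(a));
  a.argv = a.inline_argv;
  a.max_argc = DEMO_INLINE_ENTRIES;
  demo_alloc_argv(&a, 2);  // stays inline
  demo_alloc_argv(&a, 16); // spills to the heap
  if (a.argv != a.inline_argv)
    free(a.argv);
  return 0;
}
#endif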
3234
3235static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3236 int i;
3237 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3238 team->t.t_threads =
3239 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3240 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3241 sizeof(dispatch_shared_info_t) * num_disp_buff);
3242 team->t.t_dispatch =
3243 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3244 team->t.t_implicit_task_taskdata =
3245 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3246 team->t.t_max_nproc = max_nth;
3247
3248 /* setup dispatch buffers */
3249 for (i = 0; i < num_disp_buff; ++i) {
3250 team->t.t_disp_buffer[i].buffer_index = i;
3251 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3252 }
3253}
3254
3255static void __kmp_free_team_arrays(kmp_team_t *team) {
3256 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3257 int i;
3258 for (i = 0; i < team->t.t_max_nproc; ++i) {
3259 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3260 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3261 team->t.t_dispatch[i].th_disp_buffer = NULL;
3262 }
3263 }
3264#if KMP_USE_HIER_SCHED
3265 __kmp_dispatch_free_hierarchies(team);
3266#endif
3267 __kmp_free(team->t.t_threads);
3268 __kmp_free(team->t.t_disp_buffer);
3269 __kmp_free(team->t.t_dispatch);
3270 __kmp_free(team->t.t_implicit_task_taskdata);
3271 team->t.t_threads = NULL;
3272 team->t.t_disp_buffer = NULL;
3273 team->t.t_dispatch = NULL;
3274 team->t.t_implicit_task_taskdata = 0;
3275}
3276
3277static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3278 kmp_info_t **oldThreads = team->t.t_threads;
3279
3280 __kmp_free(team->t.t_disp_buffer);
3281 __kmp_free(team->t.t_dispatch);
3282 __kmp_free(team->t.t_implicit_task_taskdata);
3283 __kmp_allocate_team_arrays(team, max_nth);
3284
3285 KMP_MEMCPY(team->t.t_threads, oldThreads,
3286 team->t.t_nproc * sizeof(kmp_info_t *));
3287
3288 __kmp_free(oldThreads);
3289}
3290
3291static kmp_internal_control_t __kmp_get_global_icvs(void) {
3292
3293 kmp_r_sched_t r_sched =
3294 __kmp_get_schedule_global(); // get current state of scheduling globals
3295
3296 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3297
3298 kmp_internal_control_t g_icvs = {
3299 0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3300 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3301 // adjustment of threads (per thread)
3302 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3303 // whether blocktime is explicitly set
3304 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3305#if KMP_USE_MONITOR
3306 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3307// intervals
3308#endif
3309 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3310 // next parallel region (per thread)
3311 // (use a max ub on value if __kmp_parallel_initialize not called yet)
3312 __kmp_cg_max_nth, // int thread_limit;
3313 __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
3314 // on task. This is used in the case of target thread_limit
3315 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3316 // for max_active_levels
3317 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3318 // {sched,chunk} pair
3319 __kmp_nested_proc_bind.bind_types[0],
3320 __kmp_default_device,
3321 NULL // struct kmp_internal_control *next;
3322 };
3323
3324 return g_icvs;
3325}
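// The internal control variables gathered above (dyn-var, nthreads-var,
// thread-limit-var, max-active-levels-var, run-sched-var, bind-var,
// default-device-var) are the ones visible through the standard query API.
// A short sketch reading them back from user code (illustrative only):
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  omp_sched_t kind;
  int chunk;
  omp_get_schedule(&kind, &chunk);
  printf("max_threads=%d thread_limit=%d max_active_levels=%d dynamic=%d\n",
         omp_get_max_threads(), omp_get_thread_limit(),
         omp_get_max_active_levels(), omp_get_dynamic());
  printf("schedule kind=%d chunk=%d default_device=%d proc_bind=%d\n",
         (int)kind, chunk, omp_get_default_device(), (int)omp_get_proc_bind());
  return 0;
}
#endif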
3326
3327static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3328
3329 kmp_internal_control_t gx_icvs;
3330 gx_icvs.serial_nesting_level =
3331 0; // probably =team->t.t_serial like in save_inter_controls
3332 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3333 gx_icvs.next = NULL;
3334
3335 return gx_icvs;
3336}
3337
3338static void __kmp_initialize_root(kmp_root_t *root) {
3339 int f;
3340 kmp_team_t *root_team;
3341 kmp_team_t *hot_team;
3342 int hot_team_max_nth;
3343 kmp_r_sched_t r_sched =
3344 __kmp_get_schedule_global(); // get current state of scheduling globals
3345 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3346 KMP_DEBUG_ASSERT(root);
3347 KMP_ASSERT(!root->r.r_begin);
3348
3349 /* setup the root state structure */
3350 __kmp_init_lock(&root->r.r_begin_lock);
3351 root->r.r_begin = FALSE;
3352 root->r.r_active = FALSE;
3353 root->r.r_in_parallel = 0;
3354 root->r.r_blocktime = __kmp_dflt_blocktime;
3355#if KMP_AFFINITY_SUPPORTED
3356 root->r.r_affinity_assigned = FALSE;
3357#endif
3358
3359 /* setup the root team for this task */
3360 /* allocate the root team structure */
3361 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3362
3363 root_team =
3364 __kmp_allocate_team(root,
3365 1, // new_nproc
3366 1, // max_nproc
3367#if OMPT_SUPPORT
3368 ompt_data_none, // root parallel id
3369#endif
3370 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3371 0 // argc
3372 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3373 );
3374#if USE_DEBUGGER
3375 // Non-NULL value should be assigned to make the debugger display the root
3376 // team.
3377 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3378#endif
3379
3380 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3381
3382 root->r.r_root_team = root_team;
3383 root_team->t.t_control_stack_top = NULL;
3384
3385 /* initialize root team */
3386 root_team->t.t_threads[0] = NULL;
3387 root_team->t.t_nproc = 1;
3388 root_team->t.t_serialized = 1;
3389 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3390 root_team->t.t_sched.sched = r_sched.sched;
3391 KA_TRACE(
3392 20,
3393 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3394 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3395
3396 /* setup the hot team for this task */
3397 /* allocate the hot team structure */
3398 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3399
3400 hot_team =
3401 __kmp_allocate_team(root,
3402 1, // new_nproc
3403 __kmp_dflt_team_nth_ub * 2, // max_nproc
3404#if OMPT_SUPPORT
3405 ompt_data_none, // root parallel id
3406#endif
3407 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3408 0 // argc
3409 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3410 );
3411 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3412
3413 root->r.r_hot_team = hot_team;
3414 root_team->t.t_control_stack_top = NULL;
3415
3416 /* first-time initialization */
3417 hot_team->t.t_parent = root_team;
3418
3419 /* initialize hot team */
3420 hot_team_max_nth = hot_team->t.t_max_nproc;
3421 for (f = 0; f < hot_team_max_nth; ++f) {
3422 hot_team->t.t_threads[f] = NULL;
3423 }
3424 hot_team->t.t_nproc = 1;
3425 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3426 hot_team->t.t_sched.sched = r_sched.sched;
3427 hot_team->t.t_size_changed = 0;
3428}
3429
3430#ifdef KMP_DEBUG
3431
3432typedef struct kmp_team_list_item {
3433 kmp_team_p const *entry;
3434 struct kmp_team_list_item *next;
3435} kmp_team_list_item_t;
3436typedef kmp_team_list_item_t *kmp_team_list_t;
3437
3438static void __kmp_print_structure_team_accum( // Add team to list of teams.
3439 kmp_team_list_t list, // List of teams.
3440 kmp_team_p const *team // Team to add.
3441) {
3442
3443 // List must terminate with item where both entry and next are NULL.
3444 // Team is added to the list only once.
3445 // List is sorted in ascending order by team id.
3446 // Team id is *not* a key.
3447
3448 kmp_team_list_t l;
3449
3450 KMP_DEBUG_ASSERT(list != NULL);
3451 if (team == NULL) {
3452 return;
3453 }
3454
3455 __kmp_print_structure_team_accum(list, team->t.t_parent);
3456 __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3457
3458 // Search list for the team.
3459 l = list;
3460 while (l->next != NULL && l->entry != team) {
3461 l = l->next;
3462 }
3463 if (l->next != NULL) {
3464 return; // Team has been added before, exit.
3465 }
3466
3467 // Team is not found. Search list again for insertion point.
3468 l = list;
3469 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3470 l = l->next;
3471 }
3472
3473 // Insert team.
3474 {
3475 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3476 sizeof(kmp_team_list_item_t));
3477 *item = *l;
3478 l->entry = team;
3479 l->next = item;
3480 }
3481}
3482
3483static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3484
3485) {
3486 __kmp_printf("%s", title);
3487 if (team != NULL) {
3488 __kmp_printf("%2x %p\n", team->t.t_id, team);
3489 } else {
3490 __kmp_printf(" - (nil)\n");
3491 }
3492}
3493
3494static void __kmp_print_structure_thread(char const *title,
3495 kmp_info_p const *thread) {
3496 __kmp_printf("%s", title);
3497 if (thread != NULL) {
3498 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3499 } else {
3500 __kmp_printf(" - (nil)\n");
3501 }
3502}
3503
3504void __kmp_print_structure(void) {
3505
3506 kmp_team_list_t list;
3507
3508 // Initialize list of teams.
3509 list =
3510 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3511 list->entry = NULL;
3512 list->next = NULL;
3513
3514 __kmp_printf("\n------------------------------\nGlobal Thread "
3515 "Table\n------------------------------\n");
3516 {
3517 int gtid;
3518 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3519 __kmp_printf("%2d", gtid);
3520 if (__kmp_threads != NULL) {
3521 __kmp_printf(" %p", __kmp_threads[gtid]);
3522 }
3523 if (__kmp_root != NULL) {
3524 __kmp_printf(" %p", __kmp_root[gtid]);
3525 }
3526 __kmp_printf("\n");
3527 }
3528 }
3529
3530 // Print out __kmp_threads array.
3531 __kmp_printf("\n------------------------------\nThreads\n--------------------"
3532 "----------\n");
3533 if (__kmp_threads != NULL) {
3534 int gtid;
3535 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3536 kmp_info_t const *thread = __kmp_threads[gtid];
3537 if (thread != NULL) {
3538 __kmp_printf("GTID %2d %p:\n", gtid, thread);
3539 __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3540 __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3541 __kmp_print_structure_team(" Serial Team: ",
3542 thread->th.th_serial_team);
3543 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3544 __kmp_print_structure_thread(" Primary: ",
3545 thread->th.th_team_master);
3546 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3547 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3548 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3549 __kmp_print_structure_thread(" Next in pool: ",
3550 thread->th.th_next_pool);
3551 __kmp_printf("\n");
3552 __kmp_print_structure_team_accum(list, thread->th.th_team);
3553 __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3554 }
3555 }
3556 } else {
3557 __kmp_printf("Threads array is not allocated.\n");
3558 }
3559
3560 // Print out __kmp_root array.
3561 __kmp_printf("\n------------------------------\nUbers\n----------------------"
3562 "--------\n");
3563 if (__kmp_root != NULL) {
3564 int gtid;
3565 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3566 kmp_root_t const *root = __kmp_root[gtid];
3567 if (root != NULL) {
3568 __kmp_printf("GTID %2d %p:\n", gtid, root);
3569 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3570 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3571 __kmp_print_structure_thread(" Uber Thread: ",
3572 root->r.r_uber_thread);
3573 __kmp_printf(" Active?: %2d\n", root->r.r_active);
3574 __kmp_printf(" In Parallel: %2d\n",
3575 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3576 __kmp_printf("\n");
3577 __kmp_print_structure_team_accum(list, root->r.r_root_team);
3578 __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3579 }
3580 }
3581 } else {
3582 __kmp_printf("Ubers array is not allocated.\n");
3583 }
3584
3585 __kmp_printf("\n------------------------------\nTeams\n----------------------"
3586 "--------\n");
3587 while (list->next != NULL) {
3588 kmp_team_p const *team = list->entry;
3589 int i;
3590 __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3591 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3592 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3593 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3594 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3595 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3596 for (i = 0; i < team->t.t_nproc; ++i) {
3597 __kmp_printf(" Thread %2d: ", i);
3598 __kmp_print_structure_thread("", team->t.t_threads[i]);
3599 }
3600 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3601 __kmp_printf("\n");
3602 list = list->next;
3603 }
3604
3605 // Print out __kmp_thread_pool and __kmp_team_pool.
3606 __kmp_printf("\n------------------------------\nPools\n----------------------"
3607 "--------\n");
3608 __kmp_print_structure_thread("Thread pool: ",
3609 CCAST(kmp_info_t *, __kmp_thread_pool));
3610 __kmp_print_structure_team("Team pool: ",
3611 CCAST(kmp_team_t *, __kmp_team_pool));
3612 __kmp_printf("\n");
3613
3614 // Free team list.
3615 while (list != NULL) {
3616 kmp_team_list_item_t *item = list;
3617 list = list->next;
3618 KMP_INTERNAL_FREE(item);
3619 }
3620}
3621
3622#endif
3623
3624//---------------------------------------------------------------------------
3625// Stuff for per-thread fast random number generator
3626// Table of primes
3627static const unsigned __kmp_primes[] = {
3628 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3629 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3630 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3631 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3632 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3633 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3634 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3635 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3636 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3637 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3638 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3639
3640//---------------------------------------------------------------------------
3641// __kmp_get_random: Get a random number using a linear congruential method.
3642unsigned short __kmp_get_random(kmp_info_t *thread) {
3643 unsigned x = thread->th.th_x;
3644 unsigned short r = (unsigned short)(x >> 16);
3645
3646 thread->th.th_x = x * thread->th.th_a + 1;
3647
3648 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3649 thread->th.th_info.ds.ds_tid, r));
3650
3651 return r;
3652}
3653//--------------------------------------------------------
3654// __kmp_init_random: Initialize a random number generator
3655void __kmp_init_random(kmp_info_t *thread) {
3656 unsigned seed = thread->th.th_info.ds.ds_tid;
3657
3658 thread->th.th_a =
3659 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3660 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3661 KA_TRACE(30,
3662 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3663}
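// The generator above is a plain linear congruential generator (x' = a*x + 1,
// reporting the high 16 bits of the state), with the multiplier a picked per
// thread from __kmp_primes so different threads get different streams. A
// self-contained sketch of the same scheme, outside the thread structure
// (demo_* names are illustrative):
#if 0
#include <stdio.h>

typedef struct {
  unsigned x; // current state
  unsigned a; // per-stream multiplier (a prime from the table in the runtime)
} demo_lcg_t;

static void demo_lcg_init(demo_lcg_t *g, unsigned seed, unsigned prime) {
  g->a = prime;
  g->x = (seed + 1) * g->a + 1;
}

static unsigned short demo_lcg_next(demo_lcg_t *g) {
  unsigned short r = (unsigned short)(g->x >> 16); // high bits mix better
  g->x = g->x * g->a + 1;
  return r;
}

int main(void) {
  demo_lcg_t g;
  demo_lcg_init(&g, /*seed=*/0, /*prime=*/0x9e3779b1u);
  for (int i = 0; i < 4; ++i)
    printf("%u\n", (unsigned)demo_lcg_next(&g));
  return 0;
}
#endif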
3664
3665#if KMP_OS_WINDOWS
3666/* reclaim array entries for root threads that are already dead, returns number
3667 * reclaimed */
3668static int __kmp_reclaim_dead_roots(void) {
3669 int i, r = 0;
3670
3671 for (i = 0; i < __kmp_threads_capacity; ++i) {
3672 if (KMP_UBER_GTID(i) &&
3673 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3674 !__kmp_root[i]
3675 ->r.r_active) { // AC: reclaim only roots died in non-active state
3676 r += __kmp_unregister_root_other_thread(i);
3677 }
3678 }
3679 return r;
3680}
3681#endif
3682
3683/* This function attempts to create free entries in __kmp_threads and
3684 __kmp_root, and returns the number of free entries generated.
3685
3686 For Windows* OS static library, the first mechanism used is to reclaim array
3687 entries for root threads that are already dead.
3688
3689 On all platforms, expansion is attempted on the arrays __kmp_threads and
3690 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3691 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3692 threadprivate cache array has been created. Synchronization with
3693 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3694
3695 After any dead root reclamation, if the clipping value allows array expansion
3696 to result in the generation of a total of nNeed free slots, the function does
3697 that expansion. If not, nothing is done beyond the possible initial root
3698 thread reclamation.
3699
3700 If any argument is negative, the behavior is undefined. */
3701static int __kmp_expand_threads(int nNeed) {
3702 int added = 0;
3703 int minimumRequiredCapacity;
3704 int newCapacity;
3705 kmp_info_t **newThreads;
3706 kmp_root_t **newRoot;
3707
3708 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3709 // resizing __kmp_threads does not need additional protection if foreign
3710 // threads are present
3711
3712#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3713 /* only for Windows static library */
3714 /* reclaim array entries for root threads that are already dead */
3715 added = __kmp_reclaim_dead_roots();
3716
3717 if (nNeed) {
3718 nNeed -= added;
3719 if (nNeed < 0)
3720 nNeed = 0;
3721 }
3722#endif
3723 if (nNeed <= 0)
3724 return added;
3725
3726 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3727 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3728 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3729 // > __kmp_max_nth in one of two ways:
3730 //
3731 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3732 // may not be reused by another thread, so we may need to increase
3733 // __kmp_threads_capacity to __kmp_max_nth + 1.
3734 //
3735 // 2) New foreign root(s) are encountered. We always register new foreign
3736 // roots. This may cause a smaller # of threads to be allocated at
3737 // subsequent parallel regions, but the worker threads hang around (and
3738 // eventually go to sleep) and need slots in the __kmp_threads[] array.
3739 //
3740 // Anyway, that is the reason for moving the check to see if
3741 // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3742 // instead of having it performed here. -BB
3743
3744 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3745
3746 /* compute expansion headroom to check if we can expand */
3747 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3748 /* possible expansion too small -- give up */
3749 return added;
3750 }
3751 minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3752
3753 newCapacity = __kmp_threads_capacity;
3754 do {
3755 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3756 : __kmp_sys_max_nth;
3757 } while (newCapacity < minimumRequiredCapacity);
3758 newThreads = (kmp_info_t **)__kmp_allocate(
3759 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3760 newRoot =
3761 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3762 KMP_MEMCPY(newThreads, __kmp_threads,
3763 __kmp_threads_capacity * sizeof(kmp_info_t *));
3764 KMP_MEMCPY(newRoot, __kmp_root,
3765 __kmp_threads_capacity * sizeof(kmp_root_t *));
3766 // Put the old __kmp_threads array on a list. Any ongoing references to the
3767 // old array remain valid. This list is cleaned up at library shutdown.
3768 kmp_old_threads_list_t *node =
3769 (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3770 node->threads = __kmp_threads;
3771 node->next = __kmp_old_threads_list;
3772 __kmp_old_threads_list = node;
3773
3774 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3775 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3776 added += newCapacity - __kmp_threads_capacity;
3777 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3778
3779 if (newCapacity > __kmp_tp_capacity) {
3780 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3781 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3782 __kmp_threadprivate_resize_cache(newCapacity);
3783 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3784 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3785 }
3786 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3787 }
3788
3789 return added;
3790}
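// The expansion policy above is "double the capacity until it covers the
// request, clipped to the system maximum", and it gives up if even the clipped
// maximum cannot supply nNeed new slots. A generic sketch of just that
// computation (illustrative; no locking or old-array bookkeeping; assumes
// current >= 1, as in the runtime, so the doubling loop terminates):
#if 0
#include <stdio.h>

// Returns the new capacity, or the current one if the request cannot be met.
static int demo_new_capacity(int current, int needed, int sys_max) {
  if (sys_max - current < needed)
    return current; // possible expansion too small -- give up
  int minimum_required = current + needed;
  int cap = current;
  do {
    cap = (cap <= (sys_max >> 1)) ? (cap << 1) : sys_max;
  } while (cap < minimum_required);
  return cap;
}

int main(void) {
  printf("%d\n", demo_new_capacity(32, 5, 1024));    // 64
  printf("%d\n", demo_new_capacity(600, 500, 1024)); // 600: cannot expand
  return 0;
}
#endif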
3791
3792/* Register the current thread as a root thread and obtain our gtid. We must
3793 have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3794 thread that calls from __kmp_do_serial_initialize() */
3795int __kmp_register_root(int initial_thread) {
3796 kmp_info_t *root_thread;
3797 kmp_root_t *root;
3798 int gtid;
3799 int capacity;
3800 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3801 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3802 KMP_MB();
3803
3804 /* 2007-03-02:
3805 If the initial thread has not invoked the OpenMP RTL yet, and this thread
3806 is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3807 condition does not work as expected -- it may return false (meaning there
3808 is at least one empty slot in the __kmp_threads array), but it is possible
3809 that the only free slot is #0, which is reserved for the initial thread and
3810 so cannot be used for this one. The following code works around this bug.
3811
3812 However, the right solution seems to be not reserving slot #0 for the
3813 initial thread, because:
3814 (1) there is no magic in slot #0,
3815 (2) we cannot detect the initial thread reliably (the first thread that
3816 performs serial initialization may not be the real initial thread).
3817 */
3818 capacity = __kmp_threads_capacity;
3819 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3820 --capacity;
3821 }
3822
3823 // If it is not for initializing the hidden helper team, we need to take
3824 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3825 // in __kmp_threads_capacity.
3826 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3827 capacity -= __kmp_hidden_helper_threads_num;
3828 }
3829
3830 /* see if there are too many threads */
3831 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3832 if (__kmp_tp_cached) {
3833 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3834 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3835 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3836 } else {
3837 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3838 __kmp_msg_null);
3839 }
3840 }
3841
3842 // When hidden helper task is enabled, __kmp_threads is organized as follows:
3843 // 0: initial thread, also a regular OpenMP thread.
3844 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3845 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3846 // regular OpenMP threads.
3847 if (TCR_4(__kmp_init_hidden_helper_threads)) {
3848 // Find an available thread slot for hidden helper thread. Slots for hidden
3849 // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3850 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3851 gtid <= __kmp_hidden_helper_threads_num;
3852 gtid++)
3853 ;
3854 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3855 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3856 "hidden helper thread: T#%d\n",
3857 gtid));
3858 } else {
3859 /* find an available thread slot */
3860 // Don't reassign the zero slot since we need it to be used only by the
3861 // initial thread. Slots for hidden helper threads should also be skipped.
3862 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3863 gtid = 0;
3864 } else {
3865 for (gtid = __kmp_hidden_helper_threads_num + 1;
3866 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3867 ;
3868 }
3869 KA_TRACE(
3870 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3871 KMP_ASSERT(gtid < __kmp_threads_capacity);
3872 }
3873
3874 /* update global accounting */
3875 __kmp_all_nth++;
3876 TCW_4(__kmp_nth, __kmp_nth + 1);
3877
3878 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3879 // numbers of procs, and method #2 (keyed API call) for higher numbers.
3880 if (__kmp_adjust_gtid_mode) {
3881 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3882 if (TCR_4(__kmp_gtid_mode) != 2) {
3883 TCW_4(__kmp_gtid_mode, 2);
3884 }
3885 } else {
3886 if (TCR_4(__kmp_gtid_mode) != 1) {
3887 TCW_4(__kmp_gtid_mode, 1);
3888 }
3889 }
3890 }
3891
3892#ifdef KMP_ADJUST_BLOCKTIME
3893 /* Adjust blocktime to zero if necessary */
3894 /* Middle initialization might not have occurred yet */
3895 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3896 if (__kmp_nth > __kmp_avail_proc) {
3897 __kmp_zero_bt = TRUE;
3898 }
3899 }
3900#endif /* KMP_ADJUST_BLOCKTIME */
3901
3902 /* setup this new hierarchy */
3903 if (!(root = __kmp_root[gtid])) {
3904 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3905 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3906 }
3907
3908#if KMP_STATS_ENABLED
3909 // Initialize stats as soon as possible (right after gtid assignment).
3910 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3911 __kmp_stats_thread_ptr->startLife();
3912 KMP_SET_THREAD_STATE(SERIAL_REGION);
3913 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3914#endif
3915 __kmp_initialize_root(root);
3916
3917 /* setup new root thread structure */
3918 if (root->r.r_uber_thread) {
3919 root_thread = root->r.r_uber_thread;
3920 } else {
3921 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3922 if (__kmp_storage_map) {
3923 __kmp_print_thread_storage_map(root_thread, gtid);
3924 }
3925 root_thread->th.th_info.ds.ds_gtid = gtid;
3926#if OMPT_SUPPORT
3927 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3928#endif
3929 root_thread->th.th_root = root;
3930 if (__kmp_env_consistency_check) {
3931 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3932 }
3933#if USE_FAST_MEMORY
3934 __kmp_initialize_fast_memory(root_thread);
3935#endif /* USE_FAST_MEMORY */
3936
3937#if KMP_USE_BGET
3938 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3939 __kmp_initialize_bget(root_thread);
3940#endif
3941 __kmp_init_random(root_thread); // Initialize random number generator
3942 }
3943
3944 /* setup the serial team held in reserve by the root thread */
3945 if (!root_thread->th.th_serial_team) {
3946 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3947 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3948 root_thread->th.th_serial_team = __kmp_allocate_team(
3949 root, 1, 1,
3950#if OMPT_SUPPORT
3951 ompt_data_none, // root parallel id
3952#endif
3953 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3954 }
3955 KMP_ASSERT(root_thread->th.th_serial_team);
3956 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3957 root_thread->th.th_serial_team));
3958
3959 /* drop root_thread into place */
3960 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3961
3962 root->r.r_root_team->t.t_threads[0] = root_thread;
3963 root->r.r_hot_team->t.t_threads[0] = root_thread;
3964 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3965 // AC: the team created in reserve, not for execution (it is unused for now).
3966 root_thread->th.th_serial_team->t.t_serialized = 0;
3967 root->r.r_uber_thread = root_thread;
3968
3969 /* initialize the thread, get it ready to go */
3970 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3971 TCW_4(__kmp_init_gtid, TRUE);
3972
3973 /* prepare the primary thread for get_gtid() */
3974 __kmp_gtid_set_specific(gtid);
3975
3976#if USE_ITT_BUILD
3977 __kmp_itt_thread_name(gtid);
3978#endif /* USE_ITT_BUILD */
3979
3980#ifdef KMP_TDATA_GTID
3981 __kmp_gtid = gtid;
3982#endif
3983 __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3984 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3985
3986 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3987 "plain=%u\n",
3988 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3989 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3990 KMP_INIT_BARRIER_STATE));
3991 { // Initialize barrier data.
3992 int b;
3993 for (b = 0; b < bs_last_barrier; ++b) {
3994 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3995#if USE_DEBUGGER
3996 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3997#endif
3998 }
3999 }
4000 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
4001 KMP_INIT_BARRIER_STATE);
4002
4003#if KMP_AFFINITY_SUPPORTED
4004 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4005 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4006 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4007 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4008#endif /* KMP_AFFINITY_SUPPORTED */
4009 root_thread->th.th_def_allocator = __kmp_def_allocator;
4010 root_thread->th.th_prev_level = 0;
4011 root_thread->th.th_prev_num_threads = 1;
4012
4013 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4014 tmp->cg_root = root_thread;
4015 tmp->cg_thread_limit = __kmp_cg_max_nth;
4016 tmp->cg_nthreads = 1;
4017 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4018 " cg_nthreads init to 1\n",
4019 root_thread, tmp));
4020 tmp->up = NULL;
4021 root_thread->th.th_cg_roots = tmp;
4022
4023 __kmp_root_counter++;
4024
4025#if OMPT_SUPPORT
4026 if (!initial_thread && ompt_enabled.enabled) {
4027
4028 kmp_info_t *root_thread = ompt_get_thread();
4029
4030 ompt_set_thread_state(root_thread, ompt_state_overhead);
4031
4032 if (ompt_enabled.ompt_callback_thread_begin) {
4033 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4034 ompt_thread_initial, __ompt_get_thread_data_internal());
4035 }
4036 ompt_data_t *task_data;
4037 ompt_data_t *parallel_data;
4038 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4039 NULL);
4040 if (ompt_enabled.ompt_callback_implicit_task) {
4041 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4042 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4043 }
4044
4045 ompt_set_thread_state(root_thread, ompt_state_work_serial);
4046 }
4047#endif
4048#if OMPD_SUPPORT
4049 if (ompd_state & OMPD_ENABLE_BP)
4050 ompd_bp_thread_begin();
4051#endif
4052
4053 KMP_MB();
4054 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4055
4056 return gtid;
4057}
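// Root registration is what happens implicitly the first time a foreign
// (non-OpenMP-created) thread calls into the runtime: each such thread gets
// its own root, uber thread, and hot team. A hedged sketch of that situation
// using plain pthreads (illustrative only, not part of this translation unit):
#if 0
#include <omp.h>
#include <pthread.h>
#include <stdio.h>

static void *foreign_root(void *arg) {
  (void)arg;
  // The first OpenMP construct on this pthread registers it as a root.
#pragma omp parallel num_threads(2)
  printf("worker %d of %d under a foreign root\n", omp_get_thread_num(),
         omp_get_num_threads());
  return NULL;
}

int main(void) {
  pthread_t t[2];
  for (int i = 0; i < 2; ++i)
    pthread_create(&t[i], NULL, foreign_root, NULL);
  for (int i = 0; i < 2; ++i)
    pthread_join(t[i], NULL);
  return 0;
}
#endif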
4058
4059#if KMP_NESTED_HOT_TEAMS
4060static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4061 const int max_level) {
4062 int i, n, nth;
4063 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4064 if (!hot_teams || !hot_teams[level].hot_team) {
4065 return 0;
4066 }
4067 KMP_DEBUG_ASSERT(level < max_level);
4068 kmp_team_t *team = hot_teams[level].hot_team;
4069 nth = hot_teams[level].hot_team_nth;
4070 n = nth - 1; // primary thread is not freed
4071 if (level < max_level - 1) {
4072 for (i = 0; i < nth; ++i) {
4073 kmp_info_t *th = team->t.t_threads[i];
4074 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4075 if (i > 0 && th->th.th_hot_teams) {
4076 __kmp_free(th->th.th_hot_teams);
4077 th->th.th_hot_teams = NULL;
4078 }
4079 }
4080 }
4081 __kmp_free_team(root, team, NULL);
4082 return n;
4083}
4084#endif
4085
4086 // Resets a root thread and clears its root and hot teams.
4087// Returns the number of __kmp_threads entries directly and indirectly freed.
4088static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4089 kmp_team_t *root_team = root->r.r_root_team;
4090 kmp_team_t *hot_team = root->r.r_hot_team;
4091 int n = hot_team->t.t_nproc;
4092 int i;
4093
4094 KMP_DEBUG_ASSERT(!root->r.r_active);
4095
4096 root->r.r_root_team = NULL;
4097 root->r.r_hot_team = NULL;
4098 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4099 // before call to __kmp_free_team().
4100 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4101#if KMP_NESTED_HOT_TEAMS
4102 if (__kmp_hot_teams_max_level >
4103 0) { // need to free nested hot teams and their threads if any
4104 for (i = 0; i < hot_team->t.t_nproc; ++i) {
4105 kmp_info_t *th = hot_team->t.t_threads[i];
4106 if (__kmp_hot_teams_max_level > 1) {
4107 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4108 }
4109 if (th->th.th_hot_teams) {
4110 __kmp_free(th->th.th_hot_teams);
4111 th->th.th_hot_teams = NULL;
4112 }
4113 }
4114 }
4115#endif
4116 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4117
4118 // Before we can reap the thread, we need to make certain that all other
4119 // threads in the teams that had this root as ancestor have stopped trying to
4120 // steal tasks.
4121 if (__kmp_tasking_mode != tskm_immediate_exec) {
4122 __kmp_wait_to_unref_task_teams();
4123 }
4124
4125#if KMP_OS_WINDOWS
4126 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4127 KA_TRACE(
4128 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4129 "\n",
4130 (LPVOID) & (root->r.r_uber_thread->th),
4131 root->r.r_uber_thread->th.th_info.ds.ds_thread));
4132 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4133#endif /* KMP_OS_WINDOWS */
4134
4135#if OMPD_SUPPORT
4136 if (ompd_state & OMPD_ENABLE_BP)
4137 ompd_bp_thread_end();
4138#endif
4139
4140#if OMPT_SUPPORT
4141 ompt_data_t *task_data;
4142 ompt_data_t *parallel_data;
4143 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4144 NULL);
4145 if (ompt_enabled.ompt_callback_implicit_task) {
4146 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4147 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4148 }
4149 if (ompt_enabled.ompt_callback_thread_end) {
4150 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4151 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4152 }
4153#endif
4154
4155 TCW_4(__kmp_nth,
4156 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4157 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4158 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4159 " to %d\n",
4160 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4161 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4162 if (i == 1) {
4163 // need to free contention group structure
4164 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4165 root->r.r_uber_thread->th.th_cg_roots->cg_root);
4166 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4167 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4168 root->r.r_uber_thread->th.th_cg_roots = NULL;
4169 }
4170 __kmp_reap_thread(root->r.r_uber_thread, 1);
4171
4172 // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4173 // it instead of freeing it.
4174 root->r.r_uber_thread = NULL;
4175 /* mark root as no longer in use */
4176 root->r.r_begin = FALSE;
4177
4178 return n;
4179}
4180
4181void __kmp_unregister_root_current_thread(int gtid) {
4182 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4183 /* this lock should be ok, since unregister_root_current_thread is never
4184 called during an abort, only during a normal close. furthermore, if you
4185 have the forkjoin lock, you should never try to get the initz lock */
4186 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4187 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4188 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4189 "exiting T#%d\n",
4190 gtid));
4191 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4192 return;
4193 }
4194 kmp_root_t *root = __kmp_root[gtid];
4195
4196 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4197 KMP_ASSERT(KMP_UBER_GTID(gtid));
4198 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4199 KMP_ASSERT(root->r.r_active == FALSE);
4200
4201 KMP_MB();
4202
4203 kmp_info_t *thread = __kmp_threads[gtid];
4204 kmp_team_t *team = thread->th.th_team;
4205 kmp_task_team_t *task_team = thread->th.th_task_team;
4206
4207 // we need to wait for the proxy tasks before finishing the thread
4208 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4209 task_team->tt.tt_hidden_helper_task_encountered)) {
4210#if OMPT_SUPPORT
4211 // the runtime is shutting down so we won't report any events
4212 thread->th.ompt_thread_info.state = ompt_state_undefined;
4213#endif
4214 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4215 }
4216
4217 __kmp_reset_root(gtid, root);
4218
4219 KMP_MB();
4220 KC_TRACE(10,
4221 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4222
4223 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4224}
4225
4226#if KMP_OS_WINDOWS
4227/* __kmp_forkjoin_lock must be already held
4228 Unregisters a root thread that is not the current thread. Returns the number
4229 of __kmp_threads entries freed as a result. */
4230static int __kmp_unregister_root_other_thread(int gtid) {
4231 kmp_root_t *root = __kmp_root[gtid];
4232 int r;
4233
4234 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4235 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4236 KMP_ASSERT(KMP_UBER_GTID(gtid));
4237 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4238 KMP_ASSERT(root->r.r_active == FALSE);
4239
4240 r = __kmp_reset_root(gtid, root);
4241 KC_TRACE(10,
4242 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4243 return r;
4244}
4245#endif
4246
4247#if KMP_DEBUG
4248void __kmp_task_info() {
4249
4250 kmp_int32 gtid = __kmp_entry_gtid();
4251 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4252 kmp_info_t *this_thr = __kmp_threads[gtid];
4253 kmp_team_t *steam = this_thr->th.th_serial_team;
4254 kmp_team_t *team = this_thr->th.th_team;
4255
4256 __kmp_printf(
4257 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4258 "ptask=%p\n",
4259 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4260 team->t.t_implicit_task_taskdata[tid].td_parent);
4261}
4262#endif // KMP_DEBUG
4263
4264/* TODO optimize with one big memclr, take out what isn't needed, split
4265 responsibility to workers as much as possible, and delay initialization of
4266 features as much as possible */
4267static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4268 int tid, int gtid) {
4269 /* this_thr->th.th_info.ds.ds_gtid is setup in
4270 kmp_allocate_thread/create_worker.
4271 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4272 KMP_DEBUG_ASSERT(this_thr != NULL);
4273 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4274 KMP_DEBUG_ASSERT(team);
4275 KMP_DEBUG_ASSERT(team->t.t_threads);
4276 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4277 kmp_info_t *master = team->t.t_threads[0];
4278 KMP_DEBUG_ASSERT(master);
4279 KMP_DEBUG_ASSERT(master->th.th_root);
4280
4281 KMP_MB();
4282
4283 TCW_SYNC_PTR(this_thr->th.th_team, team);
4284
4285 this_thr->th.th_info.ds.ds_tid = tid;
4286 this_thr->th.th_set_nproc = 0;
4287 if (__kmp_tasking_mode != tskm_immediate_exec)
4288 // When tasking is possible, threads are not safe to reap until they are
4289 // done tasking; this will be set when tasking code is exited in wait
4290 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4291 else // no tasking --> always safe to reap
4292 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4293 this_thr->th.th_set_proc_bind = proc_bind_default;
4294#if KMP_AFFINITY_SUPPORTED
4295 this_thr->th.th_new_place = this_thr->th.th_current_place;
4296#endif
4297 this_thr->th.th_root = master->th.th_root;
4298
4299 /* setup the thread's cache of the team structure */
4300 this_thr->th.th_team_nproc = team->t.t_nproc;
4301 this_thr->th.th_team_master = master;
4302 this_thr->th.th_team_serialized = team->t.t_serialized;
4303
4304 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4305
4306 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4307 tid, gtid, this_thr, this_thr->th.th_current_task));
4308
4309 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4310 team, tid, TRUE);
4311
4312 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4313 tid, gtid, this_thr, this_thr->th.th_current_task));
4314 // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4315 // __kmp_initialize_team()?
4316
4317 /* TODO no worksharing in speculative threads */
4318 this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4319
4320 this_thr->th.th_local.this_construct = 0;
4321
4322 if (!this_thr->th.th_pri_common) {
4323 this_thr->th.th_pri_common =
4324 (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4325 if (__kmp_storage_map) {
4326 __kmp_print_storage_map_gtid(
4327 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4328 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4329 }
4330 this_thr->th.th_pri_head = NULL;
4331 }
4332
4333 if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4334 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4335 // Make new thread's CG root same as primary thread's
4336 KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4337 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4338 if (tmp) {
4339 // worker changes CG, need to check if old CG should be freed
4340 int i = tmp->cg_nthreads--;
4341 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4342 " on node %p of thread %p to %d\n",
4343 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4344 if (i == 1) {
4345 __kmp_free(tmp); // last thread left CG --> free it
4346 }
4347 }
4348 this_thr->th.th_cg_roots = master->th.th_cg_roots;
4349 // Increment new thread's CG root's counter to add the new thread
4350 this_thr->th.th_cg_roots->cg_nthreads++;
4351 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4352 " node %p of thread %p to %d\n",
4353 this_thr, this_thr->th.th_cg_roots,
4354 this_thr->th.th_cg_roots->cg_root,
4355 this_thr->th.th_cg_roots->cg_nthreads));
4356 this_thr->th.th_current_task->td_icvs.thread_limit =
4357 this_thr->th.th_cg_roots->cg_thread_limit;
4358 }
4359
4360 /* Initialize dynamic dispatch */
4361 {
4362 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4363 // Use team max_nproc since this will never change for the team.
4364 size_t disp_size =
4365 sizeof(dispatch_private_info_t) *
4366 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4367 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4368 team->t.t_max_nproc));
4369 KMP_ASSERT(dispatch);
4370 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4371 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4372
4373 dispatch->th_disp_index = 0;
4374 dispatch->th_doacross_buf_idx = 0;
4375 if (!dispatch->th_disp_buffer) {
4376 dispatch->th_disp_buffer =
4377 (dispatch_private_info_t *)__kmp_allocate(disp_size);
4378
4379 if (__kmp_storage_map) {
4380 __kmp_print_storage_map_gtid(
4381 gtid, &dispatch->th_disp_buffer[0],
4382 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4383 ? 1
4384 : __kmp_dispatch_num_buffers],
4385 disp_size,
4386 "th_%d.th_dispatch.th_disp_buffer "
4387 "(team_%d.t_dispatch[%d].th_disp_buffer)",
4388 gtid, team->t.t_id, gtid);
4389 }
4390 } else {
4391 memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4392 }
4393
4394 dispatch->th_dispatch_pr_current = 0;
4395 dispatch->th_dispatch_sh_current = 0;
4396
4397 dispatch->th_deo_fcn = 0; /* ORDERED */
4398 dispatch->th_dxo_fcn = 0; /* END ORDERED */
4399 }
4400
4401 this_thr->th.th_next_pool = NULL;
4402
4403 if (!this_thr->th.th_task_state_memo_stack) {
4404 size_t i;
4405 this_thr->th.th_task_state_memo_stack =
4406 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4407 this_thr->th.th_task_state_top = 0;
4408 this_thr->th.th_task_state_stack_sz = 4;
4409 for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4410 ++i) // zero init the stack
4411 this_thr->th.th_task_state_memo_stack[i] = 0;
4412 }
4413
4414 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4415 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4416
4417 KMP_MB();
4418}
4419
4420/* allocate a new thread for the requesting team. this is only called from
4421 within a forkjoin critical section. we will first try to get an available
4422 thread from the thread pool. if none is available, we will fork a new one
4423 assuming we are able to create a new one. this should be assured, as the
4424 caller should check on this first. */
4425kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4426 int new_tid) {
4427 kmp_team_t *serial_team;
4428 kmp_info_t *new_thr;
4429 int new_gtid;
4430
4431 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4432 KMP_DEBUG_ASSERT(root && team);
4433#if !KMP_NESTED_HOT_TEAMS
4434 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4435#endif
4436 KMP_MB();
4437
4438 /* first, try to get one from the thread pool */
4439 if (__kmp_thread_pool) {
4440 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4441 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4442 if (new_thr == __kmp_thread_pool_insert_pt) {
4443 __kmp_thread_pool_insert_pt = NULL;
4444 }
4445 TCW_4(new_thr->th.th_in_pool, FALSE);
4446 __kmp_suspend_initialize_thread(new_thr);
4447 __kmp_lock_suspend_mx(new_thr);
4448 if (new_thr->th.th_active_in_pool == TRUE) {
4449 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4450 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4451 new_thr->th.th_active_in_pool = FALSE;
4452 }
4453 __kmp_unlock_suspend_mx(new_thr);
4454
4455 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4456 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4457 KMP_ASSERT(!new_thr->th.th_team);
4458 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4459
4460 /* setup the thread structure */
4461 __kmp_initialize_info(new_thr, team, new_tid,
4462 new_thr->th.th_info.ds.ds_gtid);
4463 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4464
4465 TCW_4(__kmp_nth, __kmp_nth + 1);
4466
4467 new_thr->th.th_task_state = 0;
4468 new_thr->th.th_task_state_top = 0;
4469 new_thr->th.th_task_state_stack_sz = 4;
4470
4471 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4472 // Make sure pool thread has transitioned to waiting on own thread struct
4473 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4474 // Thread activated in __kmp_allocate_team when increasing team size
4475 }
4476
4477#ifdef KMP_ADJUST_BLOCKTIME
4478 /* Adjust blocktime back to zero if necessary */
4479 /* Middle initialization might not have occurred yet */
4480 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4481 if (__kmp_nth > __kmp_avail_proc) {
4482 __kmp_zero_bt = TRUE;
4483 }
4484 }
4485#endif /* KMP_ADJUST_BLOCKTIME */
4486
4487#if KMP_DEBUG
4488 // If thread entered pool via __kmp_free_thread, wait_flag should !=
4489 // KMP_BARRIER_PARENT_FLAG.
4490 int b;
4491 kmp_balign_t *balign = new_thr->th.th_bar;
4492 for (b = 0; b < bs_last_barrier; ++b)
4493 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4494#endif
4495
4496 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4497 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4498
4499 KMP_MB();
4500 return new_thr;
4501 }
4502
4503 /* no, we'll fork a new one */
4504 KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4505 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4506
4507#if KMP_USE_MONITOR
4508 // If this is the first worker thread the RTL is creating, then also
4509 // launch the monitor thread. We try to do this as early as possible.
4510 if (!TCR_4(__kmp_init_monitor)) {
4511 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4512 if (!TCR_4(__kmp_init_monitor)) {
4513 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4514 TCW_4(__kmp_init_monitor, 1);
4515 __kmp_create_monitor(&__kmp_monitor);
4516 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4517#if KMP_OS_WINDOWS
4518 // AC: wait until monitor has started. This is a fix for CQ232808.
4519 // The reason is that if the library is loaded/unloaded in a loop with
4520 // small (parallel) work in between, then there is a high probability that
4521 // the monitor thread starts after the library shutdown. At shutdown it is
4522 // too late to cope with the problem, because when the primary thread is
4523 // in DllMain (process detach) the monitor has no chance to start (it is
4524 // blocked), and the primary thread has no means to inform the monitor that
4525 // the library has gone, because all the memory which the monitor can
4526 // access is going to be released/reset.
4527 while (TCR_4(__kmp_init_monitor) < 2) {
4528 KMP_YIELD(TRUE);
4529 }
4530 KF_TRACE(10, ("after monitor thread has started\n"));
4531#endif
4532 }
4533 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4534 }
4535#endif
4536
4537 KMP_MB();
4538
4539 {
4540 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4541 ? 1
4542 : __kmp_hidden_helper_threads_num + 1;
4543
4544 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4545 ++new_gtid) {
4546 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4547 }
4548
4549 if (TCR_4(__kmp_init_hidden_helper_threads)) {
4550 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4551 }
4552 }
4553
4554 /* allocate space for it. */
4555 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4556
4557 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4558
4559#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4560 // suppress race condition detection on synchronization flags in debug mode;
4561 // this helps to analyze library internals by eliminating false positives
4562 __itt_suppress_mark_range(
4563 __itt_suppress_range, __itt_suppress_threading_errors,
4564 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4565 __itt_suppress_mark_range(
4566 __itt_suppress_range, __itt_suppress_threading_errors,
4567 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4568#if KMP_OS_WINDOWS
4569 __itt_suppress_mark_range(
4570 __itt_suppress_range, __itt_suppress_threading_errors,
4571 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4572#else
4573 __itt_suppress_mark_range(__itt_suppress_range,
4574 __itt_suppress_threading_errors,
4575 &new_thr->th.th_suspend_init_count,
4576 sizeof(new_thr->th.th_suspend_init_count));
4577#endif
4578 // TODO: check if we need to also suppress b_arrived flags
4579 __itt_suppress_mark_range(__itt_suppress_range,
4580 __itt_suppress_threading_errors,
4581 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4582 sizeof(new_thr->th.th_bar[0].bb.b_go));
4583 __itt_suppress_mark_range(__itt_suppress_range,
4584 __itt_suppress_threading_errors,
4585 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4586 sizeof(new_thr->th.th_bar[1].bb.b_go));
4587 __itt_suppress_mark_range(__itt_suppress_range,
4588 __itt_suppress_threading_errors,
4589 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4590 sizeof(new_thr->th.th_bar[2].bb.b_go));
4591#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4592 if (__kmp_storage_map) {
4593 __kmp_print_thread_storage_map(new_thr, new_gtid);
4594 }
4595
4596 // add the reserve serialized team, initialized from the team's primary thread
4597 {
4598 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4599 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4600 new_thr->th.th_serial_team = serial_team =
4601 (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4602#if OMPT_SUPPORT
4603 ompt_data_none, // root parallel id
4604#endif
4605 proc_bind_default, &r_icvs,
4606 0 USE_NESTED_HOT_ARG(NULL));
4607 }
4608 KMP_ASSERT(serial_team);
4609 serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4610 // for execution (it is unused for now).
4611 serial_team->t.t_threads[0] = new_thr;
4612 KF_TRACE(10,
4613 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4614 new_thr));
4615
4616 /* setup the thread structures */
4617 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4618
4619#if USE_FAST_MEMORY
4620 __kmp_initialize_fast_memory(new_thr);
4621#endif /* USE_FAST_MEMORY */
4622
4623#if KMP_USE_BGET
4624 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4625 __kmp_initialize_bget(new_thr);
4626#endif
4627
4628 __kmp_init_random(new_thr); // Initialize random number generator
4629
4630 /* Initialize these only once when thread is grabbed for a team allocation */
4631 KA_TRACE(20,
4632 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4633 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4634
4635 int b;
4636 kmp_balign_t *balign = new_thr->th.th_bar;
4637 for (b = 0; b < bs_last_barrier; ++b) {
4638 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4639 balign[b].bb.team = NULL;
4640 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4641 balign[b].bb.use_oncore_barrier = 0;
4642 }
4643
4644 TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4645 new_thr->th.th_sleep_loc_type = flag_unset;
4646
4647 new_thr->th.th_spin_here = FALSE;
4648 new_thr->th.th_next_waiting = 0;
4649#if KMP_OS_UNIX
4650 new_thr->th.th_blocking = false;
4651#endif
4652
4653#if KMP_AFFINITY_SUPPORTED
4654 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4655 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4656 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4657 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4658#endif
4659 new_thr->th.th_def_allocator = __kmp_def_allocator;
4660 new_thr->th.th_prev_level = 0;
4661 new_thr->th.th_prev_num_threads = 1;
4662
4663 TCW_4(new_thr->th.th_in_pool, FALSE);
4664 new_thr->th.th_active_in_pool = FALSE;
4665 TCW_4(new_thr->th.th_active, TRUE);
4666
4667 /* adjust the global counters */
4668 __kmp_all_nth++;
4669 __kmp_nth++;
4670
4671 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4672 // thread counts, and method #2 (keyed API call) for higher counts.
4673 if (__kmp_adjust_gtid_mode) {
4674 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4675 if (TCR_4(__kmp_gtid_mode) != 2) {
4676 TCW_4(__kmp_gtid_mode, 2);
4677 }
4678 } else {
4679 if (TCR_4(__kmp_gtid_mode) != 1) {
4680 TCW_4(__kmp_gtid_mode, 1);
4681 }
4682 }
4683 }
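// Illustrative sketch (not part of the runtime): the adjustment above is just
// a threshold test -- keep the cheap stack-address search (mode 1) while the
// thread count is small, and switch to the keyed lookup (mode 2) once it
// reaches __kmp_tls_gtid_min. Hypothetical helper with plain ints in place of
// the TCR/TCW macros:
//
//   static int choose_gtid_mode(int all_nth, int tls_gtid_min) {
//     return (all_nth >= tls_gtid_min) ? 2  // keyed (TLS) lookup
//                                      : 1; // stack-pointer search
//   }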
4684
4685#ifdef KMP_ADJUST_BLOCKTIME
4686 /* Adjust blocktime back to zero if necessary */
4687 /* Middle initialization might not have occurred yet */
4688 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4689 if (__kmp_nth > __kmp_avail_proc) {
4690 __kmp_zero_bt = TRUE;
4691 }
4692 }
4693#endif /* KMP_ADJUST_BLOCKTIME */
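// Illustrative sketch (not part of the runtime): the blocktime adjustment is an
// oversubscription check -- once more threads exist than available processors,
// spin-waiting before sleep is turned off. __kmp_free_thread() applies the
// symmetric check to re-enable it when the thread count drops. Hypothetical
// helper:
//
//   static bool should_zero_blocktime(int nth, int avail_proc) {
//     return nth > avail_proc; // oversubscribed: block immediately
//   }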
4694
4695#if KMP_AFFINITY_SUPPORTED
4696 // Set the affinity and topology information for new thread
4697 __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4698#endif
4699
4700 /* actually fork it and create the new worker thread */
4701 KF_TRACE(
4702 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4703 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4704 KF_TRACE(10,
4705 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4706
4707 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4708 new_gtid));
4709 KMP_MB();
4710 return new_thr;
4711}
4712
4713/* Reinitialize team for reuse.
4714 The hot team code calls this routine at every fork barrier, so EPCC barrier
4715 tests are extremely sensitive to changes in it, esp. writes to the team
4716 struct, which cause a cache invalidation in all threads.
4717 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4718static void __kmp_reinitialize_team(kmp_team_t *team,
4719 kmp_internal_control_t *new_icvs,
4720 ident_t *loc) {
4721 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4722 team->t.t_threads[0], team));
4723 KMP_DEBUG_ASSERT(team && new_icvs);
4724 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4725 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4726
4727 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4728 // Copy ICVs to the primary thread's implicit taskdata
4729 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4730 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4731
4732 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4733 team->t.t_threads[0], team));
4734}
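// Illustrative sketch (not part of the runtime): the KMP_CHECK_UPDATE uses in
// this routine behave like a read-before-write guard. The guarded fields sit in
// cache lines shared by every team member, so skipping redundant stores avoids
// invalidating those lines in all workers on every fork. A minimal equivalent:
//
//   template <typename T> static void check_update(T &dst, const T &src) {
//     if (dst != src)
//       dst = src; // write only when the value actually changes
//   }
//
// The hypothetical check_update helper above is for illustration; the runtime
// uses the KMP_CHECK_UPDATE macro.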
4735
4736/* Initialize the team data structure.
4737 This assumes the t_threads and t_max_nproc are already set.
4738 Also, we don't touch the arguments */
4739static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4740 kmp_internal_control_t *new_icvs,
4741 ident_t *loc) {
4742 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4743
4744 /* verify */
4745 KMP_DEBUG_ASSERT(team);
4746 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4747 KMP_DEBUG_ASSERT(team->t.t_threads);
4748 KMP_MB();
4749
4750 team->t.t_master_tid = 0; /* not needed */
4751 /* team->t.t_master_bar; not needed */
4752 team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4753 team->t.t_nproc = new_nproc;
4754
4755 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4756 team->t.t_next_pool = NULL;
4757 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4758 * up hot team */
4759
4760 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4761 team->t.t_invoke = NULL; /* not needed */
4762
4763 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4764 team->t.t_sched.sched = new_icvs->sched.sched;
4765
4766#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4767 team->t.t_fp_control_saved = FALSE; /* not needed */
4768 team->t.t_x87_fpu_control_word = 0; /* not needed */
4769 team->t.t_mxcsr = 0; /* not needed */
4770#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4771
4772 team->t.t_construct = 0;
4773
4774 team->t.t_ordered.dt.t_value = 0;
4775 team->t.t_master_active = FALSE;
4776
4777#ifdef KMP_DEBUG
4778 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4779#endif
4780#if KMP_OS_WINDOWS
4781 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4782#endif
4783
4784 team->t.t_control_stack_top = NULL;
4785
4786 __kmp_reinitialize_team(team, new_icvs, loc);
4787
4788 KMP_MB();
4789 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4790}
4791
4792#if KMP_AFFINITY_SUPPORTED
4793static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4794 int first, int last, int newp) {
4795 th->th.th_first_place = first;
4796 th->th.th_last_place = last;
4797 th->th.th_new_place = newp;
4798 if (newp != th->th.th_current_place) {
4799 if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4800 team->t.t_display_affinity = 1;
4801 // Copy topology information associated with the new place
4802 th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4803 th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4804 }
4805}
4806
4807// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4808// It calculates the worker + primary thread's partition based upon the parent
4809 // thread's partition, and binds each worker to a place in its partition.
4810// The primary thread's partition should already include its current binding.
4811static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4812 // Do not partition places for the hidden helper team
4813 if (KMP_HIDDEN_HELPER_TEAM(team))
4814 return;
4815 // Copy the primary thread's place partition to the team struct
4816 kmp_info_t *master_th = team->t.t_threads[0];
4817 KMP_DEBUG_ASSERT(master_th != NULL);
4818 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4819 int first_place = master_th->th.th_first_place;
4820 int last_place = master_th->th.th_last_place;
4821 int masters_place = master_th->th.th_current_place;
4822 int num_masks = __kmp_affinity.num_masks;
4823 team->t.t_first_place = first_place;
4824 team->t.t_last_place = last_place;
4825
4826 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4827 "bound to place %d partition = [%d,%d]\n",
4828 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4829 team->t.t_id, masters_place, first_place, last_place));
4830
4831 switch (proc_bind) {
4832
4833 case proc_bind_default:
4834 // Serial teams might have the proc_bind policy set to proc_bind_default.
4835 // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4836 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4837 break;
4838
4839 case proc_bind_primary: {
4840 int f;
4841 int n_th = team->t.t_nproc;
4842 for (f = 1; f < n_th; f++) {
4843 kmp_info_t *th = team->t.t_threads[f];
4844 KMP_DEBUG_ASSERT(th != NULL);
4845 __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4846
4847 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4848 "partition = [%d,%d]\n",
4849 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4850 f, masters_place, first_place, last_place));
4851 }
4852 } break;
4853
4854 case proc_bind_close: {
4855 int f;
4856 int n_th = team->t.t_nproc;
4857 int n_places;
4858 if (first_place <= last_place) {
4859 n_places = last_place - first_place + 1;
4860 } else {
4861 n_places = num_masks - first_place + last_place + 1;
4862 }
4863 if (n_th <= n_places) {
4864 int place = masters_place;
4865 for (f = 1; f < n_th; f++) {
4866 kmp_info_t *th = team->t.t_threads[f];
4867 KMP_DEBUG_ASSERT(th != NULL);
4868
4869 if (place == last_place) {
4870 place = first_place;
4871 } else if (place == (num_masks - 1)) {
4872 place = 0;
4873 } else {
4874 place++;
4875 }
4876 __kmp_set_thread_place(team, th, first_place, last_place, place);
4877
4878 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4879 "partition = [%d,%d]\n",
4880 __kmp_gtid_from_thread(team->t.t_threads[f]),
4881 team->t.t_id, f, place, first_place, last_place));
4882 }
4883 } else {
4884 int S, rem, gap, s_count;
4885 S = n_th / n_places;
4886 s_count = 0;
4887 rem = n_th - (S * n_places);
4888 gap = rem > 0 ? n_places / rem : n_places;
4889 int place = masters_place;
4890 int gap_ct = gap;
4891 for (f = 0; f < n_th; f++) {
4892 kmp_info_t *th = team->t.t_threads[f];
4893 KMP_DEBUG_ASSERT(th != NULL);
4894
4895 __kmp_set_thread_place(team, th, first_place, last_place, place);
4896 s_count++;
4897
4898 if ((s_count == S) && rem && (gap_ct == gap)) {
4899 // do nothing, add an extra thread to place on next iteration
4900 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4901 // we added an extra thread to this place; move to next place
4902 if (place == last_place) {
4903 place = first_place;
4904 } else if (place == (num_masks - 1)) {
4905 place = 0;
4906 } else {
4907 place++;
4908 }
4909 s_count = 0;
4910 gap_ct = 1;
4911 rem--;
4912 } else if (s_count == S) { // place full; don't add extra
4913 if (place == last_place) {
4914 place = first_place;
4915 } else if (place == (num_masks - 1)) {
4916 place = 0;
4917 } else {
4918 place++;
4919 }
4920 gap_ct++;
4921 s_count = 0;
4922 }
4923
4924 KA_TRACE(100,
4925 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4926 "partition = [%d,%d]\n",
4927 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4928 th->th.th_new_place, first_place, last_place));
4929 }
4930 KMP_DEBUG_ASSERT(place == masters_place);
4931 }
4932 } break;
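// Illustrative sketch (not part of the runtime): when there are more threads
// than places, the close policy above packs S = n_th / n_places threads into
// each place and spreads the remaining rem = n_th % n_places extra threads over
// places that are gap = n_places / rem apart, starting from the primary
// thread's place. A standalone version of that distribution (illustrative
// names, abstract place indices 0..n_places-1):
//
//   static void close_thread_counts(int n_th, int n_places, int *count) {
//     int S = n_th / n_places, rem = n_th % n_places;
//     int gap = rem ? n_places / rem : n_places;
//     int gap_ct = gap;
//     for (int p = 0; p < n_places; ++p) {
//       count[p] = S;
//       if (rem && gap_ct == gap) { // this place absorbs one extra thread
//         count[p]++;
//         rem--;
//         gap_ct = 1;
//       } else {
//         gap_ct++;
//       }
//     }
//   }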
4933
4934 case proc_bind_spread: {
4935 int f;
4936 int n_th = team->t.t_nproc;
4937 int n_places;
4938 int thidx;
4939 if (first_place <= last_place) {
4940 n_places = last_place - first_place + 1;
4941 } else {
4942 n_places = num_masks - first_place + last_place + 1;
4943 }
4944 if (n_th <= n_places) {
4945 int place = -1;
4946
4947 if (n_places != num_masks) {
4948 int S = n_places / n_th;
4949 int s_count, rem, gap, gap_ct;
4950
4951 place = masters_place;
4952 rem = n_places - n_th * S;
4953 gap = rem ? n_th / rem : 1;
4954 gap_ct = gap;
4955 thidx = n_th;
4956 if (update_master_only == 1)
4957 thidx = 1;
4958 for (f = 0; f < thidx; f++) {
4959 kmp_info_t *th = team->t.t_threads[f];
4960 KMP_DEBUG_ASSERT(th != NULL);
4961
4962 int fplace = place, nplace = place;
4963 s_count = 1;
4964 while (s_count < S) {
4965 if (place == last_place) {
4966 place = first_place;
4967 } else if (place == (num_masks - 1)) {
4968 place = 0;
4969 } else {
4970 place++;
4971 }
4972 s_count++;
4973 }
4974 if (rem && (gap_ct == gap)) {
4975 if (place == last_place) {
4976 place = first_place;
4977 } else if (place == (num_masks - 1)) {
4978 place = 0;
4979 } else {
4980 place++;
4981 }
4982 rem--;
4983 gap_ct = 0;
4984 }
4985 __kmp_set_thread_place(team, th, fplace, place, nplace);
4986 gap_ct++;
4987
4988 if (place == last_place) {
4989 place = first_place;
4990 } else if (place == (num_masks - 1)) {
4991 place = 0;
4992 } else {
4993 place++;
4994 }
4995
4996 KA_TRACE(100,
4997 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4998 "partition = [%d,%d], num_masks: %u\n",
4999 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5000 f, th->th.th_new_place, th->th.th_first_place,
5001 th->th.th_last_place, num_masks));
5002 }
5003 } else {
5004 /* Given a uniform space of available computation places, we can create
5005 T partitions of roughly P/T places each and put each thread into the
5006 first place of its partition. */
5007 double current = static_cast<double>(masters_place);
5008 double spacing =
5009 (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5010 int first, last;
5011 kmp_info_t *th;
5012
5013 thidx = n_th + 1;
5014 if (update_master_only == 1)
5015 thidx = 1;
5016 for (f = 0; f < thidx; f++) {
5017 first = static_cast<int>(current);
5018 last = static_cast<int>(current + spacing) - 1;
5019 KMP_DEBUG_ASSERT(last >= first);
5020 if (first >= n_places) {
5021 if (masters_place) {
5022 first -= n_places;
5023 last -= n_places;
5024 if (first == (masters_place + 1)) {
5025 KMP_DEBUG_ASSERT(f == n_th);
5026 first--;
5027 }
5028 if (last == masters_place) {
5029 KMP_DEBUG_ASSERT(f == (n_th - 1));
5030 last--;
5031 }
5032 } else {
5033 KMP_DEBUG_ASSERT(f == n_th);
5034 first = 0;
5035 last = 0;
5036 }
5037 }
5038 if (last >= n_places) {
5039 last = (n_places - 1);
5040 }
5041 place = first;
5042 current += spacing;
5043 if (f < n_th) {
5044 KMP_DEBUG_ASSERT(0 <= first);
5045 KMP_DEBUG_ASSERT(n_places > first);
5046 KMP_DEBUG_ASSERT(0 <= last);
5047 KMP_DEBUG_ASSERT(n_places > last);
5048 KMP_DEBUG_ASSERT(last_place >= first_place);
5049 th = team->t.t_threads[f];
5050 KMP_DEBUG_ASSERT(th);
5051 __kmp_set_thread_place(team, th, first, last, place);
5052 KA_TRACE(100,
5053 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5054 "partition = [%d,%d], spacing = %.4f\n",
5055 __kmp_gtid_from_thread(team->t.t_threads[f]),
5056 team->t.t_id, f, th->th.th_new_place,
5057 th->th.th_first_place, th->th.th_last_place, spacing));
5058 }
5059 }
5060 }
5061 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5062 } else {
5063 int S, rem, gap, s_count;
5064 S = n_th / n_places;
5065 s_count = 0;
5066 rem = n_th - (S * n_places);
5067 gap = rem > 0 ? n_places / rem : n_places;
5068 int place = masters_place;
5069 int gap_ct = gap;
5070 thidx = n_th;
5071 if (update_master_only == 1)
5072 thidx = 1;
5073 for (f = 0; f < thidx; f++) {
5074 kmp_info_t *th = team->t.t_threads[f];
5075 KMP_DEBUG_ASSERT(th != NULL);
5076
5077 __kmp_set_thread_place(team, th, place, place, place);
5078 s_count++;
5079
5080 if ((s_count == S) && rem && (gap_ct == gap)) {
5081 // do nothing, add an extra thread to place on next iteration
5082 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5083 // we added an extra thread to this place; move on to next place
5084 if (place == last_place) {
5085 place = first_place;
5086 } else if (place == (num_masks - 1)) {
5087 place = 0;
5088 } else {
5089 place++;
5090 }
5091 s_count = 0;
5092 gap_ct = 1;
5093 rem--;
5094 } else if (s_count == S) { // place is full; don't add extra thread
5095 if (place == last_place) {
5096 place = first_place;
5097 } else if (place == (num_masks - 1)) {
5098 place = 0;
5099 } else {
5100 place++;
5101 }
5102 gap_ct++;
5103 s_count = 0;
5104 }
5105
5106 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5107 "partition = [%d,%d]\n",
5108 __kmp_gtid_from_thread(team->t.t_threads[f]),
5109 team->t.t_id, f, th->th.th_new_place,
5110 th->th.th_first_place, th->th.th_last_place));
5111 }
5112 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5113 }
5114 } break;
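// Illustrative sketch (not part of the runtime): in the spread case that covers
// the full place list, each thread f gets a contiguous sub-partition whose
// bounds come from a fixed floating-point stride, roughly:
//
//   static void spread_bounds(int f, int masters_place, int n_th, int n_places,
//                             int *first, int *last) {
//     double spacing = (double)(n_places + 1) / (double)n_th;
//     double current = (double)masters_place + (double)f * spacing;
//     *first = (int)current;
//     *last = (int)(current + spacing) - 1;
//   }
//
// The loop above additionally wraps partitions that run past the end of the
// place list and clamps the edge cases covered by the KMP_DEBUG_ASSERTs;
// spread_bounds and its parameters are illustrative only.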
5115
5116 default:
5117 break;
5118 }
5119
5120 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5121}
5122
5123#endif // KMP_AFFINITY_SUPPORTED
5124
5125/* allocate a new team data structure to use. take one off of the free pool if
5126 available */
5127kmp_team_t *
5128__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5129#if OMPT_SUPPORT
5130 ompt_data_t ompt_parallel_data,
5131#endif
5132 kmp_proc_bind_t new_proc_bind,
5133 kmp_internal_control_t *new_icvs,
5134 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5135 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5136 int f;
5137 kmp_team_t *team;
5138 int use_hot_team = !root->r.r_active;
5139 int level = 0;
5140 int do_place_partition = 1;
5141
5142 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5143 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5144 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5145 KMP_MB();
5146
5147#if KMP_NESTED_HOT_TEAMS
5148 kmp_hot_team_ptr_t *hot_teams;
5149 if (master) {
5150 team = master->th.th_team;
5151 level = team->t.t_active_level;
5152 if (master->th.th_teams_microtask) { // in teams construct?
5153 if (master->th.th_teams_size.nteams > 1 &&
5154 ( // #teams > 1
5155 team->t.t_pkfn ==
5156 (microtask_t)__kmp_teams_master || // inner fork of the teams
5157 master->th.th_teams_level <
5158 team->t.t_level)) { // or nested parallel inside the teams
5159 ++level; // do not increment if #teams==1 or for the outer fork of the
5160 // teams; increment otherwise
5161 }
5162 // Do not perform the place partition if inner fork of the teams
5163 // Wait until nested parallel region encountered inside teams construct
5164 if ((master->th.th_teams_size.nteams == 1 &&
5165 master->th.th_teams_level >= team->t.t_level) ||
5166 (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5167 do_place_partition = 0;
5168 }
5169 hot_teams = master->th.th_hot_teams;
5170 if (level < __kmp_hot_teams_max_level && hot_teams &&
5171 hot_teams[level].hot_team) {
5172 // hot team has already been allocated for given level
5173 use_hot_team = 1;
5174 } else {
5175 use_hot_team = 0;
5176 }
5177 } else {
5178 // check we won't access uninitialized hot_teams, just in case
5179 KMP_DEBUG_ASSERT(new_nproc == 1);
5180 }
5181#endif
5182 // Optimization to use a "hot" team
5183 if (use_hot_team && new_nproc > 1) {
5184 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5185#if KMP_NESTED_HOT_TEAMS
5186 team = hot_teams[level].hot_team;
5187#else
5188 team = root->r.r_hot_team;
5189#endif
5190#if KMP_DEBUG
5191 if (__kmp_tasking_mode != tskm_immediate_exec) {
5192 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5193 "task_team[1] = %p before reinit\n",
5194 team->t.t_task_team[0], team->t.t_task_team[1]));
5195 }
5196#endif
5197
5198 if (team->t.t_nproc != new_nproc &&
5199 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5200 // Distributed barrier may need a resize
5201 int old_nthr = team->t.t_nproc;
5202 __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5203 }
5204
5205 // If not doing the place partition, then reset the team's proc bind
5206 // to indicate that partitioning of all threads still needs to take place
5207 if (do_place_partition == 0)
5208 team->t.t_proc_bind = proc_bind_default;
5209 // Has the number of threads changed?
5210 /* Let's assume the most common case is that the number of threads is
5211 unchanged, and put that case first. */
5212 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5213 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5214 // This case can mean that omp_set_num_threads() was called and the hot
5215 // team size was already reduced, so we check the special flag
5216 if (team->t.t_size_changed == -1) {
5217 team->t.t_size_changed = 1;
5218 } else {
5219 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5220 }
5221
5222 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5223 kmp_r_sched_t new_sched = new_icvs->sched;
5224 // set primary thread's schedule as new run-time schedule
5225 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5226
5227 __kmp_reinitialize_team(team, new_icvs,
5228 root->r.r_uber_thread->th.th_ident);
5229
5230 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5231 team->t.t_threads[0], team));
5232 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5233
5234#if KMP_AFFINITY_SUPPORTED
5235 if ((team->t.t_size_changed == 0) &&
5236 (team->t.t_proc_bind == new_proc_bind)) {
5237 if (new_proc_bind == proc_bind_spread) {
5238 if (do_place_partition) {
5239 // add flag to update only master for spread
5240 __kmp_partition_places(team, 1);
5241 }
5242 }
5243 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5244 "proc_bind = %d, partition = [%d,%d]\n",
5245 team->t.t_id, new_proc_bind, team->t.t_first_place,
5246 team->t.t_last_place));
5247 } else {
5248 if (do_place_partition) {
5249 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5250 __kmp_partition_places(team);
5251 }
5252 }
5253#else
5254 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5255#endif /* KMP_AFFINITY_SUPPORTED */
5256 } else if (team->t.t_nproc > new_nproc) {
5257 KA_TRACE(20,
5258 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5259 new_nproc));
5260
5261 team->t.t_size_changed = 1;
5262 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5263 // Barrier size already reduced earlier in this function
5264 // Activate team threads via th_used_in_team
5265 __kmp_add_threads_to_team(team, new_nproc);
5266 }
5267#if KMP_NESTED_HOT_TEAMS
5268 if (__kmp_hot_teams_mode == 0) {
5269 // AC: saved number of threads should correspond to team's value in this
5270 // mode, can be bigger in mode 1, when hot team has threads in reserve
5271 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5272 hot_teams[level].hot_team_nth = new_nproc;
5273#endif // KMP_NESTED_HOT_TEAMS
5274 /* release the extra threads we don't need any more */
5275 for (f = new_nproc; f < team->t.t_nproc; f++) {
5276 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5277 if (__kmp_tasking_mode != tskm_immediate_exec) {
5278 // When decreasing team size, threads no longer in the team should
5279 // unref task team.
5280 team->t.t_threads[f]->th.th_task_team = NULL;
5281 }
5282 __kmp_free_thread(team->t.t_threads[f]);
5283 team->t.t_threads[f] = NULL;
5284 }
5285#if KMP_NESTED_HOT_TEAMS
5286 } // (__kmp_hot_teams_mode == 0)
5287 else {
5288 // When keeping extra threads in team, switch threads to wait on own
5289 // b_go flag
5290 for (f = new_nproc; f < team->t.t_nproc; ++f) {
5291 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5292 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5293 for (int b = 0; b < bs_last_barrier; ++b) {
5294 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5295 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5296 }
5297 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5298 }
5299 }
5300 }
5301#endif // KMP_NESTED_HOT_TEAMS
5302 team->t.t_nproc = new_nproc;
5303 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5304 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5305 __kmp_reinitialize_team(team, new_icvs,
5306 root->r.r_uber_thread->th.th_ident);
5307
5308 // Update remaining threads
5309 for (f = 0; f < new_nproc; ++f) {
5310 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5311 }
5312
5313 // restore the current task state of the primary thread: should be the
5314 // implicit task
5315 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5316 team->t.t_threads[0], team));
5317
5318 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5319
5320#ifdef KMP_DEBUG
5321 for (f = 0; f < team->t.t_nproc; f++) {
5322 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5323 team->t.t_threads[f]->th.th_team_nproc ==
5324 team->t.t_nproc);
5325 }
5326#endif
5327
5328 if (do_place_partition) {
5329 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5330#if KMP_AFFINITY_SUPPORTED
5331 __kmp_partition_places(team);
5332#endif
5333 }
5334 } else { // team->t.t_nproc < new_nproc
5335
5336 KA_TRACE(20,
5337 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5338 new_nproc));
5339 int old_nproc = team->t.t_nproc; // save old value and use to update only
5340 team->t.t_size_changed = 1;
5341
5342#if KMP_NESTED_HOT_TEAMS
5343 int avail_threads = hot_teams[level].hot_team_nth;
5344 if (new_nproc < avail_threads)
5345 avail_threads = new_nproc;
5346 kmp_info_t **other_threads = team->t.t_threads;
5347 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5348 // Adjust barrier data of reserved threads (if any) of the team
5349 // Other data will be set in __kmp_initialize_info() below.
5350 int b;
5351 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5352 for (b = 0; b < bs_last_barrier; ++b) {
5353 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5354 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5355#if USE_DEBUGGER
5356 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5357#endif
5358 }
5359 }
5360 if (hot_teams[level].hot_team_nth >= new_nproc) {
5361 // we have all needed threads in reserve, no need to allocate any
5362 // this is only possible in mode 1; there cannot be reserved threads in mode 0
5363 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5364 team->t.t_nproc = new_nproc; // just get reserved threads involved
5365 } else {
5366 // We may have some threads in reserve, but not enough;
5367 // get reserved threads involved if any.
5368 team->t.t_nproc = hot_teams[level].hot_team_nth;
5369 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5370#endif // KMP_NESTED_HOT_TEAMS
5371 if (team->t.t_max_nproc < new_nproc) {
5372 /* reallocate larger arrays */
5373 __kmp_reallocate_team_arrays(team, new_nproc);
5374 __kmp_reinitialize_team(team, new_icvs, NULL);
5375 }
5376
5377#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5378 /* Temporarily set full mask for primary thread before creation of
5379 workers. The reason is that workers inherit the affinity from the
5380 primary thread, so if a lot of workers are created quickly on a single
5381 core, they don't get a chance to set their own affinity for
5382 a long time. */
5383 kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5384#endif
5385
5386 /* allocate new threads for the hot team */
5387 for (f = team->t.t_nproc; f < new_nproc; f++) {
5388 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5389 KMP_DEBUG_ASSERT(new_worker);
5390 team->t.t_threads[f] = new_worker;
5391
5392 KA_TRACE(20,
5393 ("__kmp_allocate_team: team %d init T#%d arrived: "
5394 "join=%llu, plain=%llu\n",
5395 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5396 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5397 team->t.t_bar[bs_plain_barrier].b_arrived));
5398
5399 { // Initialize barrier data for new threads.
5400 int b;
5401 kmp_balign_t *balign = new_worker->th.th_bar;
5402 for (b = 0; b < bs_last_barrier; ++b) {
5403 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5404 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5405 KMP_BARRIER_PARENT_FLAG);
5406#if USE_DEBUGGER
5407 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5408#endif
5409 }
5410 }
5411 }
5412
5413#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5414 /* Restore initial primary thread's affinity mask */
5415 new_temp_affinity.restore();
5416#endif
5417#if KMP_NESTED_HOT_TEAMS
5418 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5419#endif // KMP_NESTED_HOT_TEAMS
5420 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5421 // Barrier size already increased earlier in this function
5422 // Activate team threads via th_used_in_team
5423 __kmp_add_threads_to_team(team, new_nproc);
5424 }
5425 /* make sure everyone is synchronized */
5426 // new threads below
5427 __kmp_initialize_team(team, new_nproc, new_icvs,
5428 root->r.r_uber_thread->th.th_ident);
5429
5430 /* reinitialize the threads */
5431 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5432 for (f = 0; f < team->t.t_nproc; ++f)
5433 __kmp_initialize_info(team->t.t_threads[f], team, f,
5434 __kmp_gtid_from_tid(f, team));
5435
5436 // set th_task_state for new threads in the hot team from an existing thread's state
5437 kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5438 for (f = old_nproc; f < team->t.t_nproc; ++f)
5439 team->t.t_threads[f]->th.th_task_state = old_state;
5440
5441#ifdef KMP_DEBUG
5442 for (f = 0; f < team->t.t_nproc; ++f) {
5443 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5444 team->t.t_threads[f]->th.th_team_nproc ==
5445 team->t.t_nproc);
5446 }
5447#endif
5448
5449 if (do_place_partition) {
5450 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5451#if KMP_AFFINITY_SUPPORTED
5452 __kmp_partition_places(team);
5453#endif
5454 }
5455 } // Check changes in number of threads
5456
5457 kmp_info_t *master = team->t.t_threads[0];
5458 if (master->th.th_teams_microtask) {
5459 for (f = 1; f < new_nproc; ++f) {
5460 // propagate teams construct specific info to workers
5461 kmp_info_t *thr = team->t.t_threads[f];
5462 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5463 thr->th.th_teams_level = master->th.th_teams_level;
5464 thr->th.th_teams_size = master->th.th_teams_size;
5465 }
5466 }
5467#if KMP_NESTED_HOT_TEAMS
5468 if (level) {
5469 // Sync barrier state for nested hot teams, not needed for outermost hot
5470 // team.
5471 for (f = 1; f < new_nproc; ++f) {
5472 kmp_info_t *thr = team->t.t_threads[f];
5473 int b;
5474 kmp_balign_t *balign = thr->th.th_bar;
5475 for (b = 0; b < bs_last_barrier; ++b) {
5476 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5477 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5478#if USE_DEBUGGER
5479 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5480#endif
5481 }
5482 }
5483 }
5484#endif // KMP_NESTED_HOT_TEAMS
5485
5486 /* reallocate space for arguments if necessary */
5487 __kmp_alloc_argv_entries(argc, team, TRUE);
5488 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5489 // The hot team re-uses the previous task team,
5490 // if untouched during the previous release->gather phase.
5491
5492 KF_TRACE(10, (" hot_team = %p\n", team));
5493
5494#if KMP_DEBUG
5495 if (__kmp_tasking_mode != tskm_immediate_exec) {
5496 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5497 "task_team[1] = %p after reinit\n",
5498 team->t.t_task_team[0], team->t.t_task_team[1]));
5499 }
5500#endif
5501
5502#if OMPT_SUPPORT
5503 __ompt_team_assign_id(team, ompt_parallel_data);
5504#endif
5505
5506 KMP_MB();
5507
5508 return team;
5509 }
5510
5511 /* next, let's try to take one from the team pool */
5512 KMP_MB();
5513 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5514 /* TODO: consider resizing undersized teams instead of reaping them, now
5515 that we have a resizing mechanism */
5516 if (team->t.t_max_nproc >= max_nproc) {
5517 /* take this team from the team pool */
5518 __kmp_team_pool = team->t.t_next_pool;
5519
5520 if (max_nproc > 1 &&
5521 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5522 if (!team->t.b) { // Allocate barrier structure
5523 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5524 }
5525 }
5526
5527 /* setup the team for fresh use */
5528 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5529
5530 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5531 "task_team[1] %p to NULL\n",
5532 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5533 team->t.t_task_team[0] = NULL;
5534 team->t.t_task_team[1] = NULL;
5535
5536 /* reallocate space for arguments if necessary */
5537 __kmp_alloc_argv_entries(argc, team, TRUE);
5538 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5539
5540 KA_TRACE(
5541 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5542 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5543 { // Initialize barrier data.
5544 int b;
5545 for (b = 0; b < bs_last_barrier; ++b) {
5546 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5547#if USE_DEBUGGER
5548 team->t.t_bar[b].b_master_arrived = 0;
5549 team->t.t_bar[b].b_team_arrived = 0;
5550#endif
5551 }
5552 }
5553
5554 team->t.t_proc_bind = new_proc_bind;
5555
5556 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5557 team->t.t_id));
5558
5559#if OMPT_SUPPORT
5560 __ompt_team_assign_id(team, ompt_parallel_data);
5561#endif
5562
5563 KMP_MB();
5564
5565 return team;
5566 }
5567
5568 /* reap team if it is too small, then loop back and check the next one */
5569 // not sure if this is wise, but this will be redone during the hot-teams
5570 // rewrite.
5571 /* TODO: use a technique to find the right-size hot team; don't reap them */
5572 team = __kmp_reap_team(team);
5573 __kmp_team_pool = team;
5574 }
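// Illustrative sketch (not part of the runtime): the loop above walks the
// singly linked pool of parked teams, claiming the first one whose capacity is
// sufficient and reaping undersized teams as it goes. In outline (hypothetical
// types and names):
//
//   struct pooled_team { int max_nproc; struct pooled_team *next; };
//
//   static pooled_team *claim_or_reap(pooled_team **pool, int need,
//                                     pooled_team *(*reap)(pooled_team *)) {
//     pooled_team *t = *pool;
//     while (t) {
//       if (t->max_nproc >= need) { // big enough: take it off the pool
//         *pool = t->next;
//         return t;
//       }
//       t = *pool = reap(t); // too small: free it, continue with its successor
//     }
//     return NULL; // nothing usable; the caller allocates a fresh team
//   }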
5575
5576 /* nothing available in the pool, no matter, make a new team! */
5577 KMP_MB();
5578 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5579
5580 /* and set it up */
5581 team->t.t_max_nproc = max_nproc;
5582 if (max_nproc > 1 &&
5583 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5584 // Allocate barrier structure
5585 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5586 }
5587
5588 /* NOTE well, for some reason allocating one big buffer and dividing it up
5589 seems to really hurt performance a lot on the P4, so, let's not use this */
5590 __kmp_allocate_team_arrays(team, max_nproc);
5591
5592 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5593 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5594
5595 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5596 "%p to NULL\n",
5597 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5598 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5599 // memory, no need to duplicate
5600 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5601 // memory, no need to duplicate
5602
5603 if (__kmp_storage_map) {
5604 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5605 }
5606
5607 /* allocate space for arguments */
5608 __kmp_alloc_argv_entries(argc, team, FALSE);
5609 team->t.t_argc = argc;
5610
5611 KA_TRACE(20,
5612 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5613 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5614 { // Initialize barrier data.
5615 int b;
5616 for (b = 0; b < bs_last_barrier; ++b) {
5617 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5618#if USE_DEBUGGER
5619 team->t.t_bar[b].b_master_arrived = 0;
5620 team->t.t_bar[b].b_team_arrived = 0;
5621#endif
5622 }
5623 }
5624
5625 team->t.t_proc_bind = new_proc_bind;
5626
5627#if OMPT_SUPPORT
5628 __ompt_team_assign_id(team, ompt_parallel_data);
5629 team->t.ompt_serialized_team_info = NULL;
5630#endif
5631
5632 KMP_MB();
5633
5634 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5635 team->t.t_id));
5636
5637 return team;
5638}
5639
5640/* TODO implement hot-teams at all levels */
5641/* TODO implement lazy thread release on demand (disband request) */
5642
5643/* free the team. return it to the team pool. release all the threads
5644 * associated with it */
5645void __kmp_free_team(kmp_root_t *root,
5646 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5647 int f;
5648 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5649 team->t.t_id));
5650
5651 /* verify state */
5652 KMP_DEBUG_ASSERT(root);
5653 KMP_DEBUG_ASSERT(team);
5654 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5655 KMP_DEBUG_ASSERT(team->t.t_threads);
5656
5657 int use_hot_team = team == root->r.r_hot_team;
5658#if KMP_NESTED_HOT_TEAMS
5659 int level;
5660 if (master) {
5661 level = team->t.t_active_level - 1;
5662 if (master->th.th_teams_microtask) { // in teams construct?
5663 if (master->th.th_teams_size.nteams > 1) {
5664 ++level; // level was not increased in teams construct for
5665 // team_of_masters
5666 }
5667 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5668 master->th.th_teams_level == team->t.t_level) {
5669 ++level; // level was not increased in teams construct for
5670 // team_of_workers before the parallel
5671 } // team->t.t_level will be increased inside parallel
5672 }
5673#if KMP_DEBUG
5674 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5675#endif
5676 if (level < __kmp_hot_teams_max_level) {
5677 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5678 use_hot_team = 1;
5679 }
5680 }
5681#endif // KMP_NESTED_HOT_TEAMS
5682
5683 /* team is done working */
5684 TCW_SYNC_PTR(team->t.t_pkfn,
5685 NULL); // Important for Debugging Support Library.
5686#if KMP_OS_WINDOWS
5687 team->t.t_copyin_counter = 0; // init counter for possible reuse
5688#endif
5689 // Do not reset pointer to parent team to NULL for hot teams.
5690
5691 /* if we are non-hot team, release our threads */
5692 if (!use_hot_team) {
5693 if (__kmp_tasking_mode != tskm_immediate_exec) {
5694 // Wait for threads to reach reapable state
5695 for (f = 1; f < team->t.t_nproc; ++f) {
5696 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5697 kmp_info_t *th = team->t.t_threads[f];
5698 volatile kmp_uint32 *state = &th->th.th_reap_state;
5699 while (*state != KMP_SAFE_TO_REAP) {
5700#if KMP_OS_WINDOWS
5701 // On Windows a thread can be killed at any time, check this
5702 DWORD ecode;
5703 if (!__kmp_is_thread_alive(th, &ecode)) {
5704 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5705 break;
5706 }
5707#endif
5708 // first check if thread is sleeping
5709 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5710 if (fl.is_sleeping())
5711 fl.resume(__kmp_gtid_from_thread(th));
5712 KMP_CPU_PAUSE();
5713 }
5714 }
5715
5716 // Delete task teams
5717 int tt_idx;
5718 for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5719 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5720 if (task_team != NULL) {
5721 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5722 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5723 team->t.t_threads[f]->th.th_task_team = NULL;
5724 }
5725 KA_TRACE(
5726 20,
5727 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5728 __kmp_get_gtid(), task_team, team->t.t_id));
5729#if KMP_NESTED_HOT_TEAMS
5730 __kmp_free_task_team(master, task_team);
5731#endif
5732 team->t.t_task_team[tt_idx] = NULL;
5733 }
5734 }
5735 }
5736
5737 // Reset pointer to parent team only for non-hot teams.
5738 team->t.t_parent = NULL;
5739 team->t.t_level = 0;
5740 team->t.t_active_level = 0;
5741
5742 /* free the worker threads */
5743 for (f = 1; f < team->t.t_nproc; ++f) {
5744 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5745 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5746 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5747 1, 2);
5748 }
5749 __kmp_free_thread(team->t.t_threads[f]);
5750 }
5751
5752 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5753 if (team->t.b) {
5754 // wake up thread at old location
5755 team->t.b->go_release();
5756 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5757 for (f = 1; f < team->t.t_nproc; ++f) {
5758 if (team->t.b->sleep[f].sleep) {
5759 __kmp_atomic_resume_64(
5760 team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5761 (kmp_atomic_flag_64<> *)NULL);
5762 }
5763 }
5764 }
5765 // Wait for threads to be removed from team
5766 for (int f = 1; f < team->t.t_nproc; ++f) {
5767 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5768 KMP_CPU_PAUSE();
5769 }
5770 }
5771 }
5772
5773 for (f = 1; f < team->t.t_nproc; ++f) {
5774 team->t.t_threads[f] = NULL;
5775 }
5776
5777 if (team->t.t_max_nproc > 1 &&
5778 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5779 distributedBarrier::deallocate(team->t.b);
5780 team->t.b = NULL;
5781 }
5782 /* put the team back in the team pool */
5783 /* TODO limit size of team pool, call reap_team if pool too large */
5784 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5785 __kmp_team_pool = (volatile kmp_team_t *)team;
5786 } else { // Check if team was created for primary threads in teams construct
5787 // See if first worker is a CG root
5788 KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5789 team->t.t_threads[1]->th.th_cg_roots);
5790 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5791 // Clean up the CG root nodes on workers so that this team can be re-used
5792 for (f = 1; f < team->t.t_nproc; ++f) {
5793 kmp_info_t *thr = team->t.t_threads[f];
5794 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5795 thr->th.th_cg_roots->cg_root == thr);
5796 // Pop current CG root off list
5797 kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5798 thr->th.th_cg_roots = tmp->up;
5799 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5800 " up to node %p. cg_nthreads was %d\n",
5801 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5802 int i = tmp->cg_nthreads--;
5803 if (i == 1) {
5804 __kmp_free(tmp); // free CG if we are the last thread in it
5805 }
5806 // Restore current task's thread_limit from CG root
5807 if (thr->th.th_cg_roots)
5808 thr->th.th_current_task->td_icvs.thread_limit =
5809 thr->th.th_cg_roots->cg_thread_limit;
5810 }
5811 }
5812 }
5813
5814 KMP_MB();
5815}
5816
5817/* reap the team. destroy it, reclaim all its resources and free its memory */
5818kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5819 kmp_team_t *next_pool = team->t.t_next_pool;
5820
5821 KMP_DEBUG_ASSERT(team);
5822 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5823 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5824 KMP_DEBUG_ASSERT(team->t.t_threads);
5825 KMP_DEBUG_ASSERT(team->t.t_argv);
5826
5827 /* TODO clean the threads that are a part of this? */
5828
5829 /* free stuff */
5830 __kmp_free_team_arrays(team);
5831 if (team->t.t_argv != &team->t.t_inline_argv[0])
5832 __kmp_free((void *)team->t.t_argv);
5833 __kmp_free(team);
5834
5835 KMP_MB();
5836 return next_pool;
5837}
5838
5839// Free the thread. Don't reap it, just place it on the pool of available
5840// threads.
5841//
5842// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5843// binding for the affinity mechanism to be useful.
5844//
5845// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5846// However, we want to avoid a potential performance problem by always
5847// scanning through the list to find the correct point at which to insert
5848// the thread (potential N**2 behavior). To do this we keep track of the
5849// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5850// With single-level parallelism, threads will always be added to the tail
5851// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5852// parallelism, all bets are off and we may need to scan through the entire
5853// free list.
5854//
5855// This change also has a potentially large performance benefit, for some
5856// applications. Previously, as threads were freed from the hot team, they
5857// would be placed back on the free list in inverse order. If the hot team
5858 // grew back to its original size, then the freed threads would be placed
5859 // back on the hot team in reverse order. This could cause bad cache
5860 // locality problems in programs where the size of the hot team regularly
5861 // grew and shrank.
5862//
5863// Now, for single-level parallelism, the OMP tid is always == gtid.
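// Illustrative sketch (not part of the runtime): the insertion implemented
// below is a sorted singly-linked-list insert with a cached hint, so the common
// single-level case appends in O(1) while nested parallelism may fall back to a
// scan from the head. Hypothetical standalone version:
//
//   struct pool_node { int gtid; struct pool_node *next; };
//
//   static void insert_sorted(pool_node **head, pool_node **hint,
//                             pool_node *node) {
//     if (*hint && (*hint)->gtid > node->gtid)
//       *hint = NULL; // hint is already past the insert point; rescan
//     pool_node **scan = *hint ? &(*hint)->next : head;
//     while (*scan && (*scan)->gtid < node->gtid)
//       scan = &(*scan)->next;
//     node->next = *scan; // splice in, keeping the list sorted by gtid
//     *hint = *scan = node; // remember the insert point for next time
//   }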
5864void __kmp_free_thread(kmp_info_t *this_th) {
5865 int gtid;
5866 kmp_info_t **scan;
5867
5868 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5869 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5870
5871 KMP_DEBUG_ASSERT(this_th);
5872
5873 // When moving the thread to the pool, switch it to wait on its own b_go flag,
5874 // and mark its barrier team as uninitialized (NULL).
5875 int b;
5876 kmp_balign_t *balign = this_th->th.th_bar;
5877 for (b = 0; b < bs_last_barrier; ++b) {
5878 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5879 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5880 balign[b].bb.team = NULL;
5881 balign[b].bb.leaf_kids = 0;
5882 }
5883 this_th->th.th_task_state = 0;
5884 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5885
5886 /* put thread back on the free pool */
5887 TCW_PTR(this_th->th.th_team, NULL);
5888 TCW_PTR(this_th->th.th_root, NULL);
5889 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5890
5891 while (this_th->th.th_cg_roots) {
5892 this_th->th.th_cg_roots->cg_nthreads--;
5893 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5894 " %p of thread %p to %d\n",
5895 this_th, this_th->th.th_cg_roots,
5896 this_th->th.th_cg_roots->cg_root,
5897 this_th->th.th_cg_roots->cg_nthreads));
5898 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5899 if (tmp->cg_root == this_th) { // Thread is a cg_root
5900 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5901 KA_TRACE(
5902 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5903 this_th->th.th_cg_roots = tmp->up;
5904 __kmp_free(tmp);
5905 } else { // Worker thread
5906 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5907 __kmp_free(tmp);
5908 }
5909 this_th->th.th_cg_roots = NULL;
5910 break;
5911 }
5912 }
5913
5914 /* If the implicit task assigned to this thread can be used by other threads,
5915 * multiple threads may share the data and try to free the task in
5916 * __kmp_reap_thread at exit. This duplicate use of the task data is more
5917 * likely when the hot team is disabled, but it can occur even when the hot
5918 * team is enabled */
5919 __kmp_free_implicit_task(this_th);
5920 this_th->th.th_current_task = NULL;
5921
5922 // If the __kmp_thread_pool_insert_pt is already past the new insert
5923 // point, then we need to re-scan the entire list.
5924 gtid = this_th->th.th_info.ds.ds_gtid;
5925 if (__kmp_thread_pool_insert_pt != NULL) {
5926 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5927 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5928 __kmp_thread_pool_insert_pt = NULL;
5929 }
5930 }
5931
5932 // Scan down the list to find the place to insert the thread.
5933 // scan is the address of a link in the list, possibly the address of
5934 // __kmp_thread_pool itself.
5935 //
5936 // In the absence of nested parallelism, the for loop will have 0 iterations.
5937 if (__kmp_thread_pool_insert_pt != NULL) {
5938 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5939 } else {
5940 scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5941 }
5942 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5943 scan = &((*scan)->th.th_next_pool))
5944 ;
5945
5946 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5947 // to its address.
5948 TCW_PTR(this_th->th.th_next_pool, *scan);
5949 __kmp_thread_pool_insert_pt = *scan = this_th;
5950 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5951 (this_th->th.th_info.ds.ds_gtid <
5952 this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5953 TCW_4(this_th->th.th_in_pool, TRUE);
5954 __kmp_suspend_initialize_thread(this_th);
5955 __kmp_lock_suspend_mx(this_th);
5956 if (this_th->th.th_active == TRUE) {
5957 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5958 this_th->th.th_active_in_pool = TRUE;
5959 }
5960#if KMP_DEBUG
5961 else {
5962 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5963 }
5964#endif
5965 __kmp_unlock_suspend_mx(this_th);
5966
5967 TCW_4(__kmp_nth, __kmp_nth - 1);
5968
5969#ifdef KMP_ADJUST_BLOCKTIME
5970 /* Adjust blocktime back to user setting or default if necessary */
5971 /* Middle initialization might never have occurred */
5972 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5973 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5974 if (__kmp_nth <= __kmp_avail_proc) {
5975 __kmp_zero_bt = FALSE;
5976 }
5977 }
5978#endif /* KMP_ADJUST_BLOCKTIME */
5979
5980 KMP_MB();
5981}
5982
5983/* ------------------------------------------------------------------------ */
5984
5985void *__kmp_launch_thread(kmp_info_t *this_thr) {
5986#if OMP_PROFILING_SUPPORT
5987 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5988 // TODO: add a configuration option for time granularity
5989 if (ProfileTraceFile)
5990 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5991#endif
5992
5993 int gtid = this_thr->th.th_info.ds.ds_gtid;
5994 /* void *stack_data;*/
5995 kmp_team_t **volatile pteam;
5996
5997 KMP_MB();
5998 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5999
6000 if (__kmp_env_consistency_check) {
6001 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6002 }
6003
6004#if OMPD_SUPPORT
6005 if (ompd_state & OMPD_ENABLE_BP)
6006 ompd_bp_thread_begin();
6007#endif
6008
6009#if OMPT_SUPPORT
6010 ompt_data_t *thread_data = nullptr;
6011 if (ompt_enabled.enabled) {
6012 thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6013 *thread_data = ompt_data_none;
6014
6015 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6016 this_thr->th.ompt_thread_info.wait_id = 0;
6017 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6018 this_thr->th.ompt_thread_info.parallel_flags = 0;
6019 if (ompt_enabled.ompt_callback_thread_begin) {
6020 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6021 ompt_thread_worker, thread_data);
6022 }
6023 this_thr->th.ompt_thread_info.state = ompt_state_idle;
6024 }
6025#endif
6026
6027 /* This is the place where threads wait for work */
6028 while (!TCR_4(__kmp_global.g.g_done)) {
6029 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6030 KMP_MB();
6031
6032 /* wait for work to do */
6033 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6034
6035 /* No tid yet since not part of a team */
6036 __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6037
6038#if OMPT_SUPPORT
6039 if (ompt_enabled.enabled) {
6040 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6041 }
6042#endif
6043
6044 pteam = &this_thr->th.th_team;
6045
6046 /* have we been allocated? */
6047 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6048 /* we were just woken up, so run our new task */
6049 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6050 int rc;
6051 KA_TRACE(20,
6052 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6053 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6054 (*pteam)->t.t_pkfn));
6055
6056 updateHWFPControl(*pteam);
6057
6058#if OMPT_SUPPORT
6059 if (ompt_enabled.enabled) {
6060 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6061 }
6062#endif
6063
6064 rc = (*pteam)->t.t_invoke(gtid);
6065 KMP_ASSERT(rc);
6066
6067 KMP_MB();
6068 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6069 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6070 (*pteam)->t.t_pkfn));
6071 }
6072#if OMPT_SUPPORT
6073 if (ompt_enabled.enabled) {
6074 /* no frame set while outside task */
6075 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6076
6077 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6078 }
6079#endif
6080 /* join barrier after parallel region */
6081 __kmp_join_barrier(gtid);
6082 }
6083 }
6084
6085#if OMPD_SUPPORT
6086 if (ompd_state & OMPD_ENABLE_BP)
6087 ompd_bp_thread_end();
6088#endif
6089
6090#if OMPT_SUPPORT
6091 if (ompt_enabled.ompt_callback_thread_end) {
6092 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6093 }
6094#endif
6095
6096 this_thr->th.th_task_team = NULL;
6097 /* run the destructors for the threadprivate data for this thread */
6098 __kmp_common_destroy_gtid(gtid);
6099
6100 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6101 KMP_MB();
6102
6103#if OMP_PROFILING_SUPPORT
6104 llvm::timeTraceProfilerFinishThread();
6105#endif
6106 return this_thr;
6107}
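// Illustrative sketch (not part of the runtime): stripped of tracing, OMPT/OMPD
// hooks and consistency checks, a worker's life cycle in __kmp_launch_thread()
// reduces to the outline below. The helper names are placeholders for the
// fork/join barriers and the microtask invocation used above.
//
//   while (!library_shutting_down()) {       // TCR_4(__kmp_global.g.g_done)
//     wait_at_fork_barrier();                // park until a team releases us
//     if (assigned_to_team_with_work()) {    // *pteam and t_pkfn are set
//       invoke_microtask();                  // (*pteam)->t.t_invoke(gtid)
//       wait_at_join_barrier();              // rejoin the team, then park again
//     }
//   }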
6108
6109/* ------------------------------------------------------------------------ */
6110
6111void __kmp_internal_end_dest(void *specific_gtid) {
6112 // Make sure no significant bits are lost
6113 int gtid;
6114 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6115
6116 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6117 /* NOTE: the gtid is stored as gtid+1 in thread-local storage;
6118 * this is because 0 is reserved for the nothing-stored case */
6119
6120 __kmp_internal_end_thread(gtid);
6121}
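// Illustrative sketch (not part of the runtime): because a NULL value in
// thread-local storage must mean "nothing stored", the gtid is biased by +1 on
// the way in and un-biased on the way out. Hypothetical helpers:
//
//   #include <cstdint>
//   static void *encode_gtid(int gtid) {
//     return (void *)(intptr_t)(gtid + 1); // never NULL for gtid >= 0
//   }
//   static int decode_gtid(void *tls_value) {
//     return (int)(intptr_t)tls_value - 1; // NULL decodes to -1, i.e. "none"
//   }
//
// __kmp_internal_end_dest above performs the decode step (subtracting 1)
// before shutting the thread down.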
6122
6123#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6124
6125__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6126 __kmp_internal_end_atexit();
6127}
6128
6129#endif
6130
6131/* [Windows] josh: when the atexit handler is called, there may still be more
6132 than one thread alive */
6133void __kmp_internal_end_atexit(void) {
6134 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6135 /* [Windows]
6136 josh: ideally, we want to completely shutdown the library in this atexit
6137 handler, but stat code that depends on thread specific data for gtid fails
6138 because that data becomes unavailable at some point during the shutdown, so
6139 we call __kmp_internal_end_thread instead. We should eventually remove the
6140 dependency on __kmp_get_specific_gtid in the stat code and use
6141 __kmp_internal_end_library to cleanly shutdown the library.
6142
6143 // TODO: Can some of this comment about GVS be removed?
6144 I suspect that the offending stat code is executed when the calling thread
6145 tries to clean up a dead root thread's data structures, resulting in GVS
6146 code trying to close the GVS structures for that thread, but since the stat
6147 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6148 the calling thread is cleaning up itself instead of another thread, it get
6149 confused. This happens because allowing a thread to unregister and cleanup
6150 another thread is a recent modification for addressing an issue.
6151 Based on the current design (20050722), a thread may end up
6152 trying to unregister another thread only if thread death does not trigger
6153 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6154 thread specific data destructor function to detect thread death. For
6155 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6156 is nothing. Thus, the workaround is applicable only for Windows static
6157 stat library. */
6158 __kmp_internal_end_library(-1);
6159#if KMP_OS_WINDOWS
6160 __kmp_close_console();
6161#endif
6162}
6163
6164static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6165 // It is assumed __kmp_forkjoin_lock is acquired.
6166
6167 int gtid;
6168
6169 KMP_DEBUG_ASSERT(thread != NULL);
6170
6171 gtid = thread->th.th_info.ds.ds_gtid;
6172
6173 if (!is_root) {
6174 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6175 /* Assume the threads are at the fork barrier here */
6176 KA_TRACE(
6177 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6178 gtid));
6179 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6180 while (
6181 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6182 KMP_CPU_PAUSE();
6183 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6184 } else {
6185 /* Need release fence here to prevent seg faults for tree forkjoin
6186 barrier (GEH) */
6187 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6188 thread);
6189 __kmp_release_64(&flag);
6190 }
6191 }
6192
6193 // Terminate OS thread.
6194 __kmp_reap_worker(thread);
6195
6196 // The thread was killed asynchronously. If it was actively
6197 // spinning in the thread pool, decrement the global count.
6198 //
6199 // There is a small timing hole here - if the worker thread was just waking
6200 // up after sleeping in the pool, had reset its th_active_in_pool flag but
6201 // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6202 // the global counter might not get updated.
6203 //
6204 // Currently, this can only happen as the library is unloaded,
6205 // so there are no harmful side effects.
6206 if (thread->th.th_active_in_pool) {
6207 thread->th.th_active_in_pool = FALSE;
6208 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6209 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6210 }
6211 }
6212
6213 __kmp_free_implicit_task(thread);
6214
6215// Free the fast memory for tasking
6216#if USE_FAST_MEMORY
6217 __kmp_free_fast_memory(thread);
6218#endif /* USE_FAST_MEMORY */
6219
6220 __kmp_suspend_uninitialize_thread(thread);
6221
6222 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6223 TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6224
6225 --__kmp_all_nth;
6226 // __kmp_nth was decremented when the thread was added to the pool.
6227
6228#ifdef KMP_ADJUST_BLOCKTIME
6229 /* Adjust blocktime back to user setting or default if necessary */
6230 /* Middle initialization might never have occurred */
6231 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6232 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6233 if (__kmp_nth <= __kmp_avail_proc) {
6234 __kmp_zero_bt = FALSE;
6235 }
6236 }
6237#endif /* KMP_ADJUST_BLOCKTIME */
6238
6239 /* free the memory being used */
6240 if (__kmp_env_consistency_check) {
6241 if (thread->th.th_cons) {
6242 __kmp_free_cons_stack(thread->th.th_cons);
6243 thread->th.th_cons = NULL;
6244 }
6245 }
6246
6247 if (thread->th.th_pri_common != NULL) {
6248 __kmp_free(thread->th.th_pri_common);
6249 thread->th.th_pri_common = NULL;
6250 }
6251
6252 if (thread->th.th_task_state_memo_stack != NULL) {
6253 __kmp_free(thread->th.th_task_state_memo_stack);
6254 thread->th.th_task_state_memo_stack = NULL;
6255 }
6256
6257#if KMP_USE_BGET
6258 if (thread->th.th_local.bget_data != NULL) {
6259 __kmp_finalize_bget(thread);
6260 }
6261#endif
6262
6263#if KMP_AFFINITY_SUPPORTED
6264 if (thread->th.th_affin_mask != NULL) {
6265 KMP_CPU_FREE(thread->th.th_affin_mask);
6266 thread->th.th_affin_mask = NULL;
6267 }
6268#endif /* KMP_AFFINITY_SUPPORTED */
6269
6270#if KMP_USE_HIER_SCHED
6271 if (thread->th.th_hier_bar_data != NULL) {
6272 __kmp_free(thread->th.th_hier_bar_data);
6273 thread->th.th_hier_bar_data = NULL;
6274 }
6275#endif
6276
6277 __kmp_reap_team(thread->th.th_serial_team);
6278 thread->th.th_serial_team = NULL;
6279 __kmp_free(thread);
6280
6281 KMP_MB();
6282
6283} // __kmp_reap_thread
6284
6285static void __kmp_itthash_clean(kmp_info_t *th) {
6286#if USE_ITT_NOTIFY
6287 if (__kmp_itt_region_domains.count > 0) {
6288 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6289 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6290 while (bucket) {
6291 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6292 __kmp_thread_free(th, bucket);
6293 bucket = next;
6294 }
6295 }
6296 }
6297 if (__kmp_itt_barrier_domains.count > 0) {
6298 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6299 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6300 while (bucket) {
6301 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6302 __kmp_thread_free(th, bucket);
6303 bucket = next;
6304 }
6305 }
6306 }
6307#endif
6308}
6309
6310static void __kmp_internal_end(void) {
6311 int i;
6312
6313 /* First, unregister the library */
6314 __kmp_unregister_library();
6315
6316#if KMP_OS_WINDOWS
6317 /* In Win static library, we can't tell when a root actually dies, so we
6318 reclaim the data structures for any root threads that have died but not
6319 unregistered themselves, in order to shut down cleanly.
6320 In Win dynamic library we also can't tell when a thread dies. */
6321 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6322// dead roots
6323#endif
6324
6325 for (i = 0; i < __kmp_threads_capacity; i++)
6326 if (__kmp_root[i])
6327 if (__kmp_root[i]->r.r_active)
6328 break;
6329 KMP_MB(); /* Flush all pending memory write invalidates. */
6330 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6331
6332 if (i < __kmp_threads_capacity) {
6333#if KMP_USE_MONITOR
6334 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6335 KMP_MB(); /* Flush all pending memory write invalidates. */
6336
6337 // Need to check that monitor was initialized before reaping it. If we are
6338 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6339 // __kmp_monitor will appear to contain valid data, but it is only valid in
6340 // the parent process, not the child.
6341 // New behavior (201008): instead of keying off of the flag
6342 // __kmp_init_parallel, the monitor thread creation is keyed off
6343 // of the new flag __kmp_init_monitor.
6344 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6345 if (TCR_4(__kmp_init_monitor)) {
6346 __kmp_reap_monitor(&__kmp_monitor);
6347 TCW_4(__kmp_init_monitor, 0);
6348 }
6349 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6350 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6351#endif // KMP_USE_MONITOR
6352 } else {
6353/* TODO move this to cleanup code */
6354#ifdef KMP_DEBUG
6355 /* make sure that everything has properly ended */
6356 for (i = 0; i < __kmp_threads_capacity; i++) {
6357 if (__kmp_root[i]) {
6358 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6359 // there can be uber threads alive here
6360 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6361 }
6362 }
6363#endif
6364
6365 KMP_MB();
6366
6367 // Reap the worker threads.
6368 // This is valid for now, but be careful if threads are reaped sooner.
6369 while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6370 // Get the next thread from the pool.
6371 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6372 __kmp_thread_pool = thread->th.th_next_pool;
6373 // Reap it.
6374 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6375 thread->th.th_next_pool = NULL;
6376 thread->th.th_in_pool = FALSE;
6377 __kmp_reap_thread(thread, 0);
6378 }
6379 __kmp_thread_pool_insert_pt = NULL;
6380
6381 // Reap teams.
6382 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6383 // Get the next team from the pool.
6384 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6385 __kmp_team_pool = team->t.t_next_pool;
6386 // Reap it.
6387 team->t.t_next_pool = NULL;
6388 __kmp_reap_team(team);
6389 }
6390
6391 __kmp_reap_task_teams();
6392
6393#if KMP_OS_UNIX
6394 // Threads that are not reaped should not access any resources since they
6395 // are going to be deallocated soon, so the shutdown sequence should wait
6396 // until all threads either exit the final spin-waiting loop or begin
6397 // sleeping after the given blocktime.
6398 for (i = 0; i < __kmp_threads_capacity; i++) {
6399 kmp_info_t *thr = __kmp_threads[i];
6400 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6401 KMP_CPU_PAUSE();
6402 }
6403#endif
6404
6405 for (i = 0; i < __kmp_threads_capacity; ++i) {
6406 // TBD: Add some checking...
6407 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6408 }
6409
6410 /* Make sure all threadprivate destructors get run by joining with all
6411 worker threads before resetting this flag */
6412 TCW_SYNC_4(__kmp_init_common, FALSE);
6413
6414 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6415 KMP_MB();
6416
6417#if KMP_USE_MONITOR
6418 // See note above: One of the possible fixes for CQ138434 / CQ140126
6419 //
6420 // FIXME: push both code fragments down and CSE them?
6421 // push them into __kmp_cleanup() ?
6422 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6423 if (TCR_4(__kmp_init_monitor)) {
6424 __kmp_reap_monitor(&__kmp_monitor);
6425 TCW_4(__kmp_init_monitor, 0);
6426 }
6427 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6428 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6429#endif
6430 } /* else !__kmp_global.t_active */
6431 TCW_4(__kmp_init_gtid, FALSE);
6432 KMP_MB(); /* Flush all pending memory write invalidates. */
6433
6434 __kmp_cleanup();
6435#if OMPT_SUPPORT
6436 ompt_fini();
6437#endif
6438}
6439
6440void __kmp_internal_end_library(int gtid_req) {
6441 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6442 /* this shouldn't be a race condition because __kmp_internal_end() is the
6443 only place to clear __kmp_serial_init */
6444 /* we'll check this later too, after we get the lock */
6445 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6446 // redundant, because the next check will work in any case.
6447 if (__kmp_global.g.g_abort) {
6448 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6449 /* TODO abort? */
6450 return;
6451 }
6452 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6453 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6454 return;
6455 }
6456
6457 // If hidden helper team has been initialized, we need to deinit it
6458 if (TCR_4(__kmp_init_hidden_helper) &&
6459 !TCR_4(__kmp_hidden_helper_team_done)) {
6460 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6461 // First release the main thread to let it continue its work
6462 __kmp_hidden_helper_main_thread_release();
6463 // Wait until the hidden helper team has been destroyed
6464 __kmp_hidden_helper_threads_deinitz_wait();
6465 }
6466
6467 KMP_MB(); /* Flush all pending memory write invalidates. */
6468 /* find out who we are and what we should do */
6469 {
6470 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6471 KA_TRACE(
6472 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6473 if (gtid == KMP_GTID_SHUTDOWN) {
6474 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6475 "already shutdown\n"));
6476 return;
6477 } else if (gtid == KMP_GTID_MONITOR) {
6478 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6479 "registered, or system shutdown\n"));
6480 return;
6481 } else if (gtid == KMP_GTID_DNE) {
6482 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6483 "shutdown\n"));
6484 /* we don't know who we are, but we may still shutdown the library */
6485 } else if (KMP_UBER_GTID(gtid)) {
6486 /* unregister ourselves as an uber thread. gtid is no longer valid */
6487 if (__kmp_root[gtid]->r.r_active) {
6488 __kmp_global.g.g_abort = -1;
6489 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6490 __kmp_unregister_library();
6491 KA_TRACE(10,
6492 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6493 gtid));
6494 return;
6495 } else {
6496 __kmp_itthash_clean(__kmp_threads[gtid]);
6497 KA_TRACE(
6498 10,
6499 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6500 __kmp_unregister_root_current_thread(gtid);
6501 }
6502 } else {
6503/* worker threads may call this function through the atexit handler, if they
6504 * call exit() */
6505/* For now, skip the usual subsequent processing and just dump the debug buffer.
6506 TODO: do a thorough shutdown instead */
6507#ifdef DUMP_DEBUG_ON_EXIT
6508 if (__kmp_debug_buf)
6509 __kmp_dump_debug_buffer();
6510#endif
6511 // The unregister-library call was added here for the shared-memory (shm)
6512 // path on Linux; without it, lots of files would be left behind in /dev/shm.
6513 // Clean up the shared memory file before exiting.
6514 __kmp_unregister_library();
6515 return;
6516 }
6517 }
6518 /* synchronize the termination process */
6519 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6520
6521 /* have we already finished */
6522 if (__kmp_global.g.g_abort) {
6523 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6524 /* TODO abort? */
6525 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6526 return;
6527 }
6528 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6529 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6530 return;
6531 }
6532
6533 /* We need this lock to enforce mutual exclusion between this reading of
6534 __kmp_threads_capacity and the writing by __kmp_register_root.
6535 Alternatively, we can use a counter of roots that is atomically updated by
6536 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6537 __kmp_internal_end_*. */
6538 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6539
6540 /* now we can safely conduct the actual termination */
6541 __kmp_internal_end();
6542
6543 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6544 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6545
6546 KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6547
6548#ifdef DUMP_DEBUG_ON_EXIT
6549 if (__kmp_debug_buf)
6550 __kmp_dump_debug_buffer();
6551#endif
6552
6553#if KMP_OS_WINDOWS
6554 __kmp_close_console();
6555#endif
6556
6557 __kmp_fini_allocator();
6558
6559} // __kmp_internal_end_library
6560
6561void __kmp_internal_end_thread(int gtid_req) {
6562 int i;
6563
6564 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6565 /* this shouldn't be a race condition because __kmp_internal_end() is the
6566 * only place to clear __kmp_serial_init */
6567 /* we'll check this later too, after we get the lock */
6568 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6569 // redundant, because the next check will work in any case.
6570 if (__kmp_global.g.g_abort) {
6571 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6572 /* TODO abort? */
6573 return;
6574 }
6575 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6576 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6577 return;
6578 }
6579
6580 // If hidden helper team has been initialized, we need to deinit it
6581 if (TCR_4(__kmp_init_hidden_helper) &&
6582 !TCR_4(__kmp_hidden_helper_team_done)) {
6583 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6584 // First release the main thread to let it continue its work
6585 __kmp_hidden_helper_main_thread_release();
6586 // Wait until the hidden helper team has been destroyed
6587 __kmp_hidden_helper_threads_deinitz_wait();
6588 }
6589
6590 KMP_MB(); /* Flush all pending memory write invalidates. */
6591
6592 /* find out who we are and what we should do */
6593 {
6594 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6595 KA_TRACE(10,
6596 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6597 if (gtid == KMP_GTID_SHUTDOWN) {
6598 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6599 "already shutdown\n"));
6600 return;
6601 } else if (gtid == KMP_GTID_MONITOR) {
6602 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6603 "registered, or system shutdown\n"));
6604 return;
6605 } else if (gtid == KMP_GTID_DNE) {
6606 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6607 "shutdown\n"));
6608 return;
6609 /* we don't know who we are */
6610 } else if (KMP_UBER_GTID(gtid)) {
6611 /* unregister ourselves as an uber thread. gtid is no longer valid */
6612 if (__kmp_root[gtid]->r.r_active) {
6613 __kmp_global.g.g_abort = -1;
6614 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6615 KA_TRACE(10,
6616 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6617 gtid));
6618 return;
6619 } else {
6620 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6621 gtid));
6622 __kmp_unregister_root_current_thread(gtid);
6623 }
6624 } else {
6625 /* just a worker thread, let's leave */
6626 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6627
6628 if (gtid >= 0) {
6629 __kmp_threads[gtid]->th.th_task_team = NULL;
6630 }
6631
6632 KA_TRACE(10,
6633 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6634 gtid));
6635 return;
6636 }
6637 }
6638#if KMP_DYNAMIC_LIB
6639 if (__kmp_pause_status != kmp_hard_paused)
6640 // AC: let's not shut down the dynamic library at the exit of an uber thread,
6641 // because it is better to shut down later, in the library destructor.
6642 {
6643 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6644 return;
6645 }
6646#endif
6647 /* synchronize the termination process */
6648 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6649
6650 /* have we already finished */
6651 if (__kmp_global.g.g_abort) {
6652 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6653 /* TODO abort? */
6654 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6655 return;
6656 }
6657 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6658 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6659 return;
6660 }
6661
6662 /* We need this lock to enforce mutual exclusion between this reading of
6663 __kmp_threads_capacity and the writing by __kmp_register_root.
6664 Alternatively, we can use a counter of roots that is atomically updated by
6665 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6666 __kmp_internal_end_*. */
6667
6668 /* should we finish the run-time? are all siblings done? */
6669 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6670
6671 for (i = 0; i < __kmp_threads_capacity; ++i) {
6672 if (KMP_UBER_GTID(i)) {
6673 KA_TRACE(
6674 10,
6675 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6676 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6677 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6678 return;
6679 }
6680 }
6681
6682 /* now we can safely conduct the actual termination */
6683
6684 __kmp_internal_end();
6685
6686 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6687 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6688
6689 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6690
6691#ifdef DUMP_DEBUG_ON_EXIT
6692 if (__kmp_debug_buf)
6693 __kmp_dump_debug_buffer();
6694#endif
6695} // __kmp_internal_end_thread
6696
6697// -----------------------------------------------------------------------------
6698// Library registration stuff.
6699
6700static long __kmp_registration_flag = 0;
6701// Random value used to indicate library initialization.
6702static char *__kmp_registration_str = NULL;
6703// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6704
6705static inline char *__kmp_reg_status_name() {
6706/* On RHEL 3u5 if linked statically, getpid() returns different values in
6707 each thread. If registration and unregistration go in different threads
6708 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6709 env var cannot be found, because the name will contain a different pid. */
6710// macOS* complains that the name is too long when getuid() is appended.
6711#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6712 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6713 (int)getuid());
6714#else
6715 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6716#endif
6717} // __kmp_reg_status_name
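/* For illustration only: with a hypothetical pid of 12345 and uid of 1000, the
   names produced above would look like
       __KMP_REGISTERED_LIB_12345_1000   (non-macOS Unix*, dynamic library)
       __KMP_REGISTERED_LIB_12345        (all other configurations)          */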
6718
6719#if defined(KMP_USE_SHM)
6720// If /dev/shm is not accessible, we will create a temporary file under /tmp.
6721char *temp_reg_status_file_name = nullptr;
6722#endif
6723
6724void __kmp_register_library_startup(void) {
6725
6726 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6727 int done = 0;
6728 union {
6729 double dtime;
6730 long ltime;
6731 } time;
6732#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6733 __kmp_initialize_system_tick();
6734#endif
6735 __kmp_read_system_time(&time.dtime);
6736 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6737 __kmp_registration_str =
6738 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6739 __kmp_registration_flag, KMP_LIBRARY_FILE);
6740
6741 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6742 __kmp_registration_str));
6743
6744 while (!done) {
6745
6746 char *value = NULL; // Actual value of the environment variable.
6747
6748#if defined(KMP_USE_SHM)
6749 char *shm_name = __kmp_str_format("/%s", name);
6750 int shm_preexist = 0;
6751 char *data1;
6752 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6753 if ((fd1 == -1) && (errno == EEXIST)) {
6754 // file didn't open because it already exists.
6755 // try opening existing file
6756 fd1 = shm_open(shm_name, O_RDWR, 0666);
6757 if (fd1 == -1) { // file didn't open
6758 // error out here
6759 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6760 __kmp_msg_null);
6761 } else {
6762 // able to open existing file
6763 shm_preexist = 1;
6764 }
6765 } else if (fd1 == -1) {
6766 // SHM didn't open; the failure was due to an error other than EEXIST. Try to
6767 // create a temp file under /tmp.
6768 // TODO: /tmp might not always be the temporary directory. For now we will
6769 // not consider TMPDIR. If /tmp is not accessible, we simply error out.
6770 char *temp_file_name = __kmp_str_format("/tmp/%sXXXXXX", name);
6771 fd1 = mkstemp(temp_file_name);
6772 if (fd1 == -1) {
6773 // error out here.
6774 __kmp_fatal(KMP_MSG(FunctionError, "Can't open TEMP"), KMP_ERR(errno),
6775 __kmp_msg_null);
6776 }
6777 temp_reg_status_file_name = temp_file_name;
6778 }
6779 if (shm_preexist == 0) {
6780 // we created the SHM; now set its size
6781 if (ftruncate(fd1, SHM_SIZE) == -1) {
6782 // error occurred setting the size;
6783 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6784 KMP_ERR(errno), __kmp_msg_null);
6785 }
6786 }
6787 data1 =
6788 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6789 if (data1 == MAP_FAILED) {
6790 // failed to map shared memory
6791 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6792 __kmp_msg_null);
6793 }
6794 if (shm_preexist == 0) { // set data to SHM, set value
6795 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6796 }
6797 // Read value from either what we just wrote or existing file.
6798 value = __kmp_str_format("%s", data1); // read value from SHM
6799 munmap(data1, SHM_SIZE);
6800 close(fd1);
6801#else // Windows and unix with static library
6802 // Set the environment variable, but do not overwrite it if it already exists.
6803 __kmp_env_set(name, __kmp_registration_str, 0);
6804 // read value to see if it got set
6805 value = __kmp_env_get(name);
6806#endif
6807
6808 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6809 done = 1; // Ok, environment variable set successfully, exit the loop.
6810 } else {
6811 // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6812 // Check whether it is alive or dead.
6813 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6814 char *tail = value;
6815 char *flag_addr_str = NULL;
6816 char *flag_val_str = NULL;
6817 char const *file_name = NULL;
6818 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6819 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6820 file_name = tail;
6821 if (tail != NULL) {
6822 unsigned long *flag_addr = 0;
6823 unsigned long flag_val = 0;
6824 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6825 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6826 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6827 // First, check whether environment-encoded address is mapped into
6828 // addr space.
6829 // If so, dereference it to see if it still has the right value.
6830 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6831 neighbor = 1;
6832 } else {
6833 // If not, then we know the other copy of the library is no longer
6834 // running.
6835 neighbor = 2;
6836 }
6837 }
6838 }
6839 switch (neighbor) {
6840 case 0: // Cannot parse environment variable -- neighbor status unknown.
6841 // Assume it is the incompatible format of a future version of the
6842 // library. Assume the other library is alive.
6843 // WARN( ... ); // TODO: Issue a warning.
6844 file_name = "unknown library";
6845 KMP_FALLTHROUGH();
6846 // Attention! Falling through to the next case. That's intentional.
6847 case 1: { // Neighbor is alive.
6848 // Check whether it is allowed.
6849 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6850 if (!__kmp_str_match_true(duplicate_ok)) {
6851 // That's not allowed. Issue fatal error.
6852 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6853 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6854 }
6855 KMP_INTERNAL_FREE(duplicate_ok);
6856 __kmp_duplicate_library_ok = 1;
6857 done = 1; // Exit the loop.
6858 } break;
6859 case 2: { // Neighbor is dead.
6860
6861#if defined(KMP_USE_SHM)
6862 // close shared memory.
6863 shm_unlink(shm_name); // this removes the file in /dev/shm
6864#else
6865 // Clear the variable and try to register library again.
6866 __kmp_env_unset(name);
6867#endif
6868 } break;
6869 default: {
6870 KMP_DEBUG_ASSERT(0);
6871 } break;
6872 }
6873 }
6874 KMP_INTERNAL_FREE((void *)value);
6875#if defined(KMP_USE_SHM)
6876 KMP_INTERNAL_FREE((void *)shm_name);
6877#endif
6878 } // while
6879 KMP_INTERNAL_FREE((void *)name);
6880
6881} // func __kmp_register_library_startup
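/* For illustration only -- a condensed sketch (not used by the runtime) of the
   liveness check performed in the loop above. The published value has the form
   "<flag address>-<flag value in hex>-<library file>", and another copy of the
   library is considered alive only if the encoded address is still mapped in
   this process and still holds the encoded value:

     // returns 1 = alive, 2 = dead, 0 = unparseable (treated as alive)
     static int check_neighbor_sketch(const char *value) {
       void *addr = NULL;
       unsigned long flag_val = 0;
       char file[256] = "";
       if (sscanf(value, "%p-%lx-%255s", &addr, &flag_val, file) != 3)
         return 0; // unknown/incompatible format
       if (addr == NULL || flag_val == 0 || file[0] == '\0')
         return 0;
       if (__kmp_is_address_mapped(addr) && *(unsigned long *)addr == flag_val)
         return 1; // the other copy is still running
       return 2;   // stale registration left behind by a dead process
     }
*/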
6882
6883void __kmp_unregister_library(void) {
6884
6885 char *name = __kmp_reg_status_name();
6886 char *value = NULL;
6887
6888#if defined(KMP_USE_SHM)
6889 bool use_shm = true;
6890 char *shm_name = __kmp_str_format("/%s", name);
6891 int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6892 if (fd1 == -1) {
6893 // File did not open. Try the temporary file.
6894 use_shm = false;
6895 KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6896 fd1 = open(temp_reg_status_file_name, O_RDONLY);
6897 if (fd1 == -1) {
6898 // give up now.
6899 return;
6900 }
6901 }
6902 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6903 if (data1 != MAP_FAILED) {
6904 value = __kmp_str_format("%s", data1); // read value from SHM
6905 munmap(data1, SHM_SIZE);
6906 }
6907 close(fd1);
6908#else
6909 value = __kmp_env_get(name);
6910#endif
6911
6912 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6913 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6914 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6915// Ok, this is our variable. Delete it.
6916#if defined(KMP_USE_SHM)
6917 if (use_shm) {
6918 shm_unlink(shm_name); // this removes the file in /dev/shm
6919 } else {
6920 KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6921 unlink(temp_reg_status_file_name); // this removes the temp file
6922 }
6923#else
6924 __kmp_env_unset(name);
6925#endif
6926 }
6927
6928#if defined(KMP_USE_SHM)
6929 KMP_INTERNAL_FREE(shm_name);
6930 if (!use_shm) {
6931 KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6932 KMP_INTERNAL_FREE(temp_reg_status_file_name);
6933 }
6934#endif
6935
6936 KMP_INTERNAL_FREE(__kmp_registration_str);
6937 KMP_INTERNAL_FREE(value);
6938 KMP_INTERNAL_FREE(name);
6939
6940 __kmp_registration_flag = 0;
6941 __kmp_registration_str = NULL;
6942
6943} // __kmp_unregister_library
6944
6945// End of Library registration stuff.
6946// -----------------------------------------------------------------------------
6947
6948#if KMP_MIC_SUPPORTED
6949
6950static void __kmp_check_mic_type() {
6951 kmp_cpuid_t cpuid_state = {0};
6952 kmp_cpuid_t *cs_p = &cpuid_state;
6953 __kmp_x86_cpuid(1, 0, cs_p);
6954 // We don't support mic1 at the moment
6955 if ((cs_p->eax & 0xff0) == 0xB10) {
6956 __kmp_mic_type = mic2;
6957 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6958 __kmp_mic_type = mic3;
6959 } else {
6960 __kmp_mic_type = non_mic;
6961 }
6962}
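/* For reference: the masks above select the family/model fields of CPUID
   leaf 1 EAX. 0xB10 corresponds to family 0x0B, model 1 (Knights Corner,
   "mic2" here), and 0x50670 to family 6, extended model 5, model 7 --
   display model 0x57, i.e. Knights Landing ("mic3"). */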
6963
6964#endif /* KMP_MIC_SUPPORTED */
6965
6966#if KMP_HAVE_UMWAIT
6967static void __kmp_user_level_mwait_init() {
6968 struct kmp_cpuid buf;
6969 __kmp_x86_cpuid(7, 0, &buf);
6970 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
6971 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6972 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6973 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6974 __kmp_umwait_enabled));
6975}
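// For reference: CPUID leaf 7 (sub-leaf 0), ECX bit 5 is the WAITPKG feature
// flag, which advertises the UMONITOR/UMWAIT/TPAUSE instructions gated by the
// __kmp_umwait_enabled and __kmp_tpause_enabled settings above.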
6976#elif KMP_HAVE_MWAIT
6977#ifndef AT_INTELPHIUSERMWAIT
6978// Spurious, non-existent value that should always fail to return anything.
6979// Will be replaced with the correct value when we know that.
6980#define AT_INTELPHIUSERMWAIT 10000
6981#endif
6982// getauxval() function is available in RHEL7 and SLES12. If a system with an
6983// earlier OS is used to build the RTL, we'll use the following internal
6984// function when the entry is not found.
6985unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6986unsigned long getauxval(unsigned long) { return 0; }
6987
6988static void __kmp_user_level_mwait_init() {
6989 // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are available,
6990 // use them to determine whether user-level mwait is enabled. Otherwise, forcibly
6991 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6992 // KMP_USER_LEVEL_MWAIT was set to TRUE.
6993 if (__kmp_mic_type == mic3) {
6994 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6995 if ((res & 0x1) || __kmp_user_level_mwait) {
6996 __kmp_mwait_enabled = TRUE;
6997 if (__kmp_user_level_mwait) {
6998 KMP_INFORM(EnvMwaitWarn);
6999 }
7000 } else {
7001 __kmp_mwait_enabled = FALSE;
7002 }
7003 }
7004 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7005 "__kmp_mwait_enabled = %d\n",
7006 __kmp_mic_type, __kmp_mwait_enabled));
7007}
7008#endif /* KMP_HAVE_UMWAIT */
7009
7010static void __kmp_do_serial_initialize(void) {
7011 int i, gtid;
7012 size_t size;
7013
7014 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7015
7016 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7017 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7018 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7019 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7020 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7021
7022#if OMPT_SUPPORT
7023 ompt_pre_init();
7024#endif
7025#if OMPD_SUPPORT
7026 __kmp_env_dump();
7027 ompd_init();
7028#endif
7029
7030 __kmp_validate_locks();
7031
7032#if ENABLE_LIBOMPTARGET
7033 /* Initialize functions from libomptarget */
7034 __kmp_init_omptarget();
7035#endif
7036
7037 /* Initialize internal memory allocator */
7038 __kmp_init_allocator();
7039
7040 /* Register the library startup via an environment variable or via mapped
7041 shared memory file and check to see whether another copy of the library is
7042 already registered. Since a forked child process is often terminated, we
7043 postpone the registration until middle initialization in the child. */
7044 if (__kmp_need_register_serial)
7045 __kmp_register_library_startup();
7046
7047 /* TODO reinitialization of library */
7048 if (TCR_4(__kmp_global.g.g_done)) {
7049 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7050 }
7051
7052 __kmp_global.g.g_abort = 0;
7053 TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7054
7055/* initialize the locks */
7056#if KMP_USE_ADAPTIVE_LOCKS
7057#if KMP_DEBUG_ADAPTIVE_LOCKS
7058 __kmp_init_speculative_stats();
7059#endif
7060#endif
7061#if KMP_STATS_ENABLED
7062 __kmp_stats_init();
7063#endif
7064 __kmp_init_lock(&__kmp_global_lock);
7065 __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7066 __kmp_init_lock(&__kmp_debug_lock);
7067 __kmp_init_atomic_lock(&__kmp_atomic_lock);
7068 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7069 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7070 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7071 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7072 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7073 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7074 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7075 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7076 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7077 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7078 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7079 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7080 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7081 __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7082#if KMP_USE_MONITOR
7083 __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7084#endif
7085 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7086
7087 /* conduct initialization and initial setup of configuration */
7088
7089 __kmp_runtime_initialize();
7090
7091#if KMP_MIC_SUPPORTED
7092 __kmp_check_mic_type();
7093#endif
7094
7095// Some global variable initialization moved here from kmp_env_initialize()
7096#ifdef KMP_DEBUG
7097 kmp_diag = 0;
7098#endif
7099 __kmp_abort_delay = 0;
7100
7101 // From __kmp_init_dflt_team_nth()
7102 /* assume the entire machine will be used */
7103 __kmp_dflt_team_nth_ub = __kmp_xproc;
7104 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7105 __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7106 }
7107 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7108 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7109 }
7110 __kmp_max_nth = __kmp_sys_max_nth;
7111 __kmp_cg_max_nth = __kmp_sys_max_nth;
7112 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7113 if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7114 __kmp_teams_max_nth = __kmp_sys_max_nth;
7115 }
7116
7117 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7118 // part
7119 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7120#if KMP_USE_MONITOR
7121 __kmp_monitor_wakeups =
7122 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7123 __kmp_bt_intervals =
7124 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7125#endif
7126 // From "KMP_LIBRARY" part of __kmp_env_initialize()
7127 __kmp_library = library_throughput;
7128 // From KMP_SCHEDULE initialization
7129 __kmp_static = kmp_sch_static_balanced;
7130// AC: do not use analytical here, because it is non-monotonic
7131//__kmp_guided = kmp_sch_guided_iterative_chunked;
7132//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7133// need to repeat assignment
7134// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7135// bit control and barrier method control parts
7136#if KMP_FAST_REDUCTION_BARRIER
7137#define kmp_reduction_barrier_gather_bb ((int)1)
7138#define kmp_reduction_barrier_release_bb ((int)1)
7139#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7140#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7141#endif // KMP_FAST_REDUCTION_BARRIER
7142 for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7143 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7144 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7145 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7146 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7147#if KMP_FAST_REDUCTION_BARRIER
7148 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7149 // lin_64 ): hyper,1
7150 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7151 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7152 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7153 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7154 }
7155#endif // KMP_FAST_REDUCTION_BARRIER
7156 }
7157#if KMP_FAST_REDUCTION_BARRIER
7158#undef kmp_reduction_barrier_release_pat
7159#undef kmp_reduction_barrier_gather_pat
7160#undef kmp_reduction_barrier_release_bb
7161#undef kmp_reduction_barrier_gather_bb
7162#endif // KMP_FAST_REDUCTION_BARRIER
7163#if KMP_MIC_SUPPORTED
7164 if (__kmp_mic_type == mic2) { // KNC
7165 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7166 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7167 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7168 1; // forkjoin release
7169 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7170 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7171 }
7172#if KMP_FAST_REDUCTION_BARRIER
7173 if (__kmp_mic_type == mic2) { // KNC
7174 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7175 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7176 }
7177#endif // KMP_FAST_REDUCTION_BARRIER
7178#endif // KMP_MIC_SUPPORTED
7179
7180// From KMP_CHECKS initialization
7181#ifdef KMP_DEBUG
7182 __kmp_env_checks = TRUE; /* development versions have the extra checks */
7183#else
7184 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7185#endif
7186
7187 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7188 __kmp_foreign_tp = TRUE;
7189
7190 __kmp_global.g.g_dynamic = FALSE;
7191 __kmp_global.g.g_dynamic_mode = dynamic_default;
7192
7193 __kmp_init_nesting_mode();
7194
7195 __kmp_env_initialize(NULL);
7196
7197#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7198 __kmp_user_level_mwait_init();
7199#endif
7200// Print all messages in message catalog for testing purposes.
7201#ifdef KMP_DEBUG
7202 char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7203 if (__kmp_str_match_true(val)) {
7204 kmp_str_buf_t buffer;
7205 __kmp_str_buf_init(&buffer);
7206 __kmp_i18n_dump_catalog(&buffer);
7207 __kmp_printf("%s", buffer.str);
7208 __kmp_str_buf_free(&buffer);
7209 }
7210 __kmp_env_free(&val);
7211#endif
7212
7213 __kmp_threads_capacity =
7214 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7215 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7216 __kmp_tp_capacity = __kmp_default_tp_capacity(
7217 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7218
7219 // If the library is shut down properly, both pools must be NULL. Just in
7220 // case, set them to NULL -- some memory may leak, but subsequent code will
7221 // work even if pools are not freed.
7222 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7223 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7224 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7225 __kmp_thread_pool = NULL;
7226 __kmp_thread_pool_insert_pt = NULL;
7227 __kmp_team_pool = NULL;
7228
7229 /* Allocate all of the variable sized records */
7230 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7231 * expandable */
7232 /* Since allocation is cache-aligned, just add extra padding at the end */
7233 size =
7234 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7235 CACHE_LINE;
7236 __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7237 __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7238 sizeof(kmp_info_t *) * __kmp_threads_capacity);
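  /* Layout of the single cache-aligned allocation above (for illustration):

       __kmp_threads                        __kmp_root
       |                                    |
       v                                    v
       [ kmp_info_t* x capacity ][ kmp_root_t* x capacity ][ CACHE_LINE pad ]

     __kmp_root simply points into the tail of the same block rather than
     being a separate allocation. */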
7239
7240 /* init thread counts */
7241 KMP_DEBUG_ASSERT(__kmp_all_nth ==
7242 0); // Asserts fail if the library is reinitializing and
7243 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7244 __kmp_all_nth = 0;
7245 __kmp_nth = 0;
7246
7247 /* setup the uber master thread and hierarchy */
7248 gtid = __kmp_register_root(TRUE);
7249 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7250 KMP_ASSERT(KMP_UBER_GTID(gtid));
7251 KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7252
7253 KMP_MB(); /* Flush all pending memory write invalidates. */
7254
7255 __kmp_common_initialize();
7256
7257#if KMP_OS_UNIX
7258 /* invoke the child fork handler */
7259 __kmp_register_atfork();
7260#endif
7261
7262#if !KMP_DYNAMIC_LIB || \
7263 ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7264 {
7265 /* Invoke the exit handler when the program finishes, only for static
7266 library and macOS* dynamic. For other dynamic libraries, we already
7267 have _fini and DllMain. */
7268 int rc = atexit(__kmp_internal_end_atexit);
7269 if (rc != 0) {
7270 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7271 __kmp_msg_null);
7272 }
7273 }
7274#endif
7275
7276#if KMP_HANDLE_SIGNALS
7277#if KMP_OS_UNIX
7278 /* NOTE: make sure that this is called before the user installs their own
7279 signal handlers so that the user handlers are called first. this way they
7280 can return false, not call our handler, avoid terminating the library, and
7281 continue execution where they left off. */
7282 __kmp_install_signals(FALSE);
7283#endif /* KMP_OS_UNIX */
7284#if KMP_OS_WINDOWS
7285 __kmp_install_signals(TRUE);
7286#endif /* KMP_OS_WINDOWS */
7287#endif
7288
7289 /* we have finished the serial initialization */
7290 __kmp_init_counter++;
7291
7292 __kmp_init_serial = TRUE;
7293
7294 if (__kmp_version) {
7295 __kmp_print_version_1();
7296 }
7297
7298 if (__kmp_settings) {
7299 __kmp_env_print();
7300 }
7301
7302 if (__kmp_display_env || __kmp_display_env_verbose) {
7303 __kmp_env_print_2();
7304 }
7305
7306#if OMPT_SUPPORT
7307 ompt_post_init();
7308#endif
7309
7310 KMP_MB();
7311
7312 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7313}
7314
7315void __kmp_serial_initialize(void) {
7316 if (__kmp_init_serial) {
7317 return;
7318 }
7319 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7320 if (__kmp_init_serial) {
7321 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7322 return;
7323 }
7324 __kmp_do_serial_initialize();
7325 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7326}
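/* __kmp_serial_initialize, __kmp_middle_initialize and __kmp_parallel_initialize
   all follow the same double-checked pattern: an unsynchronized fast-path test
   of the init flag, then a re-test under __kmp_initz_lock before doing the real
   work. A minimal sketch of the shape (generic names, not the runtime's):

     if (initialized)          // fast path, no lock taken
       return;
     acquire(&init_lock);
     if (!initialized)         // re-check now that the lock is held
       do_initialize();        // sets 'initialized' before returning
     release(&init_lock);
*/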
7327
7328static void __kmp_do_middle_initialize(void) {
7329 int i, j;
7330 int prev_dflt_team_nth;
7331
7332 if (!__kmp_init_serial) {
7333 __kmp_do_serial_initialize();
7334 }
7335
7336 KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7337
7338 if (UNLIKELY(!__kmp_need_register_serial)) {
7339 // We are in a forked child process. The registration was skipped during
7340 // serial initialization in __kmp_atfork_child handler. Do it here.
7341 __kmp_register_library_startup();
7342 }
7343
7344 // Save the previous value for the __kmp_dflt_team_nth so that
7345 // we can avoid some reinitialization if it hasn't changed.
7346 prev_dflt_team_nth = __kmp_dflt_team_nth;
7347
7348#if KMP_AFFINITY_SUPPORTED
7349 // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7350 // number of cores on the machine.
7351 __kmp_affinity_initialize(__kmp_affinity);
7352
7353#endif /* KMP_AFFINITY_SUPPORTED */
7354
7355 KMP_ASSERT(__kmp_xproc > 0);
7356 if (__kmp_avail_proc == 0) {
7357 __kmp_avail_proc = __kmp_xproc;
7358 }
7359
7360 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7361 // correct them now
7362 j = 0;
7363 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7364 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7365 __kmp_avail_proc;
7366 j++;
7367 }
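  // For illustration: with OMP_NUM_THREADS=",,2,3" and, say, __kmp_avail_proc
  // equal to 8, the loop above rewrites the list to "8,8,2,3" -- empty leading
  // levels default to the number of available procs, which also becomes
  // __kmp_dflt_team_nth / __kmp_dflt_team_nth_ub.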
7368
7369 if (__kmp_dflt_team_nth == 0) {
7370#ifdef KMP_DFLT_NTH_CORES
7371 // Default #threads = #cores
7372 __kmp_dflt_team_nth = __kmp_ncores;
7373 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7374 "__kmp_ncores (%d)\n",
7375 __kmp_dflt_team_nth));
7376#else
7377 // Default #threads = #available OS procs
7378 __kmp_dflt_team_nth = __kmp_avail_proc;
7379 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7380 "__kmp_avail_proc(%d)\n",
7381 __kmp_dflt_team_nth));
7382#endif /* KMP_DFLT_NTH_CORES */
7383 }
7384
7385 if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7386 __kmp_dflt_team_nth = KMP_MIN_NTH;
7387 }
7388 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7389 __kmp_dflt_team_nth = __kmp_sys_max_nth;
7390 }
7391
7392 if (__kmp_nesting_mode > 0)
7393 __kmp_set_nesting_mode_threads();
7394
7395 // There's no harm in continuing if the following check fails,
7396 // but it indicates an error in the previous logic.
7397 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7398
7399 if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7400 // Run through the __kmp_threads array and set the num threads icv for each
7401 // root thread that is currently registered with the RTL (which has not
7402 // already explicitly set its nthreads-var with a call to
7403 // omp_set_num_threads()).
7404 for (i = 0; i < __kmp_threads_capacity; i++) {
7405 kmp_info_t *thread = __kmp_threads[i];
7406 if (thread == NULL)
7407 continue;
7408 if (thread->th.th_current_task->td_icvs.nproc != 0)
7409 continue;
7410
7411 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7412 }
7413 }
7414 KA_TRACE(
7415 20,
7416 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7417 __kmp_dflt_team_nth));
7418
7419#ifdef KMP_ADJUST_BLOCKTIME
7420 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7421 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7422 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7423 if (__kmp_nth > __kmp_avail_proc) {
7424 __kmp_zero_bt = TRUE;
7425 }
7426 }
7427#endif /* KMP_ADJUST_BLOCKTIME */
7428
7429 /* we have finished middle initialization */
7430 TCW_SYNC_4(__kmp_init_middle, TRUE);
7431
7432 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7433}
7434
7435void __kmp_middle_initialize(void) {
7436 if (__kmp_init_middle) {
7437 return;
7438 }
7439 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7440 if (__kmp_init_middle) {
7441 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7442 return;
7443 }
7444 __kmp_do_middle_initialize();
7445 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7446}
7447
7448void __kmp_parallel_initialize(void) {
7449 int gtid = __kmp_entry_gtid(); // this might be a new root
7450
7451 /* synchronize parallel initialization (for sibling) */
7452 if (TCR_4(__kmp_init_parallel))
7453 return;
7454 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7455 if (TCR_4(__kmp_init_parallel)) {
7456 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7457 return;
7458 }
7459
7460 /* TODO reinitialization after we have already shut down */
7461 if (TCR_4(__kmp_global.g.g_done)) {
7462 KA_TRACE(
7463 10,
7464 ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7465 __kmp_infinite_loop();
7466 }
7467
7468 /* jc: The lock __kmp_initz_lock is already held, so calling
7469 __kmp_serial_initialize would cause a deadlock. So we call
7470 __kmp_do_serial_initialize directly. */
7471 if (!__kmp_init_middle) {
7472 __kmp_do_middle_initialize();
7473 }
7474 __kmp_assign_root_init_mask();
7475 __kmp_resume_if_hard_paused();
7476
7477 /* begin initialization */
7478 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7479 KMP_ASSERT(KMP_UBER_GTID(gtid));
7480
7481#if KMP_ARCH_X86 || KMP_ARCH_X86_64
7482 // Save the FP control regs.
7483 // Worker threads will set theirs to these values at thread startup.
7484 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7485 __kmp_store_mxcsr(&__kmp_init_mxcsr);
7486 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7487#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7488
7489#if KMP_OS_UNIX
7490#if KMP_HANDLE_SIGNALS
7491 /* must be after __kmp_serial_initialize */
7492 __kmp_install_signals(TRUE);
7493#endif
7494#endif
7495
7496 __kmp_suspend_initialize();
7497
7498#if defined(USE_LOAD_BALANCE)
7499 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7500 __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7501 }
7502#else
7503 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7504 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7505 }
7506#endif
7507
7508 if (__kmp_version) {
7509 __kmp_print_version_2();
7510 }
7511
7512 /* we have finished parallel initialization */
7513 TCW_SYNC_4(__kmp_init_parallel, TRUE);
7514
7515 KMP_MB();
7516 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7517
7518 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7519}
7520
7521void __kmp_hidden_helper_initialize() {
7522 if (TCR_4(__kmp_init_hidden_helper))
7523 return;
7524
7525 // __kmp_parallel_initialize is required before we initialize hidden helper
7526 if (!TCR_4(__kmp_init_parallel))
7527 __kmp_parallel_initialize();
7528
7529 // Double check. Note that this double check should not be placed before
7530 // __kmp_parallel_initialize as it will cause a deadlock.
7531 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7532 if (TCR_4(__kmp_init_hidden_helper)) {
7533 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7534 return;
7535 }
7536
7537#if KMP_AFFINITY_SUPPORTED
7538 // Initialize hidden helper affinity settings.
7539 // The above __kmp_parallel_initialize() will initialize
7540 // regular affinity (and topology) if not already done.
7541 if (!__kmp_hh_affinity.flags.initialized)
7542 __kmp_affinity_initialize(__kmp_hh_affinity);
7543#endif
7544
7545 // Set the count of hidden helper tasks to be executed to zero
7546 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7547
7548 // Set the global variable indicating that we're initializing hidden helper
7549 // team/threads
7550 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7551
7552 // Platform independent initialization
7553 __kmp_do_initialize_hidden_helper_threads();
7554
7555 // Wait here for the finish of initialization of hidden helper teams
7556 __kmp_hidden_helper_threads_initz_wait();
7557
7558 // We have finished hidden helper initialization
7559 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7560
7561 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7562}
7563
7564/* ------------------------------------------------------------------------ */
7565
7566void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7567 kmp_team_t *team) {
7568 kmp_disp_t *dispatch;
7569
7570 KMP_MB();
7571
7572 /* none of the threads have encountered any constructs, yet. */
7573 this_thr->th.th_local.this_construct = 0;
7574#if KMP_CACHE_MANAGE
7575 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7576#endif /* KMP_CACHE_MANAGE */
7577 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7578 KMP_DEBUG_ASSERT(dispatch);
7579 KMP_DEBUG_ASSERT(team->t.t_dispatch);
7580 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7581 // this_thr->th.th_info.ds.ds_tid ] );
7582
7583 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7584 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7585 if (__kmp_env_consistency_check)
7586 __kmp_push_parallel(gtid, team->t.t_ident);
7587
7588 KMP_MB(); /* Flush all pending memory write invalidates. */
7589}
7590
7591void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7592 kmp_team_t *team) {
7593 if (__kmp_env_consistency_check)
7594 __kmp_pop_parallel(gtid, team->t.t_ident);
7595
7596 __kmp_finish_implicit_task(this_thr);
7597}
7598
7599int __kmp_invoke_task_func(int gtid) {
7600 int rc;
7601 int tid = __kmp_tid_from_gtid(gtid);
7602 kmp_info_t *this_thr = __kmp_threads[gtid];
7603 kmp_team_t *team = this_thr->th.th_team;
7604
7605 __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7606#if USE_ITT_BUILD
7607 if (__itt_stack_caller_create_ptr) {
7608 // inform ittnotify about entering user's code
7609 if (team->t.t_stack_id != NULL) {
7610 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7611 } else {
7612 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7613 __kmp_itt_stack_callee_enter(
7614 (__itt_caller)team->t.t_parent->t.t_stack_id);
7615 }
7616 }
7617#endif /* USE_ITT_BUILD */
7618#if INCLUDE_SSC_MARKS
7619 SSC_MARK_INVOKING();
7620#endif
7621
7622#if OMPT_SUPPORT
7623 void *dummy;
7624 void **exit_frame_p;
7625 ompt_data_t *my_task_data;
7626 ompt_data_t *my_parallel_data;
7627 int ompt_team_size;
7628
7629 if (ompt_enabled.enabled) {
7630 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7631 .ompt_task_info.frame.exit_frame.ptr);
7632 } else {
7633 exit_frame_p = &dummy;
7634 }
7635
7636 my_task_data =
7637 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7638 my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7639 if (ompt_enabled.ompt_callback_implicit_task) {
7640 ompt_team_size = team->t.t_nproc;
7641 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7642 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7643 __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7644 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7645 }
7646#endif
7647
7648#if KMP_STATS_ENABLED
7649 stats_state_e previous_state = KMP_GET_THREAD_STATE();
7650 if (previous_state == stats_state_e::TEAMS_REGION) {
7651 KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7652 } else {
7653 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7654 }
7655 KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7656#endif
7657
7658 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7659 tid, (int)team->t.t_argc, (void **)team->t.t_argv
7660#if OMPT_SUPPORT
7661 ,
7662 exit_frame_p
7663#endif
7664 );
7665#if OMPT_SUPPORT
7666 *exit_frame_p = NULL;
7667 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7668#endif
7669
7670#if KMP_STATS_ENABLED
7671 if (previous_state == stats_state_e::TEAMS_REGION) {
7672 KMP_SET_THREAD_STATE(previous_state);
7673 }
7674 KMP_POP_PARTITIONED_TIMER();
7675#endif
7676
7677#if USE_ITT_BUILD
7678 if (__itt_stack_caller_create_ptr) {
7679 // inform ittnotify about leaving user's code
7680 if (team->t.t_stack_id != NULL) {
7681 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7682 } else {
7683 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7684 __kmp_itt_stack_callee_leave(
7685 (__itt_caller)team->t.t_parent->t.t_stack_id);
7686 }
7687 }
7688#endif /* USE_ITT_BUILD */
7689 __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7690
7691 return rc;
7692}
7693
7694void __kmp_teams_master(int gtid) {
7695 // This routine is called by all primary threads in teams construct
7696 kmp_info_t *thr = __kmp_threads[gtid];
7697 kmp_team_t *team = thr->th.th_team;
7698 ident_t *loc = team->t.t_ident;
7699 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7700 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7701 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7702 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7703 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7704
7705 // This thread is a new CG root. Set up the proper variables.
7706 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7707 tmp->cg_root = thr; // Make thr the CG root
7708 // Init to thread limit stored when league primary threads were forked
7709 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7710 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7711 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7712 " cg_nthreads to 1\n",
7713 thr, tmp));
7714 tmp->up = thr->th.th_cg_roots;
7715 thr->th.th_cg_roots = tmp;
7716
7717// Launch the league of teams now, but do not let workers execute
7718// (they wait on the fork barrier until the next parallel region)
7719#if INCLUDE_SSC_MARKS
7720 SSC_MARK_FORKING();
7721#endif
7722 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7723 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7724 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7725#if INCLUDE_SSC_MARKS
7726 SSC_MARK_JOINING();
7727#endif
7728 // If the team size was reduced from the limit, set it to the new size
7729 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7730 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7731 // AC: last parameter "1" eliminates the join barrier, which won't work because
7732 // worker threads are in a fork barrier waiting for more parallel regions
7733 __kmp_join_call(loc, gtid
7734#if OMPT_SUPPORT
7735 ,
7736 fork_context_intel
7737#endif
7738 ,
7739 1);
7740}
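/* For orientation, the code path above is what each league primary thread runs
   for a user-level teams construct such as (illustrative user code, not part
   of the runtime):

     #pragma omp teams num_teams(4) thread_limit(8)
     {
       #pragma omp parallel
       { ... }
     }

   __kmp_teams_master runs on each team's primary thread: it installs a new
   contention-group root (so thread_limit applies per team) and then forks the
   wrapped teams body (th_teams_microtask) via __kmp_fork_call. */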
7741
7742int __kmp_invoke_teams_master(int gtid) {
7743 kmp_info_t *this_thr = __kmp_threads[gtid];
7744 kmp_team_t *team = this_thr->th.th_team;
7745#if KMP_DEBUG
7746 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7747 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7748 (void *)__kmp_teams_master);
7749#endif
7750 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7751#if OMPT_SUPPORT
7752 int tid = __kmp_tid_from_gtid(gtid);
7753 ompt_data_t *task_data =
7754 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7755 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7756 if (ompt_enabled.ompt_callback_implicit_task) {
7757 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7758 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7759 ompt_task_initial);
7760 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7761 }
7762#endif
7763 __kmp_teams_master(gtid);
7764#if OMPT_SUPPORT
7765 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7766#endif
7767 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7768 return 1;
7769}
7770
7771/* This sets the requested number of threads for the next parallel region
7772 encountered by this team. Since this should be enclosed in the forkjoin
7773 critical section, it should avoid race conditions with asymmetrical nested
7774 parallelism. */
7775
7776void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7777 kmp_info_t *thr = __kmp_threads[gtid];
7778
7779 if (num_threads > 0)
7780 thr->th.th_set_nproc = num_threads;
7781}
7782
7783static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7784 int num_threads) {
7785 KMP_DEBUG_ASSERT(thr);
7786 // Remember the number of threads for inner parallel regions
7787 if (!TCR_4(__kmp_init_middle))
7788 __kmp_middle_initialize(); // get internal globals calculated
7789 __kmp_assign_root_init_mask();
7790 KMP_DEBUG_ASSERT(__kmp_avail_proc);
7791 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7792
7793 if (num_threads == 0) {
7794 if (__kmp_teams_thread_limit > 0) {
7795 num_threads = __kmp_teams_thread_limit;
7796 } else {
7797 num_threads = __kmp_avail_proc / num_teams;
7798 }
7799 // adjust num_threads w/o warning as it is not a user setting
7800 // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7801 // no thread_limit clause specified - do not change thread-limit-var ICV
7802 if (num_threads > __kmp_dflt_team_nth) {
7803 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7804 }
7805 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7806 num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7807 } // prevent the team size from exceeding thread-limit-var
7808 if (num_teams * num_threads > __kmp_teams_max_nth) {
7809 num_threads = __kmp_teams_max_nth / num_teams;
7810 }
7811 if (num_threads == 0) {
7812 num_threads = 1;
7813 }
7814 } else {
7815 if (num_threads < 0) {
7816 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7817 __kmp_msg_null);
7818 num_threads = 1;
7819 }
7820 // This thread will be the primary thread of the league of primary threads
7821 // Store new thread limit; old limit is saved in th_cg_roots list
7822 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7823 // num_threads = min(num_threads, nthreads-var)
7824 if (num_threads > __kmp_dflt_team_nth) {
7825 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7826 }
7827 if (num_teams * num_threads > __kmp_teams_max_nth) {
7828 int new_threads = __kmp_teams_max_nth / num_teams;
7829 if (new_threads == 0) {
7830 new_threads = 1;
7831 }
7832 if (new_threads != num_threads) {
7833 if (!__kmp_reserve_warn) { // user asked for too many threads
7834 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7835 __kmp_msg(kmp_ms_warning,
7836 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7837 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7838 }
7839 }
7840 num_threads = new_threads;
7841 }
7842 }
7843 thr->th.th_teams_size.nth = num_threads;
7844}
7845
7846/* This sets the requested number of teams for the teams region and/or
7847 the number of threads for the next parallel region encountered */
7848void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7849 int num_threads) {
7850 kmp_info_t *thr = __kmp_threads[gtid];
7851 if (num_teams < 0) {
7852 // OpenMP specification requires requested values to be positive,
7853 // but people can send us any value, so we'd better check
7854 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7855 __kmp_msg_null);
7856 num_teams = 1;
7857 }
7858 if (num_teams == 0) {
7859 if (__kmp_nteams > 0) {
7860 num_teams = __kmp_nteams;
7861 } else {
7862 num_teams = 1; // default number of teams is 1.
7863 }
7864 }
7865 if (num_teams > __kmp_teams_max_nth) { // too many teams requested
7866 if (!__kmp_reserve_warn) {
7867 __kmp_reserve_warn = 1;
7868 __kmp_msg(kmp_ms_warning,
7869 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7870 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7871 }
7872 num_teams = __kmp_teams_max_nth;
7873 }
7874 // Set number of teams (number of threads in the outer "parallel" of the
7875 // teams)
7876 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7877
7878 __kmp_push_thread_limit(thr, num_teams, num_threads);
7879}
7880
7881/* This sets the requested number of teams for the teams region and/or
7882 the number of threads for the next parallel region encountered */
7883void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7884 int num_teams_ub, int num_threads) {
7885 kmp_info_t *thr = __kmp_threads[gtid];
7886 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7887 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7888 KMP_DEBUG_ASSERT(num_threads >= 0);
7889
7890 if (num_teams_lb > num_teams_ub) {
7891 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7892 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7893 }
7894
7895 int num_teams = 1; // default number of teams is 1.
7896
7897 if (num_teams_lb == 0 && num_teams_ub > 0)
7898 num_teams_lb = num_teams_ub;
7899
7900 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7901 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7902 if (num_teams > __kmp_teams_max_nth) {
7903 if (!__kmp_reserve_warn) {
7904 __kmp_reserve_warn = 1;
7905 __kmp_msg(kmp_ms_warning,
7906 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7907 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7908 }
7909 num_teams = __kmp_teams_max_nth;
7910 }
7911 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7912 num_teams = num_teams_ub;
7913 } else { // num_teams_lb <= num_teams <= num_teams_ub
7914 if (num_threads <= 0) {
7915 if (num_teams_ub > __kmp_teams_max_nth) {
7916 num_teams = num_teams_lb;
7917 } else {
7918 num_teams = num_teams_ub;
7919 }
7920 } else {
7921 num_teams = (num_threads > __kmp_teams_max_nth)
7922 ? num_teams
7923 : __kmp_teams_max_nth / num_threads;
7924 if (num_teams < num_teams_lb) {
7925 num_teams = num_teams_lb;
7926 } else if (num_teams > num_teams_ub) {
7927 num_teams = num_teams_ub;
7928 }
7929 }
7930 }
7931 // Set number of teams (number of threads in the outer "parallel" of the
7932 // teams)
7933 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7934
7935 __kmp_push_thread_limit(thr, num_teams, num_threads);
7936}
7937
7938// Set the proc_bind var to use in the following parallel region.
7939void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7940 kmp_info_t *thr = __kmp_threads[gtid];
7941 thr->th.th_set_proc_bind = proc_bind;
7942}
7943
7944/* Launch the worker threads into the microtask. */
7945
7946void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7947 kmp_info_t *this_thr = __kmp_threads[gtid];
7948
7949#ifdef KMP_DEBUG
7950 int f;
7951#endif /* KMP_DEBUG */
7952
7953 KMP_DEBUG_ASSERT(team);
7954 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7955 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7956 KMP_MB(); /* Flush all pending memory write invalidates. */
7957
7958 team->t.t_construct = 0; /* no single directives seen yet */
7959 team->t.t_ordered.dt.t_value =
7960 0; /* thread 0 enters the ordered section first */
7961
7962 /* Reset the identifiers on the dispatch buffer */
7963 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7964 if (team->t.t_max_nproc > 1) {
7965 int i;
7966 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7967 team->t.t_disp_buffer[i].buffer_index = i;
7968 team->t.t_disp_buffer[i].doacross_buf_idx = i;
7969 }
7970 } else {
7971 team->t.t_disp_buffer[0].buffer_index = 0;
7972 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7973 }
7974
7975 KMP_MB(); /* Flush all pending memory write invalidates. */
7976 KMP_ASSERT(this_thr->th.th_team == team);
7977
7978#ifdef KMP_DEBUG
7979 for (f = 0; f < team->t.t_nproc; f++) {
7980 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7981 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7982 }
7983#endif /* KMP_DEBUG */
7984
7985 /* release the worker threads so they may begin working */
7986 __kmp_fork_barrier(gtid, 0);
7987}
7988
7989void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7990 kmp_info_t *this_thr = __kmp_threads[gtid];
7991
7992 KMP_DEBUG_ASSERT(team);
7993 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7994 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7995 KMP_MB(); /* Flush all pending memory write invalidates. */
7996
7997 /* Join barrier after fork */
7998
7999#ifdef KMP_DEBUG
8000 if (__kmp_threads[gtid] &&
8001 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8002 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8003 __kmp_threads[gtid]);
8004 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8005 "team->t.t_nproc=%d\n",
8006 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8007 team->t.t_nproc);
8008 __kmp_print_structure();
8009 }
8010 KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8011 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8012#endif /* KMP_DEBUG */
8013
8014 __kmp_join_barrier(gtid); /* wait for everyone */
8015#if OMPT_SUPPORT
8016 if (ompt_enabled.enabled &&
8017 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
8018 int ds_tid = this_thr->th.th_info.ds.ds_tid;
8019 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8020 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8021#if OMPT_OPTIONAL
8022 void *codeptr = NULL;
8023 if (KMP_MASTER_TID(ds_tid) &&
8024 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8025 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8026 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8027
8028 if (ompt_enabled.ompt_callback_sync_region_wait) {
8029 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8030 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8031 codeptr);
8032 }
8033 if (ompt_enabled.ompt_callback_sync_region) {
8034 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8035 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8036 codeptr);
8037 }
8038#endif
8039 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8040 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8041 ompt_scope_end, NULL, task_data, 0, ds_tid,
8042 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8043 }
8044 }
8045#endif
8046
8047 KMP_MB(); /* Flush all pending memory write invalidates. */
8048 KMP_ASSERT(this_thr->th.th_team == team);
8049}
8050
8051/* ------------------------------------------------------------------------ */
8052
8053#ifdef USE_LOAD_BALANCE
8054
8055// Return the number of worker threads actively spinning in the hot team, if
8056// we are at the outermost level of parallelism. Otherwise, return 0.
8057static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8058 int i;
8059 int retval;
8060 kmp_team_t *hot_team;
8061
8062 if (root->r.r_active) {
8063 return 0;
8064 }
8065 hot_team = root->r.r_hot_team;
8066 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8067 return hot_team->t.t_nproc - 1; // Don't count primary thread
8068 }
8069
8070 // Skip the primary thread - it is accounted for elsewhere.
8071 retval = 0;
8072 for (i = 1; i < hot_team->t.t_nproc; i++) {
8073 if (hot_team->t.t_threads[i]->th.th_active) {
8074 retval++;
8075 }
8076 }
8077 return retval;
8078}
8079
8080// Perform an automatic adjustment to the number of
8081// threads used by the next parallel region.
8082static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8083 int retval;
8084 int pool_active;
8085 int hot_team_active;
8086 int team_curr_active;
8087 int system_active;
8088
8089 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8090 set_nproc));
8091 KMP_DEBUG_ASSERT(root);
8092 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8093 ->th.th_current_task->td_icvs.dynamic == TRUE);
8094 KMP_DEBUG_ASSERT(set_nproc > 1);
8095
8096 if (set_nproc == 1) {
8097 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8098 return 1;
8099 }
8100
8101 // Threads that are active in the thread pool, active in the hot team for this
8102 // particular root (if we are at the outer par level), and the currently
8103 // executing thread (to become the primary thread) are available to add to the
8104 // new team, but are currently contributing to the system load, and must be
8105 // accounted for.
8106 pool_active = __kmp_thread_pool_active_nth;
8107 hot_team_active = __kmp_active_hot_team_nproc(root);
8108 team_curr_active = pool_active + hot_team_active + 1;
8109
8110 // Check the system load.
8111 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8112 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8113 "hot team active = %d\n",
8114 system_active, pool_active, hot_team_active));
8115
8116 if (system_active < 0) {
8117 // There was an error reading the necessary info from /proc, so use the
8118 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8119 // = dynamic_thread_limit, we shouldn't wind up getting back here.
8120 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8121 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8122
8123 // Make this call behave like the thread limit algorithm.
8124 retval = __kmp_avail_proc - __kmp_nth +
8125 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8126 if (retval > set_nproc) {
8127 retval = set_nproc;
8128 }
8129 if (retval < KMP_MIN_NTH) {
8130 retval = KMP_MIN_NTH;
8131 }
8132
8133 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8134 retval));
8135 return retval;
8136 }
8137
8138 // There is a slight delay in the load balance algorithm in detecting new
8139 // running procs. The real system load at this instant should be at least as
8140 // large as the number of active OMP threads available to add to the team.
8141 if (system_active < team_curr_active) {
8142 system_active = team_curr_active;
8143 }
8144 retval = __kmp_avail_proc - system_active + team_curr_active;
8145 if (retval > set_nproc) {
8146 retval = set_nproc;
8147 }
8148 if (retval < KMP_MIN_NTH) {
8149 retval = KMP_MIN_NTH;
8150 }
8151
8152 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8153 return retval;
8154} // __kmp_load_balance_nproc()
8155
8156#endif /* USE_LOAD_BALANCE */
8157
8158/* ------------------------------------------------------------------------ */
8159
8160/* NOTE: this is called with the __kmp_init_lock held */
8161void __kmp_cleanup(void) {
8162 int f;
8163
8164 KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8165
8166 if (TCR_4(__kmp_init_parallel)) {
8167#if KMP_HANDLE_SIGNALS
8168 __kmp_remove_signals();
8169#endif
8170 TCW_4(__kmp_init_parallel, FALSE);
8171 }
8172
8173 if (TCR_4(__kmp_init_middle)) {
8174#if KMP_AFFINITY_SUPPORTED
8175 __kmp_affinity_uninitialize();
8176#endif /* KMP_AFFINITY_SUPPORTED */
8177 __kmp_cleanup_hierarchy();
8178 TCW_4(__kmp_init_middle, FALSE);
8179 }
8180
8181 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8182
8183 if (__kmp_init_serial) {
8184 __kmp_runtime_destroy();
8185 __kmp_init_serial = FALSE;
8186 }
8187
8188 __kmp_cleanup_threadprivate_caches();
8189
8190 for (f = 0; f < __kmp_threads_capacity; f++) {
8191 if (__kmp_root[f] != NULL) {
8192 __kmp_free(__kmp_root[f]);
8193 __kmp_root[f] = NULL;
8194 }
8195 }
8196 __kmp_free(__kmp_threads);
8197 // __kmp_threads and __kmp_root were allocated at once, as a single block, so
8198 // there is no need to free __kmp_root separately.
8199 __kmp_threads = NULL;
8200 __kmp_root = NULL;
8201 __kmp_threads_capacity = 0;
8202
8203 // Free old __kmp_threads arrays if they exist.
8204 kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8205 while (ptr) {
8206 kmp_old_threads_list_t *next = ptr->next;
8207 __kmp_free(ptr->threads);
8208 __kmp_free(ptr);
8209 ptr = next;
8210 }
8211
8212#if KMP_USE_DYNAMIC_LOCK
8213 __kmp_cleanup_indirect_user_locks();
8214#else
8215 __kmp_cleanup_user_locks();
8216#endif
8217#if OMPD_SUPPORT
8218 if (ompd_state) {
8219 __kmp_free(ompd_env_block);
8220 ompd_env_block = NULL;
8221 ompd_env_block_size = 0;
8222 }
8223#endif
8224
8225#if KMP_AFFINITY_SUPPORTED
8226 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8227 __kmp_cpuinfo_file = NULL;
8228#endif /* KMP_AFFINITY_SUPPORTED */
8229
8230#if KMP_USE_ADAPTIVE_LOCKS
8231#if KMP_DEBUG_ADAPTIVE_LOCKS
8232 __kmp_print_speculative_stats();
8233#endif
8234#endif
8235 KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8236 __kmp_nested_nth.nth = NULL;
8237 __kmp_nested_nth.size = 0;
8238 __kmp_nested_nth.used = 0;
8239 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8240 __kmp_nested_proc_bind.bind_types = NULL;
8241 __kmp_nested_proc_bind.size = 0;
8242 __kmp_nested_proc_bind.used = 0;
8243 if (__kmp_affinity_format) {
8244 KMP_INTERNAL_FREE(__kmp_affinity_format);
8245 __kmp_affinity_format = NULL;
8246 }
8247
8248 __kmp_i18n_catclose();
8249
8250#if KMP_USE_HIER_SCHED
8251 __kmp_hier_scheds.deallocate();
8252#endif
8253
8254#if KMP_STATS_ENABLED
8255 __kmp_stats_fini();
8256#endif
8257
8258 KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8259}
8260
8261/* ------------------------------------------------------------------------ */
8262
8263int __kmp_ignore_mppbeg(void) {
8264 char *env;
8265
8266 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8267 if (__kmp_str_match_false(env))
8268 return FALSE;
8269 }
8270 // By default __kmpc_begin() is a no-op.
8271 return TRUE;
8272}
8273
8274int __kmp_ignore_mppend(void) {
8275 char *env;
8276
8277 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8278 if (__kmp_str_match_false(env))
8279 return FALSE;
8280 }
8281 // By default __kmpc_end() is a no-op.
8282 return TRUE;
8283}
8284
8285void __kmp_internal_begin(void) {
8286 int gtid;
8287 kmp_root_t *root;
8288
8289 /* this is a very important step as it will register new sibling threads
8290 and assign these new uber threads a new gtid */
8291 gtid = __kmp_entry_gtid();
8292 root = __kmp_threads[gtid]->th.th_root;
8293 KMP_ASSERT(KMP_UBER_GTID(gtid));
8294
8295 if (root->r.r_begin)
8296 return;
8297 __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8298 if (root->r.r_begin) {
8299 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8300 return;
8301 }
8302
8303 root->r.r_begin = TRUE;
8304
8305 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8306}
8307
8308/* ------------------------------------------------------------------------ */
8309
8310void __kmp_user_set_library(enum library_type arg) {
8311 int gtid;
8312 kmp_root_t *root;
8313 kmp_info_t *thread;
8314
8315 /* first, make sure we are initialized so we can get our gtid */
8316
8317 gtid = __kmp_entry_gtid();
8318 thread = __kmp_threads[gtid];
8319
8320 root = thread->th.th_root;
8321
8322 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8323 library_serial));
8324 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8325 thread */
8326 KMP_WARNING(SetLibraryIncorrectCall);
8327 return;
8328 }
8329
8330 switch (arg) {
8331 case library_serial:
8332 thread->th.th_set_nproc = 0;
8333 set__nproc(thread, 1);
8334 break;
8335 case library_turnaround:
8336 thread->th.th_set_nproc = 0;
8337 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8338 : __kmp_dflt_team_nth_ub);
8339 break;
8340 case library_throughput:
8341 thread->th.th_set_nproc = 0;
8342 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8343 : __kmp_dflt_team_nth_ub);
8344 break;
8345 default:
8346 KMP_FATAL(UnknownLibraryType, arg);
8347 }
8348
8349 __kmp_aux_set_library(arg);
8350}
8351
8352void __kmp_aux_set_stacksize(size_t arg) {
8353 if (!__kmp_init_serial)
8354 __kmp_serial_initialize();
8355
8356#if KMP_OS_DARWIN
8357 if (arg & (0x1000 - 1)) {
8358 arg &= ~(0x1000 - 1);
8359 if (arg + 0x1000) /* check for overflow if we round up */
8360 arg += 0x1000;
8361 }
8362#endif
8363 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8364
8365 /* only change the default stacksize before the first parallel region */
8366 if (!TCR_4(__kmp_init_parallel)) {
8367 size_t value = arg; /* argument is in bytes */
8368
8369 if (value < __kmp_sys_min_stksize)
8370 value = __kmp_sys_min_stksize;
8371 else if (value > KMP_MAX_STKSIZE)
8372 value = KMP_MAX_STKSIZE;
8373
8374 __kmp_stksize = value;
8375
8376 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8377 }
8378
8379 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8380}
8381
8382/* set the behaviour of the runtime library */
8383/* TODO this can cause some odd behaviour with sibling parallelism... */
8384void __kmp_aux_set_library(enum library_type arg) {
8385 __kmp_library = arg;
8386
8387 switch (__kmp_library) {
8388 case library_serial: {
8389 KMP_INFORM(LibraryIsSerial);
8390 } break;
8391 case library_turnaround:
8392 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8393 __kmp_use_yield = 2; // only yield when oversubscribed
8394 break;
8395 case library_throughput:
8396 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8397 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8398 break;
8399 default:
8400 KMP_FATAL(UnknownLibraryType, arg);
8401 }
8402}
8403
8404/* Getting team information common for all team API */
8405// Returns NULL if not in teams construct
8406static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8407 kmp_info_t *thr = __kmp_entry_thread();
8408 teams_serialized = 0;
8409 if (thr->th.th_teams_microtask) {
8410 kmp_team_t *team = thr->th.th_team;
8411 int tlevel = thr->th.th_teams_level; // the level of the teams construct
8412 int ii = team->t.t_level;
8413 teams_serialized = team->t.t_serialized;
8414 int level = tlevel + 1;
8415 KMP_DEBUG_ASSERT(ii >= tlevel);
8416 while (ii > level) {
8417 for (teams_serialized = team->t.t_serialized;
8418 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8419 }
8420 if (team->t.t_serialized && (!teams_serialized)) {
8421 team = team->t.t_parent;
8422 continue;
8423 }
8424 if (ii > level) {
8425 team = team->t.t_parent;
8426 ii--;
8427 }
8428 }
8429 return team;
8430 }
8431 return NULL;
8432}
8433
8434int __kmp_aux_get_team_num() {
8435 int serialized;
8436 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8437 if (team) {
8438 if (serialized > 1) {
8439 return 0; // teams region is serialized ( 1 team of 1 thread ).
8440 } else {
8441 return team->t.t_master_tid;
8442 }
8443 }
8444 return 0;
8445}
8446
8447int __kmp_aux_get_num_teams() {
8448 int serialized;
8449 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8450 if (team) {
8451 if (serialized > 1) {
8452 return 1;
8453 } else {
8454 return team->t.t_parent->t.t_nproc;
8455 }
8456 }
8457 return 1;
8458}
8459
8460/* ------------------------------------------------------------------------ */
8461
8462/*
8463 * Affinity Format Parser
8464 *
8465 * Field is in form of: %[[[0].]size]type
8466 * % and type are required (%% means print a literal '%')
8467 * type is either single char or long name surrounded by {},
8468 * e.g., N or {num_threads}
8469 * 0 => leading zeros
8470 * . => right justified when size is specified
8471 * by default output is left justified
8472 * size is the *minimum* field length
8473 * All other characters are printed as is
8474 *
8475 * Available field types:
8476 * L {thread_level} - omp_get_level()
8477 * n {thread_num} - omp_get_thread_num()
8478 * h {host} - name of host machine
8479 * P {process_id} - process id (integer)
8480 * T {thread_identifier} - native thread identifier (integer)
8481 * N {num_threads} - omp_get_num_threads()
8482 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8483 * a {thread_affinity} - comma separated list of integers or integer ranges
8484 * (values of affinity mask)
8485 *
8486 * Implementation-specific field types can be added
8487 * If a type is unknown, print "undefined"
8488 */
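
// Illustrative usage (editorial sketch, guarded out of the build): the field
// types documented above as seen from user code, using the OpenMP 5.0
// omp_set_affinity_format()/omp_display_affinity() API. The %A field is only
// meaningful when affinity is supported; the format string is an arbitrary
// example.
#if 0
#include <omp.h>

int main() {
  // Equivalent to setting OMP_AFFINITY_FORMAT in the environment.
  omp_set_affinity_format("host=%H pid=%P thread=%0.4n of %N affinity={%A}");
#pragma omp parallel num_threads(2)
  omp_display_affinity(NULL); // NULL selects the affinity-format-var ICV
  return 0;
}
#endif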
8489
8490// Structure holding the short name, long name, and corresponding data type
8491// for snprintf. A table of these will represent the entire valid keyword
8492// field types.
8493typedef struct kmp_affinity_format_field_t {
8494 char short_name; // from spec e.g., L -> thread level
8495 const char *long_name; // from spec thread_level -> thread level
8496 char field_format; // data type for snprintf (typically 'd' or 's'
8497 // for integer or string)
8498} kmp_affinity_format_field_t;
8499
8500static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8501#if KMP_AFFINITY_SUPPORTED
8502 {'A', "thread_affinity", 's'},
8503#endif
8504 {'t', "team_num", 'd'},
8505 {'T', "num_teams", 'd'},
8506 {'L', "nesting_level", 'd'},
8507 {'n', "thread_num", 'd'},
8508 {'N', "num_threads", 'd'},
8509 {'a', "ancestor_tnum", 'd'},
8510 {'H', "host", 's'},
8511 {'P', "process_id", 'd'},
8512 {'i', "native_thread_id", 'd'}};
8513
8514// Return the number of characters it takes to hold field
8515static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8516 const char **ptr,
8517 kmp_str_buf_t *field_buffer) {
8518 int rc, format_index, field_value;
8519 const char *width_left, *width_right;
8520 bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8521 static const int FORMAT_SIZE = 20;
8522 char format[FORMAT_SIZE] = {0};
8523 char absolute_short_name = 0;
8524
8525 KMP_DEBUG_ASSERT(gtid >= 0);
8526 KMP_DEBUG_ASSERT(th);
8527 KMP_DEBUG_ASSERT(**ptr == '%');
8528 KMP_DEBUG_ASSERT(field_buffer);
8529
8530 __kmp_str_buf_clear(field_buffer);
8531
8532 // Skip the initial %
8533 (*ptr)++;
8534
8535 // Check for %% first
8536 if (**ptr == '%') {
8537 __kmp_str_buf_cat(field_buffer, "%", 1);
8538 (*ptr)++; // skip over the second %
8539 return 1;
8540 }
8541
8542 // Parse field modifiers if they are present
8543 pad_zeros = false;
8544 if (**ptr == '0') {
8545 pad_zeros = true;
8546 (*ptr)++; // skip over 0
8547 }
8548 right_justify = false;
8549 if (**ptr == '.') {
8550 right_justify = true;
8551 (*ptr)++; // skip over .
8552 }
8553 // Parse width of field: [width_left, width_right)
8554 width_left = width_right = NULL;
8555 if (**ptr >= '0' && **ptr <= '9') {
8556 width_left = *ptr;
8557 SKIP_DIGITS(*ptr);
8558 width_right = *ptr;
8559 }
8560
8561 // Create the format for KMP_SNPRINTF based on flags parsed above
8562 format_index = 0;
8563 format[format_index++] = '%';
8564 if (!right_justify)
8565 format[format_index++] = '-';
8566 if (pad_zeros)
8567 format[format_index++] = '0';
8568 if (width_left && width_right) {
8569 int i = 0;
8570 // Only allow 8 digit number widths.
8571 // This also prevents overflowing the format variable
8572 while (i < 8 && width_left < width_right) {
8573 format[format_index++] = *width_left;
8574 width_left++;
8575 i++;
8576 }
8577 }
8578
8579 // Parse a name (long or short)
8580 // Canonicalize the name into absolute_short_name
8581 found_valid_name = false;
8582 parse_long_name = (**ptr == '{');
8583 if (parse_long_name)
8584 (*ptr)++; // skip initial left brace
8585 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8586 sizeof(__kmp_affinity_format_table[0]);
8587 ++i) {
8588 char short_name = __kmp_affinity_format_table[i].short_name;
8589 const char *long_name = __kmp_affinity_format_table[i].long_name;
8590 char field_format = __kmp_affinity_format_table[i].field_format;
8591 if (parse_long_name) {
8592 size_t length = KMP_STRLEN(long_name);
8593 if (strncmp(*ptr, long_name, length) == 0) {
8594 found_valid_name = true;
8595 (*ptr) += length; // skip the long name
8596 }
8597 } else if (**ptr == short_name) {
8598 found_valid_name = true;
8599 (*ptr)++; // skip the short name
8600 }
8601 if (found_valid_name) {
8602 format[format_index++] = field_format;
8603 format[format_index++] = '\0';
8604 absolute_short_name = short_name;
8605 break;
8606 }
8607 }
8608 if (parse_long_name) {
8609 if (**ptr != '}') {
8610 absolute_short_name = 0;
8611 } else {
8612 (*ptr)++; // skip over the right brace
8613 }
8614 }
8615
8616 // Attempt to fill the buffer with the requested
8617 // value using snprintf within __kmp_str_buf_print()
8618 switch (absolute_short_name) {
8619 case 't':
8620 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8621 break;
8622 case 'T':
8623 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8624 break;
8625 case 'L':
8626 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8627 break;
8628 case 'n':
8629 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8630 break;
8631 case 'H': {
8632 static const int BUFFER_SIZE = 256;
8633 char buf[BUFFER_SIZE];
8634 __kmp_expand_host_name(buf, BUFFER_SIZE);
8635 rc = __kmp_str_buf_print(field_buffer, format, buf);
8636 } break;
8637 case 'P':
8638 rc = __kmp_str_buf_print(field_buffer, format, getpid());
8639 break;
8640 case 'i':
8641 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8642 break;
8643 case 'N':
8644 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8645 break;
8646 case 'a':
8647 field_value =
8648 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8649 rc = __kmp_str_buf_print(field_buffer, format, field_value);
8650 break;
8651#if KMP_AFFINITY_SUPPORTED
8652 case 'A': {
8653 kmp_str_buf_t buf;
8654 __kmp_str_buf_init(&buf);
8655 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8656 rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8657 __kmp_str_buf_free(&buf);
8658 } break;
8659#endif
8660 default:
8661 // According to the spec, if an implementation does not have info for a field
8662 // type, then "undefined" is printed
8663 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8664 // Skip the field
8665 if (parse_long_name) {
8666 SKIP_TOKEN(*ptr);
8667 if (**ptr == '}')
8668 (*ptr)++;
8669 } else {
8670 (*ptr)++;
8671 }
8672 }
8673
8674 KMP_ASSERT(format_index <= FORMAT_SIZE);
8675 return rc;
8676}
8677
8678/*
8679 * Return the number of characters needed to hold the affinity string
8680 * (not including the terminating null character).
8681 * The resulting string is printed to buffer, which the caller can then
8682 * handle afterwards.
8683 */
8684size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8685 kmp_str_buf_t *buffer) {
8686 const char *parse_ptr;
8687 size_t retval;
8688 const kmp_info_t *th;
8689 kmp_str_buf_t field;
8690
8691 KMP_DEBUG_ASSERT(buffer);
8692 KMP_DEBUG_ASSERT(gtid >= 0);
8693
8694 __kmp_str_buf_init(&field);
8695 __kmp_str_buf_clear(buffer);
8696
8697 th = __kmp_threads[gtid];
8698 retval = 0;
8699
8700 // If format is NULL or zero-length string, then we use
8701 // affinity-format-var ICV
8702 parse_ptr = format;
8703 if (parse_ptr == NULL || *parse_ptr == '\0') {
8704 parse_ptr = __kmp_affinity_format;
8705 }
8706 KMP_DEBUG_ASSERT(parse_ptr);
8707
8708 while (*parse_ptr != '\0') {
8709 // Parse a field
8710 if (*parse_ptr == '%') {
8711 // Put field in the buffer
8712 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8713 __kmp_str_buf_catbuf(buffer, &field);
8714 retval += rc;
8715 } else {
8716 // Put literal character in buffer
8717 __kmp_str_buf_cat(buffer, parse_ptr, 1);
8718 retval++;
8719 parse_ptr++;
8720 }
8721 }
8722 __kmp_str_buf_free(&field);
8723 return retval;
8724}
8725
8726// Displays the affinity string to stdout
8727void __kmp_aux_display_affinity(int gtid, const char *format) {
8728 kmp_str_buf_t buf;
8729 __kmp_str_buf_init(&buf);
8730 __kmp_aux_capture_affinity(gtid, format, &buf);
8731 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8732 __kmp_str_buf_free(&buf);
8733}
8734
8735/* ------------------------------------------------------------------------ */
8736void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8737 int blocktime = arg; /* argument is in microseconds */
8738#if KMP_USE_MONITOR
8739 int bt_intervals;
8740#endif
8741 kmp_int8 bt_set;
8742
8743 __kmp_save_internal_controls(thread);
8744
8745 /* Normalize and set blocktime for the teams */
8746 if (blocktime < KMP_MIN_BLOCKTIME)
8747 blocktime = KMP_MIN_BLOCKTIME;
8748 else if (blocktime > KMP_MAX_BLOCKTIME)
8749 blocktime = KMP_MAX_BLOCKTIME;
8750
8751 set__blocktime_team(thread->th.th_team, tid, blocktime);
8752 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8753
8754#if KMP_USE_MONITOR
8755 /* Calculate and set blocktime intervals for the teams */
8756 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8757
8758 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8759 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8760#endif
8761
8762 /* Set whether blocktime has been set to "TRUE" */
8763 bt_set = TRUE;
8764
8765 set__bt_set_team(thread->th.th_team, tid, bt_set);
8766 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8767#if KMP_USE_MONITOR
8768 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8769 "bt_intervals=%d, monitor_updates=%d\n",
8770 __kmp_gtid_from_tid(tid, thread->th.th_team),
8771 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8772 __kmp_monitor_wakeups));
8773#else
8774 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8775 __kmp_gtid_from_tid(tid, thread->th.th_team),
8776 thread->th.th_team->t.t_id, tid, blocktime));
8777#endif
8778}
8779
8780void __kmp_aux_set_defaults(char const *str, size_t len) {
8781 if (!__kmp_init_serial) {
8782 __kmp_serial_initialize();
8783 }
8784 __kmp_env_initialize(str);
8785
8786 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8787 __kmp_env_print();
8788 }
8789} // __kmp_aux_set_defaults
8790
8791/* ------------------------------------------------------------------------ */
8792/* internal fast reduction routines */
8793
8794PACKED_REDUCTION_METHOD_T
8795__kmp_determine_reduction_method(
8796 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8797 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8798 kmp_critical_name *lck) {
8799
8800 // Default reduction method: critical construct ( lck != NULL, like in current
8801 // PAROPT )
8802 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8803 // can be selected by RTL
8804 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8805 // can be selected by RTL
8806 // Finally, it's up to OpenMP RTL to make a decision on which method to select
8807 // among those generated by PAROPT.
8808
8809 PACKED_REDUCTION_METHOD_T retval;
8810
8811 int team_size;
8812
8813 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8814
8815#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8816 (loc && \
8817 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8818#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8819
8820 retval = critical_reduce_block;
8821
8822 // another way of getting the team size (with 1 dynamic dereference) is slower
8823 team_size = __kmp_get_team_num_threads(global_tid);
8824 if (team_size == 1) {
8825
8826 retval = empty_reduce_block;
8827
8828 } else {
8829
8830 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8831
8832#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8833 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE
8834
8835#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8836 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8837
8838 int teamsize_cutoff = 4;
8839
8840#if KMP_MIC_SUPPORTED
8841 if (__kmp_mic_type != non_mic) {
8842 teamsize_cutoff = 8;
8843 }
8844#endif
8845 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8846 if (tree_available) {
8847 if (team_size <= teamsize_cutoff) {
8848 if (atomic_available) {
8849 retval = atomic_reduce_block;
8850 }
8851 } else {
8852 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8853 }
8854 } else if (atomic_available) {
8855 retval = atomic_reduce_block;
8856 }
8857#else
8858#error "Unknown or unsupported OS"
8859#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8860 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8861
8862#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8863
8864#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8865
8866 // basic tuning
8867
8868 if (atomic_available) {
8869 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8870 retval = atomic_reduce_block;
8871 }
8872 } // otherwise: use critical section
8873
8874#elif KMP_OS_DARWIN
8875
8876 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8877 if (atomic_available && (num_vars <= 3)) {
8878 retval = atomic_reduce_block;
8879 } else if (tree_available) {
8880 if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8881 (reduce_size < (2000 * sizeof(kmp_real64)))) {
8882 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8883 }
8884 } // otherwise: use critical section
8885
8886#else
8887#error "Unknown or unsupported OS"
8888#endif
8889
8890#else
8891#error "Unknown or unsupported architecture"
8892#endif
8893 }
8894
8895 // KMP_FORCE_REDUCTION
8896
8897 // If the team is serialized (team_size == 1), ignore the forced reduction
8898 // method and stay with the unsynchronized method (empty_reduce_block)
8899 if (__kmp_force_reduction_method != reduction_method_not_defined &&
8900 team_size != 1) {
8901
8902 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8903
8904 int atomic_available, tree_available;
8905
8906 switch ((forced_retval = __kmp_force_reduction_method)) {
8907 case critical_reduce_block:
8908 KMP_ASSERT(lck); // lck should be != 0
8909 break;
8910
8911 case atomic_reduce_block:
8912 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8913 if (!atomic_available) {
8914 KMP_WARNING(RedMethodNotSupported, "atomic");
8915 forced_retval = critical_reduce_block;
8916 }
8917 break;
8918
8919 case tree_reduce_block:
8920 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8921 if (!tree_available) {
8922 KMP_WARNING(RedMethodNotSupported, "tree");
8923 forced_retval = critical_reduce_block;
8924 } else {
8925#if KMP_FAST_REDUCTION_BARRIER
8926 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8927#endif
8928 }
8929 break;
8930
8931 default:
8932 KMP_ASSERT(0); // "unsupported method specified"
8933 }
8934
8935 retval = forced_retval;
8936 }
8937
8938 KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8939
8940#undef FAST_REDUCTION_TREE_METHOD_GENERATED
8941#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8942
8943 return (retval);
8944}
8945// this function is for testing set/get/determine reduce method
8946kmp_int32 __kmp_get_reduce_method(void) {
8947 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8948}
8949
8950// Soft pause sets up threads to ignore blocktime and just go to sleep.
8951// Spin-wait code checks __kmp_pause_status and reacts accordingly.
8952void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8953
8954// Hard pause shuts down the runtime completely. Resume happens naturally when
8955// OpenMP is used subsequently.
8956void __kmp_hard_pause() {
8957 __kmp_pause_status = kmp_hard_paused;
8958 __kmp_internal_end_thread(-1);
8959}
8960
8961// Soft resume sets __kmp_pause_status, and wakes up all threads.
8962void __kmp_resume_if_soft_paused() {
8963 if (__kmp_pause_status == kmp_soft_paused) {
8964 __kmp_pause_status = kmp_not_paused;
8965
8966 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8967 kmp_info_t *thread = __kmp_threads[gtid];
8968 if (thread) { // Wake it if sleeping
8969 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8970 thread);
8971 if (fl.is_sleeping())
8972 fl.resume(gtid);
8973 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8974 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8975 } else { // thread holds the lock and may sleep soon
8976 do { // until either the thread sleeps, or we can get the lock
8977 if (fl.is_sleeping()) {
8978 fl.resume(gtid);
8979 break;
8980 } else if (__kmp_try_suspend_mx(thread)) {
8981 __kmp_unlock_suspend_mx(thread);
8982 break;
8983 }
8984 } while (1);
8985 }
8986 }
8987 }
8988 }
8989}
8990
8991// This function is called via __kmpc_pause_resource. Returns 0 if successful.
8992// TODO: add warning messages
8993int __kmp_pause_resource(kmp_pause_status_t level) {
8994 if (level == kmp_not_paused) { // requesting resume
8995 if (__kmp_pause_status == kmp_not_paused) {
8996 // error message about runtime not being paused, so can't resume
8997 return 1;
8998 } else {
8999 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9000 __kmp_pause_status == kmp_hard_paused);
9001 __kmp_pause_status = kmp_not_paused;
9002 return 0;
9003 }
9004 } else if (level == kmp_soft_paused) { // requesting soft pause
9005 if (__kmp_pause_status != kmp_not_paused) {
9006 // error message about already being paused
9007 return 1;
9008 } else {
9009 __kmp_soft_pause();
9010 return 0;
9011 }
9012 } else if (level == kmp_hard_paused) { // requesting hard pause
9013 if (__kmp_pause_status != kmp_not_paused) {
9014 // error message about already being paused
9015 return 1;
9016 } else {
9017 __kmp_hard_pause();
9018 return 0;
9019 }
9020 } else {
9021 // error message about invalid level
9022 return 1;
9023 }
9024}
9025
9026void __kmp_omp_display_env(int verbose) {
9027 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9028 if (__kmp_init_serial == 0)
9029 __kmp_do_serial_initialize();
9030 __kmp_display_env_impl(!verbose, verbose);
9031 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9032}
9033
9034// The team size is changing, so distributed barrier must be modified
9035void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9036 int new_nthreads) {
9037 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9038 bp_dist_bar);
9039 kmp_info_t **other_threads = team->t.t_threads;
9040
9041 // We want all the workers to stop waiting on the barrier while we adjust the
9042 // size of the team.
9043 for (int f = 1; f < old_nthreads; ++f) {
9044 KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9045 // Ignore threads that are already inactive or not present in the team
9046 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9047 // The teams construct can pass in thread_limit threads, some of which
9048 // may be inactive; just ignore them
9049 continue;
9050 }
9051 // If thread is transitioning still to in_use state, wait for it
9052 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9053 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9054 KMP_CPU_PAUSE();
9055 }
9056 // The thread should be in_use now
9057 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9058 // Transition to unused state
9059 team->t.t_threads[f]->th.th_used_in_team.store(2);
9060 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9061 }
9062 // Release all the workers
9063 team->t.b->go_release();
9064
9065 KMP_MFENCE();
9066
9067 // Workers should see transition status 2 and move to 0; but may need to be
9068 // woken up first
9069 int count = old_nthreads - 1;
9070 while (count > 0) {
9071 count = old_nthreads - 1;
9072 for (int f = 1; f < old_nthreads; ++f) {
9073 if (other_threads[f]->th.th_used_in_team.load() != 0) {
9074 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9075 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9076 void *, other_threads[f]->th.th_sleep_loc);
9077 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9078 }
9079 } else {
9080 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9081 count--;
9082 }
9083 }
9084 }
9085 // Now update the barrier size
9086 team->t.b->update_num_threads(new_nthreads);
9087 team->t.b->go_reset();
9088}
9089
9090void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9091 // Add the threads back to the team
9092 KMP_DEBUG_ASSERT(team);
9093 // Threads were paused and pointed at th_used_in_team temporarily during a
9094 // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9095 // the thread that it should transition itself back into the team. Then, if
9096 // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9097 // to wake it up.
9098 for (int f = 1; f < new_nthreads; ++f) {
9099 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9100 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9101 3);
9102 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9103 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9104 (kmp_flag_32<false, false> *)NULL);
9105 }
9106 }
9107 // The threads should be transitioning to the team; when they are done, they
9108 // should have set th_used_in_team to 1. This loop forces the primary thread
9109 // to wait until all threads have moved into the team and are waiting in the barrier.
9110 int count = new_nthreads - 1;
9111 while (count > 0) {
9112 count = new_nthreads - 1;
9113 for (int f = 1; f < new_nthreads; ++f) {
9114 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9115 count--;
9116 }
9117 }
9118 }
9119}
9120
9121// Globals and functions for hidden helper task
9122kmp_info_t **__kmp_hidden_helper_threads;
9123kmp_info_t *__kmp_hidden_helper_main_thread;
9124std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9125#if KMP_OS_LINUX
9126kmp_int32 __kmp_hidden_helper_threads_num = 8;
9127kmp_int32 __kmp_enable_hidden_helper = TRUE;
9128#else
9129kmp_int32 __kmp_hidden_helper_threads_num = 0;
9130kmp_int32 __kmp_enable_hidden_helper = FALSE;
9131#endif
9132
9133namespace {
9134std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9135
9136void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9137 // This is an explicit synchronization on all hidden helper threads, covering
9138 // the case where a regular thread pushes a hidden helper task to a hidden
9139 // helper thread that has not yet been woken since the helpers were released
9140 // by the main thread after creating the team.
9141 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9142 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9143 __kmp_hidden_helper_threads_num)
9144 ;
9145
9146 // If main thread, then wait for signal
9147 if (__kmpc_master(nullptr, *gtid)) {
9148 // First, unset the initial state and release the initial thread
9149 TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9150 __kmp_hidden_helper_initz_release();
9151 __kmp_hidden_helper_main_thread_wait();
9152 // Now wake up all worker threads
9153 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9154 __kmp_hidden_helper_worker_thread_signal();
9155 }
9156 }
9157}
9158} // namespace
9159
9160void __kmp_hidden_helper_threads_initz_routine() {
9161 // Create a new root for hidden helper team/threads
9162 const int gtid = __kmp_register_root(TRUE);
9163 __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9164 __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9165 __kmp_hidden_helper_main_thread->th.th_set_nproc =
9166 __kmp_hidden_helper_threads_num;
9167
9168 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9169
9170 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9171
9172 // Set the initialization flag to FALSE
9173 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9174
9175 __kmp_hidden_helper_threads_deinitz_release();
9176}
9177
9178/* Nesting Mode:
9179 Set via KMP_NESTING_MODE, which takes an integer.
9180 Note: we skip duplicate topology levels, and skip levels with only
9181 one entity.
9182 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9183 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9184 in the topology, and initializes the number of threads at each of those
9185 levels to the number of entities at each level, respectively, below the
9186 entity at the parent level.
9187 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9188 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9189 the user to turn nesting on explicitly. This is an even more experimental
9190 option to this experimental feature, and may change or go away in the
9191 future.
9192*/
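
// Illustrative usage (editorial sketch, guarded out of the build):
// KMP_NESTING_MODE is read when the runtime initializes, so it must be set
// before the first OpenMP construct or API call in the process. setenv() is
// POSIX; the reported numbers depend entirely on the detected topology.
#if 0
#include <omp.h>
#include <cstdio>
#include <cstdlib>

int main() {
  setenv("KMP_NESTING_MODE", "1", /*overwrite=*/1);
#pragma omp parallel
  {
#pragma omp master
    std::printf("outer level: %d threads, max active levels: %d\n",
                omp_get_num_threads(), omp_get_max_active_levels());
  }
  return 0;
}
#endif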
9193
9194// Allocate space to store nesting levels
9195void __kmp_init_nesting_mode() {
9196 int levels = KMP_HW_LAST;
9197 __kmp_nesting_mode_nlevels = levels;
9198 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9199 for (int i = 0; i < levels; ++i)
9200 __kmp_nesting_nth_level[i] = 0;
9201 if (__kmp_nested_nth.size < levels) {
9202 __kmp_nested_nth.nth =
9203 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9204 __kmp_nested_nth.size = levels;
9205 }
9206}
9207
9208// Set # threads for top levels of nesting; must be called after topology set
9209void __kmp_set_nesting_mode_threads() {
9210 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9211
9212 if (__kmp_nesting_mode == 1)
9213 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9214 else if (__kmp_nesting_mode > 1)
9215 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9216
9217 if (__kmp_topology) { // use topology info
9218 int loc, hw_level;
9219 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9220 loc < __kmp_nesting_mode_nlevels;
9221 loc++, hw_level++) {
9222 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9223 if (__kmp_nesting_nth_level[loc] == 1)
9224 loc--;
9225 }
9226 // Make sure all cores are used
9227 if (__kmp_nesting_mode > 1 && loc > 1) {
9228 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9229 int num_cores = __kmp_topology->get_count(core_level);
9230 int upper_levels = 1;
9231 for (int level = 0; level < loc - 1; ++level)
9232 upper_levels *= __kmp_nesting_nth_level[level];
9233 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9234 __kmp_nesting_nth_level[loc - 1] =
9235 num_cores / __kmp_nesting_nth_level[loc - 2];
9236 }
9237 __kmp_nesting_mode_nlevels = loc;
9238 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9239 } else { // no topology info available; provide a reasonable guesstimation
9240 if (__kmp_avail_proc >= 4) {
9241 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9242 __kmp_nesting_nth_level[1] = 2;
9243 __kmp_nesting_mode_nlevels = 2;
9244 } else {
9245 __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9246 __kmp_nesting_mode_nlevels = 1;
9247 }
9248 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9249 }
9250 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9251 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9252 }
9253 set__nproc(thread, __kmp_nesting_nth_level[0]);
9254 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9255 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9256 if (get__max_active_levels(thread) > 1) {
9257 // if max levels was set, set nesting mode levels to same
9258 __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9259 }
9260 if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9261 set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9262}
9263
9264// Empty symbols to export (see exports_so.txt) when feature is disabled
9265extern "C" {
9266#if !KMP_STATS_ENABLED
9267void __kmp_reset_stats() {}
9268#endif
9269#if !USE_DEBUGGER
9270int __kmp_omp_debug_struct_info = FALSE;
9271int __kmp_debugging = FALSE;
9272#endif
9273#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9274void __kmp_itt_fini_ittlib() {}
9275void __kmp_itt_init_ittlib() {}
9276#endif
9277}
9278
9279// end of file