1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // Windows does not need these include files because it does not use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
57 #endif
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61  KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
63 
64 char const __kmp_version_omp_api[] =
65  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66 
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69  KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
79 
80 /* Forward declarations */
81 
82 void __kmp_cleanup(void);
83 
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85  int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87  kmp_internal_control_t *new_icvs,
88  ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91  int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97  kmp_internal_control_t *new_icvs, ident_t *loc);
98 
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
102 
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109 
110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111  int new_nthreads);
112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113 
114 /* Calculate the identifier of the current thread */
115 /* fast (and somewhat portable) way to get unique identifier of executing
116  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117 int __kmp_get_global_thread_id() {
118  int i;
119  kmp_info_t **other_threads;
120  size_t stack_data;
121  char *stack_addr;
122  size_t stack_size;
123  char *stack_base;
124 
125  KA_TRACE(
126  1000,
127  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128  __kmp_nth, __kmp_all_nth));
129 
130  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131  a parallel region, made it return KMP_GTID_DNE to force serial_initialize
132  by the caller. We had to handle KMP_GTID_DNE at all call sites, or else guarantee
133  __kmp_init_gtid for this to work. */
134 
135  if (!TCR_4(__kmp_init_gtid))
136  return KMP_GTID_DNE;
137 
138 #ifdef KMP_TDATA_GTID
139  if (TCR_4(__kmp_gtid_mode) >= 3) {
140  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141  return __kmp_gtid;
142  }
143 #endif
144  if (TCR_4(__kmp_gtid_mode) >= 2) {
145  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146  return __kmp_gtid_get_specific();
147  }
148  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149 
150  stack_addr = (char *)&stack_data;
151  other_threads = __kmp_threads;
152 
153  /* ATT: The code below is a source of potential bugs due to unsynchronized
154  access to __kmp_threads array. For example:
155  1. Current thread loads other_threads[i] to thr and checks it, it is
156  non-NULL.
157  2. Current thread is suspended by OS.
158  3. Another thread unregisters and finishes (debug versions of free()
159  may fill memory with something like 0xEF).
160  4. Current thread is resumed.
161  5. Current thread reads junk from *thr.
162  TODO: Fix it. --ln */
163 
164  for (i = 0; i < __kmp_threads_capacity; i++) {
165 
166  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167  if (!thr)
168  continue;
169 
170  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172 
173  /* stack grows down -- search through all of the active threads */
174 
175  if (stack_addr <= stack_base) {
176  size_t stack_diff = stack_base - stack_addr;
177 
178  if (stack_diff <= stack_size) {
179  /* The only way we can be closer than the allocated */
180  /* stack size is if we are running on this thread. */
181  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182  return i;
183  }
184  }
185  }
186 
187  /* use get_specific to try to determine our gtid */
188  KA_TRACE(1000,
189  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190  "thread, using TLS\n"));
191  i = __kmp_gtid_get_specific();
192 
193  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
194 
195  /* if we haven't been assigned a gtid, then return the error code */
196  if (i < 0)
197  return i;
198 
199  /* dynamically updated stack window for uber threads to avoid get_specific
200  call */
201  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202  KMP_FATAL(StackOverflow, i);
203  }
204 
205  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206  if (stack_addr > stack_base) {
207  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210  stack_base);
211  } else {
212  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213  stack_base - stack_addr);
214  }
215 
216  /* Reprint stack bounds for ubermaster since they have been refined */
217  if (__kmp_storage_map) {
218  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221  other_threads[i]->th.th_info.ds.ds_stacksize,
222  "th_%d stack (refinement)", i);
223  }
224  return i;
225 }
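/* A worked example of the stack-window test above, assuming a
   downward-growing stack: a registered thread with ds_stackbase == 0x7000
   and ds_stacksize == 0x1000 owns the address range [0x6000, 0x7000].  If
   the caller's local stack_data sits at 0x6f40, then
     stack_diff = 0x7000 - 0x6f40 = 0xc0 <= 0x1000,
   so the caller must be executing on that thread's stack and its index i
   is returned as the gtid. */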
226 
227 int __kmp_get_global_thread_id_reg() {
228  int gtid;
229 
230  if (!__kmp_init_serial) {
231  gtid = KMP_GTID_DNE;
232  } else
233 #ifdef KMP_TDATA_GTID
234  if (TCR_4(__kmp_gtid_mode) >= 3) {
235  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236  gtid = __kmp_gtid;
237  } else
238 #endif
239  if (TCR_4(__kmp_gtid_mode) >= 2) {
240  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241  gtid = __kmp_gtid_get_specific();
242  } else {
243  KA_TRACE(1000,
244  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245  gtid = __kmp_get_global_thread_id();
246  }
247 
248  /* we must be a new uber master sibling thread */
249  if (gtid == KMP_GTID_DNE) {
250  KA_TRACE(10,
251  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252  "Registering a new gtid.\n"));
253  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254  if (!__kmp_init_serial) {
255  __kmp_do_serial_initialize();
256  gtid = __kmp_gtid_get_specific();
257  } else {
258  gtid = __kmp_register_root(FALSE);
259  }
260  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262  }
263 
264  KMP_DEBUG_ASSERT(gtid >= 0);
265 
266  return gtid;
267 }
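/* Lookup order used above: thread-local __kmp_gtid (gtid_mode >= 3), keyed
   TLS via __kmp_gtid_get_specific() (gtid_mode >= 2), then the stack-window
   search in __kmp_get_global_thread_id().  A KMP_GTID_DNE result means the
   caller is an unregistered root, so it is registered (or serial
   initialization is performed first) under __kmp_initz_lock. */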
268 
269 /* caller must hold forkjoin_lock */
270 void __kmp_check_stack_overlap(kmp_info_t *th) {
271  int f;
272  char *stack_beg = NULL;
273  char *stack_end = NULL;
274  int gtid;
275 
276  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277  if (__kmp_storage_map) {
278  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280 
281  gtid = __kmp_gtid_from_thread(th);
282 
283  if (gtid == KMP_GTID_MONITOR) {
284  __kmp_print_storage_map_gtid(
285  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286  "th_%s stack (%s)", "mon",
287  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288  } else {
289  __kmp_print_storage_map_gtid(
290  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291  "th_%d stack (%s)", gtid,
292  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293  }
294  }
295 
296  /* No point in checking ubermaster threads since they use refinement and
297  * cannot overlap */
298  gtid = __kmp_gtid_from_thread(th);
299  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300  KA_TRACE(10,
301  ("__kmp_check_stack_overlap: performing extensive checking\n"));
302  if (stack_beg == NULL) {
303  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305  }
306 
307  for (f = 0; f < __kmp_threads_capacity; f++) {
308  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309 
310  if (f_th && f_th != th) {
311  char *other_stack_end =
312  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313  char *other_stack_beg =
314  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317 
318  /* Print the other stack values before the abort */
319  if (__kmp_storage_map)
320  __kmp_print_storage_map_gtid(
321  -1, other_stack_beg, other_stack_end,
322  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324 
325  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326  __kmp_msg_null);
327  }
328  }
329  }
330  }
331  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332 }
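/* Example of the overlap test above: a stack spanning [0x6000, 0x7000]
   checked against another thread's [0x6800, 0x7800] trips the second
   clause, since 0x7000 lies strictly between 0x6800 and 0x7800, and the
   runtime aborts with StackOverlap. */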
333 
334 /* ------------------------------------------------------------------------ */
335 
336 void __kmp_infinite_loop(void) {
337  static int done = FALSE;
338 
339  while (!done) {
340  KMP_YIELD(TRUE);
341  }
342 }
343 
344 #define MAX_MESSAGE 512
345 
346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347  char const *format, ...) {
348  char buffer[MAX_MESSAGE];
349  va_list ap;
350 
351  va_start(ap, format);
352  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353  p2, (unsigned long)size, format);
354  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355  __kmp_vprintf(kmp_err, buffer, ap);
356 #if KMP_PRINT_DATA_PLACEMENT
357  int node;
358  if (gtid >= 0) {
359  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360  if (__kmp_storage_map_verbose) {
361  node = __kmp_get_host_node(p1);
362  if (node < 0) /* doesn't work, so don't try this next time */
363  __kmp_storage_map_verbose = FALSE;
364  else {
365  char *last;
366  int lastNode;
367  int localProc = __kmp_get_cpu_from_gtid(gtid);
368 
369  const int page_size = KMP_GET_PAGE_SIZE();
370 
371  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373  if (localProc >= 0)
374  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
375  localProc >> 1);
376  else
377  __kmp_printf_no_lock(" GTID %d\n", gtid);
378 #if KMP_USE_PRCTL
379  /* The more elaborate format is disabled for now because of the prctl
380  * hanging bug. */
381  do {
382  last = p1;
383  lastNode = node;
384  /* This loop collates adjacent pages with the same host node. */
385  do {
386  (char *)p1 += page_size;
387  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
389  lastNode);
390  } while (p1 <= p2);
391 #else
392  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
393  (char *)p1 + (page_size - 1),
394  __kmp_get_host_node(p1));
395  if (p1 < p2) {
396  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
397  (char *)p2 + (page_size - 1),
398  __kmp_get_host_node(p2));
399  }
400 #endif
401  }
402  }
403  } else
404  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
405  }
406 #endif /* KMP_PRINT_DATA_PLACEMENT */
407  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408 }
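/* Usage sketch: a call such as
     __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t),
                                  "th_%d", gtid);
   first snprintf's the caller's format string ("th_%d") into the fixed
   "OMP storage map: %p %p%8lu %s" template, then lets __kmp_vprintf apply
   the variadic arguments, producing a line of the form
     OMP storage map: <p1> <p2>  <size> th_<gtid>
   under the stdio bootstrap lock. */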
409 
410 void __kmp_warn(char const *format, ...) {
411  char buffer[MAX_MESSAGE];
412  va_list ap;
413 
414  if (__kmp_generate_warnings == kmp_warnings_off) {
415  return;
416  }
417 
418  va_start(ap, format);
419 
420  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
421  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
422  __kmp_vprintf(kmp_err, buffer, ap);
423  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
424 
425  va_end(ap);
426 }
427 
428 void __kmp_abort_process() {
429  // Later threads may stall here, but that's ok because abort() will kill them.
430  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
431 
432  if (__kmp_debug_buf) {
433  __kmp_dump_debug_buffer();
434  }
435 
436  if (KMP_OS_WINDOWS) {
437  // Let other threads know of abnormal termination and prevent deadlock
438  // if abort happened during library initialization or shutdown
439  __kmp_global.g.g_abort = SIGABRT;
440 
441  /* On Windows* OS by default abort() causes pop-up error box, which stalls
442  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
443  boxes. _set_abort_behavior() works well, but this function is not
444  available in VS7 (this is not problem for DLL, but it is a problem for
445  static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
446  help, at least in some versions of MS C RTL.
447 
448  It seems following sequence is the only way to simulate abort() and
449  avoid pop-up error box. */
450  raise(SIGABRT);
451  _exit(3); // Just in case, if signal ignored, exit anyway.
452  } else {
453  __kmp_unregister_library();
454  abort();
455  }
456 
457  __kmp_infinite_loop();
458  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
459 
460 } // __kmp_abort_process
461 
462 void __kmp_abort_thread(void) {
463  // TODO: Eliminate g_abort global variable and this function.
464  // In case of abort just call abort(), it will kill all the threads.
465  __kmp_infinite_loop();
466 } // __kmp_abort_thread
467 
468 /* Print out the storage map for the major kmp_info_t thread data structures
469  that are allocated together. */
470 
471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
472  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
473  gtid);
474 
475  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
476  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
477 
478  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
479  sizeof(kmp_local_t), "th_%d.th_local", gtid);
480 
481  __kmp_print_storage_map_gtid(
482  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
483  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
484 
485  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
486  &thr->th.th_bar[bs_plain_barrier + 1],
487  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
488  gtid);
489 
490  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
491  &thr->th.th_bar[bs_forkjoin_barrier + 1],
492  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
493  gtid);
494 
495 #if KMP_FAST_REDUCTION_BARRIER
496  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
497  &thr->th.th_bar[bs_reduction_barrier + 1],
498  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
499  gtid);
500 #endif // KMP_FAST_REDUCTION_BARRIER
501 }
502 
503 /* Print out the storage map for the major kmp_team_t team data structures
504  that are allocated together. */
505 
506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
507  int team_id, int num_thr) {
508  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
509  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
510  header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
513  &team->t.t_bar[bs_last_barrier],
514  sizeof(kmp_balign_team_t) * bs_last_barrier,
515  "%s_%d.t_bar", header, team_id);
516 
517  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
518  &team->t.t_bar[bs_plain_barrier + 1],
519  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
520  header, team_id);
521 
522  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
523  &team->t.t_bar[bs_forkjoin_barrier + 1],
524  sizeof(kmp_balign_team_t),
525  "%s_%d.t_bar[forkjoin]", header, team_id);
526 
527 #if KMP_FAST_REDUCTION_BARRIER
528  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
529  &team->t.t_bar[bs_reduction_barrier + 1],
530  sizeof(kmp_balign_team_t),
531  "%s_%d.t_bar[reduction]", header, team_id);
532 #endif // KMP_FAST_REDUCTION_BARRIER
533 
534  __kmp_print_storage_map_gtid(
535  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
536  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
537 
538  __kmp_print_storage_map_gtid(
539  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
540  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
541 
542  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
543  &team->t.t_disp_buffer[num_disp_buff],
544  sizeof(dispatch_shared_info_t) * num_disp_buff,
545  "%s_%d.t_disp_buffer", header, team_id);
546 }
547 
548 static void __kmp_init_allocator() {
549  __kmp_init_memkind();
550  __kmp_init_target_mem();
551 }
552 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
553 
554 /* ------------------------------------------------------------------------ */
555 
556 #if KMP_DYNAMIC_LIB
557 #if KMP_OS_WINDOWS
558 
559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
560  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
561 
562  switch (fdwReason) {
563 
564  case DLL_PROCESS_ATTACH:
565  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
566 
567  return TRUE;
568 
569  case DLL_PROCESS_DETACH:
570  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
571 
572  // According to Windows* documentation for DllMain entry point:
573  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
574  // lpReserved == NULL when FreeLibrary() is called,
575  // lpReserved != NULL when the process is terminated.
576  // When FreeLibrary() is called, worker threads remain alive. So the
577  // runtime's state is consistent and executing proper shutdown is OK.
578  // When the process is terminated, worker threads have exited or been
579  // forcefully terminated by the OS and only the shutdown thread remains.
580  // This can leave the runtime in an inconsistent state.
581  // Hence, only attempt proper cleanup when FreeLibrary() is called.
582  // Otherwise, rely on OS to reclaim resources.
583  if (lpReserved == NULL)
584  __kmp_internal_end_library(__kmp_gtid_get_specific());
585 
586  return TRUE;
587 
588  case DLL_THREAD_ATTACH:
589  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
590 
591  /* if we want to register new siblings all the time here call
592  * __kmp_get_gtid(); */
593  return TRUE;
594 
595  case DLL_THREAD_DETACH:
596  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
597 
598  __kmp_internal_end_thread(__kmp_gtid_get_specific());
599  return TRUE;
600  }
601 
602  return TRUE;
603 }
604 
605 #endif /* KMP_OS_WINDOWS */
606 #endif /* KMP_DYNAMIC_LIB */
607 
608 /* __kmp_parallel_deo -- Wait until it's our turn. */
609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
610  int gtid = *gtid_ref;
611 #ifdef BUILD_PARALLEL_ORDERED
612  kmp_team_t *team = __kmp_team_from_gtid(gtid);
613 #endif /* BUILD_PARALLEL_ORDERED */
614 
615  if (__kmp_env_consistency_check) {
616  if (__kmp_threads[gtid]->th.th_root->r.r_active)
617 #if KMP_USE_DYNAMIC_LOCK
618  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
619 #else
620  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
621 #endif
622  }
623 #ifdef BUILD_PARALLEL_ORDERED
624  if (!team->t.t_serialized) {
625  KMP_MB();
626  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
627  NULL);
628  KMP_MB();
629  }
630 #endif /* BUILD_PARALLEL_ORDERED */
631 }
632 
633 /* __kmp_parallel_dxo -- Signal the next task. */
634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
635  int gtid = *gtid_ref;
636 #ifdef BUILD_PARALLEL_ORDERED
637  int tid = __kmp_tid_from_gtid(gtid);
638  kmp_team_t *team = __kmp_team_from_gtid(gtid);
639 #endif /* BUILD_PARALLEL_ORDERED */
640 
641  if (__kmp_env_consistency_check) {
642  if (__kmp_threads[gtid]->th.th_root->r.r_active)
643  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
644  }
645 #ifdef BUILD_PARALLEL_ORDERED
646  if (!team->t.t_serialized) {
647  KMP_MB(); /* Flush all pending memory write invalidates. */
648 
649  /* use the tid of the next thread in this team */
650  /* TODO replace with general release procedure */
651  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
652 
653  KMP_MB(); /* Flush all pending memory write invalidates. */
654  }
655 #endif /* BUILD_PARALLEL_ORDERED */
656 }
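/* deo/dxo implement a simple token pass over t_ordered.dt.t_value: each
   thread spins until the value equals its own tid (deo), runs its ordered
   chunk, then hands the token on by storing (tid + 1) % nproc (dxo).  A
   minimal stand-alone model of the same hand-off, ignoring the KMP_MB()
   fences the real code uses:

     volatile int token = 0;                    // thread 0 goes first
     void ordered_enter(int tid) { while (token != tid) { } }
     void ordered_exit(int tid, int nproc) { token = (tid + 1) % nproc; }
*/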
657 
658 /* ------------------------------------------------------------------------ */
659 /* The BARRIER for a SINGLE process section is always explicit */
660 
661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
662  int status;
663  kmp_info_t *th;
664  kmp_team_t *team;
665 
666  if (!TCR_4(__kmp_init_parallel))
667  __kmp_parallel_initialize();
668  __kmp_resume_if_soft_paused();
669 
670  th = __kmp_threads[gtid];
671  team = th->th.th_team;
672  status = 0;
673 
674  th->th.th_ident = id_ref;
675 
676  if (team->t.t_serialized) {
677  status = 1;
678  } else {
679  kmp_int32 old_this = th->th.th_local.this_construct;
680 
681  ++th->th.th_local.this_construct;
682  /* try to set team count to thread count--success means thread got the
683  single block */
684  /* TODO: Should this be acquire or release? */
685  if (team->t.t_construct == old_this) {
686  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
687  th->th.th_local.this_construct);
688  }
689 #if USE_ITT_BUILD
690  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
691  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
692  team->t.t_active_level == 1) {
693  // Only report metadata by primary thread of active team at level 1
694  __kmp_itt_metadata_single(id_ref);
695  }
696 #endif /* USE_ITT_BUILD */
697  }
698 
699  if (__kmp_env_consistency_check) {
700  if (status && push_ws) {
701  __kmp_push_workshare(gtid, ct_psingle, id_ref);
702  } else {
703  __kmp_check_workshare(gtid, ct_psingle, id_ref);
704  }
705  }
706 #if USE_ITT_BUILD
707  if (status) {
708  __kmp_itt_single_start(gtid);
709  }
710 #endif /* USE_ITT_BUILD */
711  return status;
712 }
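/* Election example for the single construct above, with a 4-thread team
   whose t_construct is currently 7: every thread advances its private
   this_construct from 7 to 8 and attempts
     __kmp_atomic_compare_store_acq(&team->t.t_construct, 7, 8);
   exactly one CAS succeeds, and that thread returns status == 1 and
   executes the single block; the others observe the already-updated
   counter and return 0. */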
713 
714 void __kmp_exit_single(int gtid) {
715 #if USE_ITT_BUILD
716  __kmp_itt_single_end(gtid);
717 #endif /* USE_ITT_BUILD */
718  if (__kmp_env_consistency_check)
719  __kmp_pop_workshare(gtid, ct_psingle, NULL);
720 }
721 
722 /* determine if we can go parallel or must use a serialized parallel region and
723  * how many threads we can use
724  * set_nthreads is the number of threads requested for the team
725  * returns 1 if we should serialize or only use one thread,
726  * otherwise the number of threads to use
727  * The forkjoin lock is held by the caller. */
728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
729  int master_tid, int set_nthreads,
730  int enter_teams) {
731  int capacity;
732  int new_nthreads;
733  KMP_DEBUG_ASSERT(__kmp_init_serial);
734  KMP_DEBUG_ASSERT(root && parent_team);
735  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
736 
737  // If dyn-var is set, dynamically adjust the number of desired threads,
738  // according to the method specified by dynamic_mode.
739  new_nthreads = set_nthreads;
740  if (!get__dynamic_2(parent_team, master_tid)) {
741  ;
742  }
743 #ifdef USE_LOAD_BALANCE
744  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
745  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
746  if (new_nthreads == 1) {
747  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
748  "reservation to 1 thread\n",
749  master_tid));
750  return 1;
751  }
752  if (new_nthreads < set_nthreads) {
753  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
754  "reservation to %d threads\n",
755  master_tid, new_nthreads));
756  }
757  }
758 #endif /* USE_LOAD_BALANCE */
759  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
760  new_nthreads = __kmp_avail_proc - __kmp_nth +
761  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
762  if (new_nthreads <= 1) {
763  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
764  "reservation to 1 thread\n",
765  master_tid));
766  return 1;
767  }
768  if (new_nthreads < set_nthreads) {
769  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
770  "reservation to %d threads\n",
771  master_tid, new_nthreads));
772  } else {
773  new_nthreads = set_nthreads;
774  }
775  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
776  if (set_nthreads > 2) {
777  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
778  new_nthreads = (new_nthreads % set_nthreads) + 1;
779  if (new_nthreads == 1) {
780  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
781  "reservation to 1 thread\n",
782  master_tid));
783  return 1;
784  }
785  if (new_nthreads < set_nthreads) {
786  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
787  "reservation to %d threads\n",
788  master_tid, new_nthreads));
789  }
790  }
791  } else {
792  KMP_ASSERT(0);
793  }
794 
795  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
796  if (__kmp_nth + new_nthreads -
797  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
798  __kmp_max_nth) {
799  int tl_nthreads = __kmp_max_nth - __kmp_nth +
800  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
801  if (tl_nthreads <= 0) {
802  tl_nthreads = 1;
803  }
804 
805  // If dyn-var is false, emit a 1-time warning.
806  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
807  __kmp_reserve_warn = 1;
808  __kmp_msg(kmp_ms_warning,
809  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
810  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
811  }
812  if (tl_nthreads == 1) {
813  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
814  "reduced reservation to 1 thread\n",
815  master_tid));
816  return 1;
817  }
818  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
819  "reservation to %d threads\n",
820  master_tid, tl_nthreads));
821  new_nthreads = tl_nthreads;
822  }
823 
824  // Respect OMP_THREAD_LIMIT
825  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
826  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
827  if (cg_nthreads + new_nthreads -
828  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
829  max_cg_threads) {
830  int tl_nthreads = max_cg_threads - cg_nthreads +
831  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
832  if (tl_nthreads <= 0) {
833  tl_nthreads = 1;
834  }
835 
836  // If dyn-var is false, emit a 1-time warning.
837  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
838  __kmp_reserve_warn = 1;
839  __kmp_msg(kmp_ms_warning,
840  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
841  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
842  }
843  if (tl_nthreads == 1) {
844  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
845  "reduced reservation to 1 thread\n",
846  master_tid));
847  return 1;
848  }
849  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
850  "reservation to %d threads\n",
851  master_tid, tl_nthreads));
852  new_nthreads = tl_nthreads;
853  }
854 
855  // Check if the threads array is large enough, or needs expanding.
856  // See comment in __kmp_register_root() about the adjustment if
857  // __kmp_threads[0] == NULL.
858  capacity = __kmp_threads_capacity;
859  if (TCR_PTR(__kmp_threads[0]) == NULL) {
860  --capacity;
861  }
862  // If it is not for initializing the hidden helper team, we need to take
863  // __kmp_hidden_helper_threads_num out of the capacity because it is included
864  // in __kmp_threads_capacity.
865  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
866  capacity -= __kmp_hidden_helper_threads_num;
867  }
868  if (__kmp_nth + new_nthreads -
869  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870  capacity) {
871  // Expand the threads array.
872  int slotsRequired = __kmp_nth + new_nthreads -
873  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
874  capacity;
875  int slotsAdded = __kmp_expand_threads(slotsRequired);
876  if (slotsAdded < slotsRequired) {
877  // The threads array was not expanded enough.
878  new_nthreads -= (slotsRequired - slotsAdded);
879  KMP_ASSERT(new_nthreads >= 1);
880 
881  // If dyn-var is false, emit a 1-time warning.
882  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
883  __kmp_reserve_warn = 1;
884  if (__kmp_tp_cached) {
885  __kmp_msg(kmp_ms_warning,
886  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
887  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
888  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
889  } else {
890  __kmp_msg(kmp_ms_warning,
891  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
892  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
893  }
894  }
895  }
896  }
897 
898 #ifdef KMP_DEBUG
899  if (new_nthreads == 1) {
900  KC_TRACE(10,
901  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
902  "dead roots and rechecking; requested %d threads\n",
903  __kmp_get_gtid(), set_nthreads));
904  } else {
905  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
906  " %d threads\n",
907  __kmp_get_gtid(), new_nthreads, set_nthreads));
908  }
909 #endif // KMP_DEBUG
910  return new_nthreads;
911 }
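/* Clamping order applied above: the dyn-var adjustment (load balance,
   dynamic thread limit, or random), then KMP_ALL_THREADS /
   KMP_DEVICE_THREAD_LIMIT via __kmp_max_nth, then OMP_THREAD_LIMIT via the
   contention group's cg_thread_limit, and finally the capacity of the
   __kmp_threads array, which is expanded if possible. */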
912 
913 /* Allocate threads from the thread pool and assign them to the new team. We are
914  assured that there are enough threads available, because we checked on that
915  earlier, within the forkjoin critical section. */
916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
917  kmp_info_t *master_th, int master_gtid,
918  int fork_teams_workers) {
919  int i;
920  int use_hot_team;
921 
922  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
923  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
924  KMP_MB();
925 
926  /* first, let's setup the primary thread */
927  master_th->th.th_info.ds.ds_tid = 0;
928  master_th->th.th_team = team;
929  master_th->th.th_team_nproc = team->t.t_nproc;
930  master_th->th.th_team_master = master_th;
931  master_th->th.th_team_serialized = FALSE;
932  master_th->th.th_dispatch = &team->t.t_dispatch[0];
933 
934 /* make sure we are not the optimized hot team */
935 #if KMP_NESTED_HOT_TEAMS
936  use_hot_team = 0;
937  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
938  if (hot_teams) { // hot teams array is not allocated if
939  // KMP_HOT_TEAMS_MAX_LEVEL=0
940  int level = team->t.t_active_level - 1; // index in array of hot teams
941  if (master_th->th.th_teams_microtask) { // are we inside the teams?
942  if (master_th->th.th_teams_size.nteams > 1) {
943  ++level; // level was not increased in teams construct for
944  // team_of_masters
945  }
946  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
947  master_th->th.th_teams_level == team->t.t_level) {
948  ++level; // level was not increased in teams construct for
949  // team_of_workers before the parallel
950  } // team->t.t_level will be increased inside parallel
951  }
952  if (level < __kmp_hot_teams_max_level) {
953  if (hot_teams[level].hot_team) {
954  // hot team has already been allocated for given level
955  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
956  use_hot_team = 1; // the team is ready to use
957  } else {
958  use_hot_team = 0; // AC: threads are not allocated yet
959  hot_teams[level].hot_team = team; // remember new hot team
960  hot_teams[level].hot_team_nth = team->t.t_nproc;
961  }
962  } else {
963  use_hot_team = 0;
964  }
965  }
966 #else
967  use_hot_team = team == root->r.r_hot_team;
968 #endif
969  if (!use_hot_team) {
970 
971  /* install the primary thread */
972  team->t.t_threads[0] = master_th;
973  __kmp_initialize_info(master_th, team, 0, master_gtid);
974 
975  /* now, install the worker threads */
976  for (i = 1; i < team->t.t_nproc; i++) {
977 
978  /* fork or reallocate a new thread and install it in team */
979  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
980  team->t.t_threads[i] = thr;
981  KMP_DEBUG_ASSERT(thr);
982  KMP_DEBUG_ASSERT(thr->th.th_team == team);
983  /* align team and thread arrived states */
984  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
985  "T#%d(%d:%d) join =%llu, plain=%llu\n",
986  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
987  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
988  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
989  team->t.t_bar[bs_plain_barrier].b_arrived));
990  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
991  thr->th.th_teams_level = master_th->th.th_teams_level;
992  thr->th.th_teams_size = master_th->th.th_teams_size;
993  { // Initialize threads' barrier data.
994  int b;
995  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
996  for (b = 0; b < bs_last_barrier; ++b) {
997  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
998  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
999 #if USE_DEBUGGER
1000  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1001 #endif
1002  }
1003  }
1004  }
1005 
1006 #if KMP_AFFINITY_SUPPORTED
1007  // Do not partition the places list for teams construct workers who
1008  // haven't actually been forked to do real work yet. This partitioning
1009  // will take place in the parallel region nested within the teams construct.
1010  if (!fork_teams_workers) {
1011  __kmp_partition_places(team);
1012  }
1013 #endif
1014  }
1015 
1016  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1017  for (i = 0; i < team->t.t_nproc; i++) {
1018  kmp_info_t *thr = team->t.t_threads[i];
1019  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1020  thr->th.th_prev_level != team->t.t_level) {
1021  team->t.t_display_affinity = 1;
1022  break;
1023  }
1024  }
1025  }
1026 
1027  KMP_MB();
1028 }
1029 
1030 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1031 // Propagate any changes to the floating point control registers out to the team
1032 // We try to avoid unnecessary writes to the relevant cache line in the team
1033 // structure, so we don't make changes unless they are needed.
1034 inline static void propagateFPControl(kmp_team_t *team) {
1035  if (__kmp_inherit_fp_control) {
1036  kmp_int16 x87_fpu_control_word;
1037  kmp_uint32 mxcsr;
1038 
1039  // Get primary thread's values of FPU control flags (both X87 and vector)
1040  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1041  __kmp_store_mxcsr(&mxcsr);
1042  mxcsr &= KMP_X86_MXCSR_MASK;
1043 
1044  // There is no point looking at t_fp_control_saved here.
1045  // If it is TRUE, we still have to update the values if they are different
1046  // from those we now have. If it is FALSE we didn't save anything yet, but
1047  // our objective is the same. We have to ensure that the values in the team
1048  // are the same as those we have.
1049  // So, this code achieves what we need whether or not t_fp_control_saved is
1050  // true. By checking whether the value needs updating we avoid unnecessary
1051  // writes that would put the cache-line into a written state, causing all
1052  // threads in the team to have to read it again.
1053  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1054  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1055  // Although we don't use this value, other code in the runtime wants to know
1056  // whether it should restore them. So we must ensure it is correct.
1057  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1058  } else {
1059  // Similarly here. Don't write to this cache-line in the team structure
1060  // unless we have to.
1061  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1062  }
1063 }
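/* KMP_CHECK_UPDATE above is the usual compare-before-write idiom for
   keeping a shared cache line clean.  A sketch of the idiom (not the
   actual definition from kmp.h):

     #define CHECK_UPDATE(dst, src)                                         \
       if ((dst) != (src))                                                   \
       (dst) = (src)

   so the store, and the resulting invalidation in every other thread's
   cache, happens only when the team value actually changes. */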
1064 
1065 // Do the opposite, setting the hardware registers to the updated values from
1066 // the team.
1067 inline static void updateHWFPControl(kmp_team_t *team) {
1068  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1069  // Only reset the fp control regs if they have been changed in the team by
1070  // the parallel region that we are exiting.
1071  kmp_int16 x87_fpu_control_word;
1072  kmp_uint32 mxcsr;
1073  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1074  __kmp_store_mxcsr(&mxcsr);
1075  mxcsr &= KMP_X86_MXCSR_MASK;
1076 
1077  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1078  __kmp_clear_x87_fpu_status_word();
1079  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1080  }
1081 
1082  if (team->t.t_mxcsr != mxcsr) {
1083  __kmp_load_mxcsr(&team->t.t_mxcsr);
1084  }
1085  }
1086 }
1087 #else
1088 #define propagateFPControl(x) ((void)0)
1089 #define updateHWFPControl(x) ((void)0)
1090 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1091 
1092 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1093  int realloc); // forward declaration
1094 
1095 /* Run a parallel region that has been serialized, so runs only in a team of the
1096  single primary thread. */
1097 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1098  kmp_info_t *this_thr;
1099  kmp_team_t *serial_team;
1100 
1101  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1102 
1103  /* Skip all this code for autopar serialized loops since it results in
1104  unacceptable overhead */
1105  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1106  return;
1107 
1108  if (!TCR_4(__kmp_init_parallel))
1109  __kmp_parallel_initialize();
1110  __kmp_resume_if_soft_paused();
1111 
1112  this_thr = __kmp_threads[global_tid];
1113  serial_team = this_thr->th.th_serial_team;
1114 
1115  /* utilize the serialized team held by this thread */
1116  KMP_DEBUG_ASSERT(serial_team);
1117  KMP_MB();
1118 
1119  if (__kmp_tasking_mode != tskm_immediate_exec) {
1120  KMP_DEBUG_ASSERT(
1121  this_thr->th.th_task_team ==
1122  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1123  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1124  NULL);
1125  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1126  "team %p, new task_team = NULL\n",
1127  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1128  this_thr->th.th_task_team = NULL;
1129  }
1130 
1131  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1132  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1133  proc_bind = proc_bind_false;
1134  } else if (proc_bind == proc_bind_default) {
1135  // No proc_bind clause was specified, so use the current value
1136  // of proc-bind-var for this parallel region.
1137  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1138  }
1139  // Reset for next parallel region
1140  this_thr->th.th_set_proc_bind = proc_bind_default;
1141 
1142 #if OMPT_SUPPORT
1143  ompt_data_t ompt_parallel_data = ompt_data_none;
1144  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1145  if (ompt_enabled.enabled &&
1146  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1147 
1148  ompt_task_info_t *parent_task_info;
1149  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1150 
1151  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1152  if (ompt_enabled.ompt_callback_parallel_begin) {
1153  int team_size = 1;
1154 
1155  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1156  &(parent_task_info->task_data), &(parent_task_info->frame),
1157  &ompt_parallel_data, team_size,
1158  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1159  }
1160  }
1161 #endif // OMPT_SUPPORT
1162 
1163  if (this_thr->th.th_team != serial_team) {
1164  // Nested level will be an index in the nested nthreads array
1165  int level = this_thr->th.th_team->t.t_level;
1166 
1167  if (serial_team->t.t_serialized) {
1168  /* this serial team was already used
1169  TODO: increase performance by making these locks more specific */
1170  kmp_team_t *new_team;
1171 
1172  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1173 
1174  new_team =
1175  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1176 #if OMPT_SUPPORT
1177  ompt_parallel_data,
1178 #endif
1179  proc_bind, &this_thr->th.th_current_task->td_icvs,
1180  0 USE_NESTED_HOT_ARG(NULL));
1181  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1182  KMP_ASSERT(new_team);
1183 
1184  /* setup new serialized team and install it */
1185  new_team->t.t_threads[0] = this_thr;
1186  new_team->t.t_parent = this_thr->th.th_team;
1187  serial_team = new_team;
1188  this_thr->th.th_serial_team = serial_team;
1189 
1190  KF_TRACE(
1191  10,
1192  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1193  global_tid, serial_team));
1194 
1195  /* TODO the above breaks the requirement that if we run out of resources,
1196  then we can still guarantee that serialized teams are ok, since we may
1197  need to allocate a new one */
1198  } else {
1199  KF_TRACE(
1200  10,
1201  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1202  global_tid, serial_team));
1203  }
1204 
1205  /* we have to initialize this serial team */
1206  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1207  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1208  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1209  serial_team->t.t_ident = loc;
1210  serial_team->t.t_serialized = 1;
1211  serial_team->t.t_nproc = 1;
1212  serial_team->t.t_parent = this_thr->th.th_team;
1213  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1214  this_thr->th.th_team = serial_team;
1215  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1216 
1217  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1218  this_thr->th.th_current_task));
1219  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1220  this_thr->th.th_current_task->td_flags.executing = 0;
1221 
1222  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1223 
1224  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1225  implicit task for each serialized task represented by
1226  team->t.t_serialized? */
1227  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1228  &this_thr->th.th_current_task->td_parent->td_icvs);
1229 
1230  // Thread value exists in the nested nthreads array for the next nested
1231  // level
1232  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1233  this_thr->th.th_current_task->td_icvs.nproc =
1234  __kmp_nested_nth.nth[level + 1];
1235  }
1236 
1237  if (__kmp_nested_proc_bind.used &&
1238  (level + 1 < __kmp_nested_proc_bind.used)) {
1239  this_thr->th.th_current_task->td_icvs.proc_bind =
1240  __kmp_nested_proc_bind.bind_types[level + 1];
1241  }
1242 
1243 #if USE_DEBUGGER
1244  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1245 #endif
1246  this_thr->th.th_info.ds.ds_tid = 0;
1247 
1248  /* set thread cache values */
1249  this_thr->th.th_team_nproc = 1;
1250  this_thr->th.th_team_master = this_thr;
1251  this_thr->th.th_team_serialized = 1;
1252 
1253  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1254  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1255  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1256 
1257  propagateFPControl(serial_team);
1258 
1259  /* check if we need to allocate dispatch buffers stack */
1260  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1261  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1262  serial_team->t.t_dispatch->th_disp_buffer =
1263  (dispatch_private_info_t *)__kmp_allocate(
1264  sizeof(dispatch_private_info_t));
1265  }
1266  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1267 
1268  KMP_MB();
1269 
1270  } else {
1271  /* this serialized team is already being used,
1272  * that's fine, just add another nested level */
1273  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1274  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1275  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1276  ++serial_team->t.t_serialized;
1277  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1278 
1279  // Nested level will be an index in the nested nthreads array
1280  int level = this_thr->th.th_team->t.t_level;
1281  // Thread value exists in the nested nthreads array for the next nested
1282  // level
1283  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1284  this_thr->th.th_current_task->td_icvs.nproc =
1285  __kmp_nested_nth.nth[level + 1];
1286  }
1287  serial_team->t.t_level++;
1288  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1289  "of serial team %p to %d\n",
1290  global_tid, serial_team, serial_team->t.t_level));
1291 
1292  /* allocate/push dispatch buffers stack */
1293  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1294  {
1295  dispatch_private_info_t *disp_buffer =
1296  (dispatch_private_info_t *)__kmp_allocate(
1297  sizeof(dispatch_private_info_t));
1298  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1299  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1300  }
1301  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1302 
1303  KMP_MB();
1304  }
1305  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1306 
1307  // Perform the display affinity functionality for
1308  // serialized parallel regions
1309  if (__kmp_display_affinity) {
1310  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1311  this_thr->th.th_prev_num_threads != 1) {
1312  // NULL means use the affinity-format-var ICV
1313  __kmp_aux_display_affinity(global_tid, NULL);
1314  this_thr->th.th_prev_level = serial_team->t.t_level;
1315  this_thr->th.th_prev_num_threads = 1;
1316  }
1317  }
1318 
1319  if (__kmp_env_consistency_check)
1320  __kmp_push_parallel(global_tid, NULL);
1321 #if OMPT_SUPPORT
1322  serial_team->t.ompt_team_info.master_return_address = codeptr;
1323  if (ompt_enabled.enabled &&
1324  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1325  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1326  OMPT_GET_FRAME_ADDRESS(0);
1327 
1328  ompt_lw_taskteam_t lw_taskteam;
1329  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1330  &ompt_parallel_data, codeptr);
1331 
1332  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1333  // don't use lw_taskteam after linking; its content was swapped
1334 
1335  /* OMPT implicit task begin */
1336  if (ompt_enabled.ompt_callback_implicit_task) {
1337  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1338  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1339  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1340  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1341  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1342  __kmp_tid_from_gtid(global_tid);
1343  }
1344 
1345  /* OMPT state */
1346  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1347  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1348  OMPT_GET_FRAME_ADDRESS(0);
1349  }
1350 #endif
1351 }
1352 
1353 /* most of the work for a fork */
1354 /* return true if we really went parallel, false if serialized */
1355 int __kmp_fork_call(ident_t *loc, int gtid,
1356  enum fork_context_e call_context, // Intel, GNU, ...
1357  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1358  kmp_va_list ap) {
1359  void **argv;
1360  int i;
1361  int master_tid;
1362  int master_this_cons;
1363  kmp_team_t *team;
1364  kmp_team_t *parent_team;
1365  kmp_info_t *master_th;
1366  kmp_root_t *root;
1367  int nthreads;
1368  int master_active;
1369  int master_set_numthreads;
1370  int level;
1371  int active_level;
1372  int teams_level;
1373 #if KMP_NESTED_HOT_TEAMS
1374  kmp_hot_team_ptr_t **p_hot_teams;
1375 #endif
1376  { // KMP_TIME_BLOCK
1377  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1378  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1379 
1380  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1381  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1382  /* Some systems prefer the stack for the root thread(s) to start with */
1383  /* some gap from the parent stack to prevent false sharing. */
1384  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1385  /* These 2 lines below are so this does not get optimized out */
1386  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1387  __kmp_stkpadding += (short)((kmp_int64)dummy);
1388  }
1389 
1390  /* initialize if needed */
1391  KMP_DEBUG_ASSERT(
1392  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1393  if (!TCR_4(__kmp_init_parallel))
1394  __kmp_parallel_initialize();
1395  __kmp_resume_if_soft_paused();
1396 
1397  /* setup current data */
1398  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1399  // shutdown
1400  parent_team = master_th->th.th_team;
1401  master_tid = master_th->th.th_info.ds.ds_tid;
1402  master_this_cons = master_th->th.th_local.this_construct;
1403  root = master_th->th.th_root;
1404  master_active = root->r.r_active;
1405  master_set_numthreads = master_th->th.th_set_nproc;
1406 
1407 #if OMPT_SUPPORT
1408  ompt_data_t ompt_parallel_data = ompt_data_none;
1409  ompt_data_t *parent_task_data;
1410  ompt_frame_t *ompt_frame;
1411  ompt_data_t *implicit_task_data;
1412  void *return_address = NULL;
1413 
1414  if (ompt_enabled.enabled) {
1415  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1416  NULL, NULL);
1417  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1418  }
1419 #endif
1420 
1421  // Assign affinity to root thread if it hasn't happened yet
1422  __kmp_assign_root_init_mask();
1423 
1424  // Nested level will be an index in the nested nthreads array
1425  level = parent_team->t.t_level;
1426  // used to launch non-serial teams even if nested is not allowed
1427  active_level = parent_team->t.t_active_level;
1428  // needed to check nesting inside the teams
1429  teams_level = master_th->th.th_teams_level;
1430 #if KMP_NESTED_HOT_TEAMS
1431  p_hot_teams = &master_th->th.th_hot_teams;
1432  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1433  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1434  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1435  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1436  // it is either actual or not needed (when active_level > 0)
1437  (*p_hot_teams)[0].hot_team_nth = 1;
1438  }
1439 #endif
1440 
1441 #if OMPT_SUPPORT
1442  if (ompt_enabled.enabled) {
1443  if (ompt_enabled.ompt_callback_parallel_begin) {
1444  int team_size = master_set_numthreads
1445  ? master_set_numthreads
1446  : get__nproc_2(parent_team, master_tid);
1447  int flags = OMPT_INVOKER(call_context) |
1448  ((microtask == (microtask_t)__kmp_teams_master)
1449  ? ompt_parallel_league
1450  : ompt_parallel_team);
1451  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1452  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1453  return_address);
1454  }
1455  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1456  }
1457 #endif
1458 
1459  master_th->th.th_ident = loc;
1460 
1461  if (master_th->th.th_teams_microtask && ap &&
1462  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1463  // AC: This is start of parallel that is nested inside teams construct.
1464  // The team is actual (hot), all workers are ready at the fork barrier.
1465  // No lock needed to initialize the team a bit, then free workers.
1466  parent_team->t.t_ident = loc;
1467  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1468  parent_team->t.t_argc = argc;
1469  argv = (void **)parent_team->t.t_argv;
1470  for (i = argc - 1; i >= 0; --i)
1471  *argv++ = va_arg(kmp_va_deref(ap), void *);
1472  // Increment our nested depth level, but do not increase the serialization
1473  if (parent_team == master_th->th.th_serial_team) {
1474  // AC: we are in serialized parallel
1475  __kmpc_serialized_parallel(loc, gtid);
1476  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1477 
1478  if (call_context == fork_context_gnu) {
1479  // AC: need to decrement t_serialized for enquiry functions to work
1480  // correctly, will restore at join time
1481  parent_team->t.t_serialized--;
1482  return TRUE;
1483  }
1484 
1485 #if OMPD_SUPPORT
1486  parent_team->t.t_pkfn = microtask;
1487 #endif
1488 
1489 #if OMPT_SUPPORT
1490  void *dummy;
1491  void **exit_frame_p;
1492 
1493  ompt_lw_taskteam_t lw_taskteam;
1494 
1495  if (ompt_enabled.enabled) {
1496  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1497  &ompt_parallel_data, return_address);
1498  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1499 
1500  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1501  // don't use lw_taskteam after linking; its content was swapped
1502 
1503  /* OMPT implicit task begin */
1504  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1505  if (ompt_enabled.ompt_callback_implicit_task) {
1506  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1507  __kmp_tid_from_gtid(gtid);
1508  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1509  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1510  implicit_task_data, 1,
1511  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1512  }
1513 
1514  /* OMPT state */
1515  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1516  } else {
1517  exit_frame_p = &dummy;
1518  }
1519 #endif
1520  // AC: need to decrement t_serialized for enquiry functions to work
1521  // correctly, will restore at join time
1522  parent_team->t.t_serialized--;
1523 
1524  {
1525  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1526  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1527  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1528 #if OMPT_SUPPORT
1529  ,
1530  exit_frame_p
1531 #endif
1532  );
1533  }
1534 
1535 #if OMPT_SUPPORT
1536  if (ompt_enabled.enabled) {
1537  *exit_frame_p = NULL;
1538  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1539  if (ompt_enabled.ompt_callback_implicit_task) {
1540  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1541  ompt_scope_end, NULL, implicit_task_data, 1,
1542  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1543  }
1544  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1545  __ompt_lw_taskteam_unlink(master_th);
1546  if (ompt_enabled.ompt_callback_parallel_end) {
1547  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1548  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1549  OMPT_INVOKER(call_context) | ompt_parallel_team,
1550  return_address);
1551  }
1552  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1553  }
1554 #endif
1555  return TRUE;
1556  }
1557 
1558  parent_team->t.t_pkfn = microtask;
1559  parent_team->t.t_invoke = invoker;
1560  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1561  parent_team->t.t_active_level++;
1562  parent_team->t.t_level++;
1563  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1564 
1565 #if OMPT_SUPPORT
1566  if (ompt_enabled.enabled) {
1567  ompt_lw_taskteam_t lw_taskteam;
1568  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1569  &ompt_parallel_data, return_address);
1570  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1571  }
1572 #endif
1573 
1574  /* Change number of threads in the team if requested */
1575  if (master_set_numthreads) { // The parallel has num_threads clause
1576  if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1577  // AC: the number of threads can only be reduced dynamically, not increased
1578  kmp_info_t **other_threads = parent_team->t.t_threads;
1579  // NOTE: if using distributed barrier, we need to run this code block
1580  // even when the team size appears not to have changed from the max.
1581  int old_proc = master_th->th.th_teams_size.nth;
1582  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
1583  bp_dist_bar) {
1584  __kmp_resize_dist_barrier(parent_team, old_proc,
1585  master_set_numthreads);
1586  __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1587  }
1588  parent_team->t.t_nproc = master_set_numthreads;
1589  for (i = 0; i < master_set_numthreads; ++i) {
1590  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1591  }
1592  }
1593  // Keep extra threads hot in the team for possible next parallels
1594  master_th->th.th_set_nproc = 0;
1595  }
1596 
1597 #if USE_DEBUGGER
1598  if (__kmp_debugging) { // Let debugger override number of threads.
1599  int nth = __kmp_omp_num_threads(loc);
1600  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1601  master_set_numthreads = nth;
1602  }
1603  }
1604 #endif
1605 
1606  // Figure out the proc_bind policy for the nested parallel within teams
1607  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1608  // proc_bind_default means don't update
1609  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1610  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1611  proc_bind = proc_bind_false;
1612  } else {
1613  // No proc_bind clause specified; use current proc-bind-var
1614  if (proc_bind == proc_bind_default) {
1615  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1616  }
1617  /* else: The proc_bind policy was specified explicitly on parallel
1618  clause.
1619  This overrides proc-bind-var for this parallel region, but does not
1620  change proc-bind-var. */
1621  // Figure the value of proc-bind-var for the child threads.
1622  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1623  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1624  master_th->th.th_current_task->td_icvs.proc_bind)) {
1625  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1626  }
1627  }
1628  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1629  // Need to change the bind-var ICV to correct value for each implicit task
1630  if (proc_bind_icv != proc_bind_default &&
1631  master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1632  kmp_info_t **other_threads = parent_team->t.t_threads;
1633  for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1634  other_threads[i]->th.th_current_task->td_icvs.proc_bind =
1635  proc_bind_icv;
1636  }
1637  }
1638  // Reset for next parallel region
1639  master_th->th.th_set_proc_bind = proc_bind_default;
1640 
1641 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1642  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1643  KMP_ITT_DEBUG) &&
1644  __kmp_forkjoin_frames_mode == 3 &&
1645  parent_team->t.t_active_level == 1 // only report frames at level 1
1646  && master_th->th.th_teams_size.nteams == 1) {
1647  kmp_uint64 tmp_time = __itt_get_timestamp();
1648  master_th->th.th_frame_time = tmp_time;
1649  parent_team->t.t_region_time = tmp_time;
1650  }
1651  if (__itt_stack_caller_create_ptr) {
1652  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1653  // create new stack stitching id before entering fork barrier
1654  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1655  }
1656 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1657 #if KMP_AFFINITY_SUPPORTED
1658  __kmp_partition_places(parent_team);
1659 #endif
1660 
1661  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1662  "master_th=%p, gtid=%d\n",
1663  root, parent_team, master_th, gtid));
1664  __kmp_internal_fork(loc, gtid, parent_team);
1665  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1666  "master_th=%p, gtid=%d\n",
1667  root, parent_team, master_th, gtid));
1668 
1669  if (call_context == fork_context_gnu)
1670  return TRUE;
1671 
1672  /* Invoke microtask for PRIMARY thread */
1673  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1674  parent_team->t.t_id, parent_team->t.t_pkfn));
1675 
1676  if (!parent_team->t.t_invoke(gtid)) {
1677  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1678  }
1679  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1680  parent_team->t.t_id, parent_team->t.t_pkfn));
1681  KMP_MB(); /* Flush all pending memory write invalidates. */
1682 
1683  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1684 
1685  return TRUE;
1686  } // Parallel closely nested in teams construct
1687 
1688 #if KMP_DEBUG
1689  if (__kmp_tasking_mode != tskm_immediate_exec) {
1690  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1691  parent_team->t.t_task_team[master_th->th.th_task_state]);
1692  }
1693 #endif
1694 
1695  // Need this to happen before we determine the number of threads, not while
1696  // we are allocating the team
1697  //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1698  int enter_teams = 0;
1699  if (parent_team->t.t_active_level >=
1700  master_th->th.th_current_task->td_icvs.max_active_levels) {
1701  nthreads = 1;
1702  } else {
1703  enter_teams = ((ap == NULL && active_level == 0) ||
1704  (ap && teams_level > 0 && teams_level == level));
1705  nthreads = master_set_numthreads
1706  ? master_set_numthreads
1707  // TODO: get nproc directly from current task
1708  : get__nproc_2(parent_team, master_tid);
1709  // Check whether we need to take the forkjoin lock (no need for a serialized
1710  // parallel outside of a teams construct). This code was moved here from
1711  // __kmp_reserve_threads() to speed up nested serialized parallels.
1712  if (nthreads > 1) {
1713  if ((get__max_active_levels(master_th) == 1 &&
1714  (root->r.r_in_parallel && !enter_teams)) ||
1715  (__kmp_library == library_serial)) {
1716  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1717  " threads\n",
1718  gtid, nthreads));
1719  nthreads = 1;
1720  }
1721  }
1722  if (nthreads > 1) {
1723  /* determine how many new threads we can use */
1724  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1725  /* AC: If we execute teams from parallel region (on host), then teams
1726  should be created but each can only have 1 thread if nesting is
1727  disabled. If teams called from serial region, then teams and their
1728  threads should be created regardless of the nesting setting. */
1729  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1730  nthreads, enter_teams);
1731  if (nthreads == 1) {
1732  // Free lock for single thread execution here; for multi-thread
1733  // execution it will be freed later after team of threads created
1734  // and initialized
1735  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1736  }
1737  }
1738  }
1739  KMP_DEBUG_ASSERT(nthreads > 0);
1740 
1741  // If we temporarily changed the set number of threads then restore it now
1742  master_th->th.th_set_nproc = 0;
1743 
1744  /* create a serialized parallel region? */
1745  if (nthreads == 1) {
1746 /* josh todo: hypothetical question: what do we do for OS X*? */
1747 #if KMP_OS_LINUX && \
1748  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1749  void *args[argc];
1750 #else
1751  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1752 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1753  KMP_ARCH_AARCH64) */
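1753  // Note: on the Linux/x86/Arm targets above the argument buffer is a
1753  // C99-style variable-length array; other targets obtain equivalent stack
1753  // storage via KMP_ALLOCA.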
1754 
1755  KA_TRACE(20,
1756  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1757 
1758  __kmpc_serialized_parallel(loc, gtid);
1759 
1760 #if OMPD_SUPPORT
1761  master_th->th.th_serial_team->t.t_pkfn = microtask;
1762 #endif
1763 
1764  if (call_context == fork_context_intel) {
1765  /* TODO this sucks, use the compiler itself to pass args! :) */
1766  master_th->th.th_serial_team->t.t_ident = loc;
1767  if (!ap) {
1768  // revert change made in __kmpc_serialized_parallel()
1769  master_th->th.th_serial_team->t.t_level--;
1770  // Get args from parent team for teams construct
1771 
1772 #if OMPT_SUPPORT
1773  void *dummy;
1774  void **exit_frame_p;
1775  ompt_task_info_t *task_info;
1776 
1777  ompt_lw_taskteam_t lw_taskteam;
1778 
1779  if (ompt_enabled.enabled) {
1780  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1781  &ompt_parallel_data, return_address);
1782 
1783  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1784  // don't use lw_taskteam after linking; its content was swapped
1785 
1786  task_info = OMPT_CUR_TASK_INFO(master_th);
1787  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1788  if (ompt_enabled.ompt_callback_implicit_task) {
1789  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1790  __kmp_tid_from_gtid(gtid);
1791  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1792  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1793  &(task_info->task_data), 1,
1794  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1795  ompt_task_implicit);
1796  }
1797 
1798  /* OMPT state */
1799  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1800  } else {
1801  exit_frame_p = &dummy;
1802  }
1803 #endif
1804 
1805  {
1806  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1807  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1808  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1809  parent_team->t.t_argv
1810 #if OMPT_SUPPORT
1811  ,
1812  exit_frame_p
1813 #endif
1814  );
1815  }
1816 
1817 #if OMPT_SUPPORT
1818  if (ompt_enabled.enabled) {
1819  *exit_frame_p = NULL;
1820  if (ompt_enabled.ompt_callback_implicit_task) {
1821  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1822  ompt_scope_end, NULL, &(task_info->task_data), 1,
1823  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1824  ompt_task_implicit);
1825  }
1826  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1827  __ompt_lw_taskteam_unlink(master_th);
1828  if (ompt_enabled.ompt_callback_parallel_end) {
1829  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1830  &ompt_parallel_data, parent_task_data,
1831  OMPT_INVOKER(call_context) | ompt_parallel_team,
1832  return_address);
1833  }
1834  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1835  }
1836 #endif
1837  } else if (microtask == (microtask_t)__kmp_teams_master) {
1838  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1839  master_th->th.th_serial_team);
1840  team = master_th->th.th_team;
1841  // team->t.t_pkfn = microtask;
1842  team->t.t_invoke = invoker;
1843  __kmp_alloc_argv_entries(argc, team, TRUE);
1844  team->t.t_argc = argc;
1845  argv = (void **)team->t.t_argv;
1846  if (ap) {
1847  for (i = argc - 1; i >= 0; --i)
1848  *argv++ = va_arg(kmp_va_deref(ap), void *);
1849  } else {
1850  for (i = 0; i < argc; ++i)
1851  // Get args from parent team for teams construct
1852  argv[i] = parent_team->t.t_argv[i];
1853  }
1854  // AC: revert change made in __kmpc_serialized_parallel()
1855  // because initial code in teams should have level=0
1856  team->t.t_level--;
1857  // AC: call special invoker for outer "parallel" of teams construct
1858  invoker(gtid);
1859 #if OMPT_SUPPORT
1860  if (ompt_enabled.enabled) {
1861  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1862  if (ompt_enabled.ompt_callback_implicit_task) {
1863  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1864  ompt_scope_end, NULL, &(task_info->task_data), 0,
1865  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1866  }
1867  if (ompt_enabled.ompt_callback_parallel_end) {
1868  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1869  &ompt_parallel_data, parent_task_data,
1870  OMPT_INVOKER(call_context) | ompt_parallel_league,
1871  return_address);
1872  }
1873  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1874  }
1875 #endif
1876  } else {
1877  argv = args;
1878  for (i = argc - 1; i >= 0; --i)
1879  *argv++ = va_arg(kmp_va_deref(ap), void *);
1880  KMP_MB();
1881 
1882 #if OMPT_SUPPORT
1883  void *dummy;
1884  void **exit_frame_p;
1885  ompt_task_info_t *task_info;
1886 
1887  ompt_lw_taskteam_t lw_taskteam;
1888 
1889  if (ompt_enabled.enabled) {
1890  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1891  &ompt_parallel_data, return_address);
1892  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1893  // don't use lw_taskteam after linking; its content was swapped
1894  task_info = OMPT_CUR_TASK_INFO(master_th);
1895  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1896 
1897  /* OMPT implicit task begin */
1898  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1899  if (ompt_enabled.ompt_callback_implicit_task) {
1900  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1901  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1902  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1903  ompt_task_implicit);
1904  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1905  __kmp_tid_from_gtid(gtid);
1906  }
1907 
1908  /* OMPT state */
1909  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1910  } else {
1911  exit_frame_p = &dummy;
1912  }
1913 #endif
1914 
1915  {
1916  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1917  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1918  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1919 #if OMPT_SUPPORT
1920  ,
1921  exit_frame_p
1922 #endif
1923  );
1924  }
1925 
1926 #if OMPT_SUPPORT
1927  if (ompt_enabled.enabled) {
1928  *exit_frame_p = NULL;
1929  if (ompt_enabled.ompt_callback_implicit_task) {
1930  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1931  ompt_scope_end, NULL, &(task_info->task_data), 1,
1932  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1933  ompt_task_implicit);
1934  }
1935 
1936  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1937  __ompt_lw_taskteam_unlink(master_th);
1938  if (ompt_enabled.ompt_callback_parallel_end) {
1939  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1940  &ompt_parallel_data, parent_task_data,
1941  OMPT_INVOKER(call_context) | ompt_parallel_team,
1942  return_address);
1943  }
1944  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1945  }
1946 #endif
1947  }
1948  } else if (call_context == fork_context_gnu) {
1949 #if OMPT_SUPPORT
1950  ompt_lw_taskteam_t lwt;
1951  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1952  return_address);
1953 
1954  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1955  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1956 // don't use lw_taskteam after linking; its content was swapped
1957 #endif
1958 
1959  // we were called from GNU native code
1960  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1961  return FALSE;
1962  } else {
1963  KMP_ASSERT2(call_context < fork_context_last,
1964  "__kmp_fork_call: unknown fork_context parameter");
1965  }
1966 
1967  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1968  KMP_MB();
1969  return FALSE;
1970  } // if (nthreads == 1)
1971 
1972  // GEH: only modify the executing flag in the case when not serialized;
1973  // the serialized case is handled in __kmpc_serialized_parallel
1974  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1975  "curtask=%p, curtask_max_aclevel=%d\n",
1976  parent_team->t.t_active_level, master_th,
1977  master_th->th.th_current_task,
1978  master_th->th.th_current_task->td_icvs.max_active_levels));
1979  // TODO: GEH - cannot do this assertion because root thread not set up as
1980  // executing
1981  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1982  master_th->th.th_current_task->td_flags.executing = 0;
1983 
1984  if (!master_th->th.th_teams_microtask || level > teams_level) {
1985  /* Increment our nested depth level */
1986  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1987  }
1988 
1989  // See if we need to make a copy of the ICVs.
1990  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1991  if ((level + 1 < __kmp_nested_nth.used) &&
1992  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1993  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1994  } else {
1995  nthreads_icv = 0; // don't update
1996  }
1997 
1998  // Figure out the proc_bind_policy for the new team.
1999  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2000  // proc_bind_default means don't update
2001  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2002  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2003  proc_bind = proc_bind_false;
2004  } else {
2005  // No proc_bind clause specified; use current proc-bind-var for this
2006  // parallel region
2007  if (proc_bind == proc_bind_default) {
2008  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2009  }
2010  // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2011  if (master_th->th.th_teams_microtask &&
2012  microtask == (microtask_t)__kmp_teams_master) {
2013  proc_bind = __kmp_teams_proc_bind;
2014  }
2015  /* else: The proc_bind policy was specified explicitly on parallel clause.
2016  This overrides proc-bind-var for this parallel region, but does not
2017  change proc-bind-var. */
2018  // Figure the value of proc-bind-var for the child threads.
2019  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2020  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2021  master_th->th.th_current_task->td_icvs.proc_bind)) {
2022  // Do not modify the proc_bind ICV for the two teams construct forks;
2023  // they just let the proc_bind ICV pass through
2024  if (!master_th->th.th_teams_microtask ||
2025  !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2026  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2027  }
2028  }
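2028  // In summary: proc_bind_false in the current ICVs disables binding outright;
2028  // otherwise the clause value (or, failing that, the current proc-bind-var)
2028  // is used, the teams-master fork takes __kmp_teams_proc_bind, and a
2028  // differing deeper entry in __kmp_nested_proc_bind becomes the children's
2028  // proc-bind-var via proc_bind_icv.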
2029 
2030  // Reset for next parallel region
2031  master_th->th.th_set_proc_bind = proc_bind_default;
2032 
2033  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2034  kmp_internal_control_t new_icvs;
2035  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2036  new_icvs.next = NULL;
2037  if (nthreads_icv > 0) {
2038  new_icvs.nproc = nthreads_icv;
2039  }
2040  if (proc_bind_icv != proc_bind_default) {
2041  new_icvs.proc_bind = proc_bind_icv;
2042  }
2043 
2044  /* allocate a new parallel team */
2045  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2046  team = __kmp_allocate_team(root, nthreads, nthreads,
2047 #if OMPT_SUPPORT
2048  ompt_parallel_data,
2049 #endif
2050  proc_bind, &new_icvs,
2051  argc USE_NESTED_HOT_ARG(master_th));
2052  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2053  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2054  } else {
2055  /* allocate a new parallel team */
2056  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2057  team = __kmp_allocate_team(root, nthreads, nthreads,
2058 #if OMPT_SUPPORT
2059  ompt_parallel_data,
2060 #endif
2061  proc_bind,
2062  &master_th->th.th_current_task->td_icvs,
2063  argc USE_NESTED_HOT_ARG(master_th));
2064  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2065  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2066  &master_th->th.th_current_task->td_icvs);
2067  }
2068  KF_TRACE(
2069  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2070 
2071  /* setup the new team */
2072  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2073  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2074  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2075  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2076  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2077 #if OMPT_SUPPORT
2078  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2079  return_address);
2080 #endif
2081  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2082  // TODO: parent_team->t.t_level == INT_MAX ???
2083  if (!master_th->th.th_teams_microtask || level > teams_level) {
2084  int new_level = parent_team->t.t_level + 1;
2085  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2086  new_level = parent_team->t.t_active_level + 1;
2087  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2088  } else {
2089  // AC: Do not increase parallel level at start of the teams construct
2090  int new_level = parent_team->t.t_level;
2091  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2092  new_level = parent_team->t.t_active_level;
2093  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2094  }
2095  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2096  // set primary thread's schedule as new run-time schedule
2097  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2098 
2099  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2100  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2101 
2102  // Update the floating point rounding in the team if required.
2103  propagateFPControl(team);
2104 #if OMPD_SUPPORT
2105  if (ompd_state & OMPD_ENABLE_BP)
2106  ompd_bp_parallel_begin();
2107 #endif
2108 
2109  if (__kmp_tasking_mode != tskm_immediate_exec) {
2110  // Set the primary thread's task team to the team's task team. Unless this
2111  // is a hot team, it should be NULL.
2112  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2113  parent_team->t.t_task_team[master_th->th.th_task_state]);
2114  KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2115  "%p, new task_team %p / team %p\n",
2116  __kmp_gtid_from_thread(master_th),
2117  master_th->th.th_task_team, parent_team,
2118  team->t.t_task_team[master_th->th.th_task_state], team));
2119 
2120  if (active_level || master_th->th.th_task_team) {
2121  // Remember the primary thread's task_state
2122  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2123  if (master_th->th.th_task_state_top >=
2124  master_th->th.th_task_state_stack_sz) { // increase size
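2124  // Grow the memo stack geometrically: allocate a buffer of twice the
2124  // current size, copy the saved task_state bytes, zero-fill the
2124  // remainder, then publish the new stack and free the old one.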
2125  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2126  kmp_uint8 *old_stack, *new_stack;
2127  kmp_uint32 i;
2128  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2129  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2130  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2131  }
2132  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2133  ++i) { // zero-init rest of stack
2134  new_stack[i] = 0;
2135  }
2136  old_stack = master_th->th.th_task_state_memo_stack;
2137  master_th->th.th_task_state_memo_stack = new_stack;
2138  master_th->th.th_task_state_stack_sz = new_size;
2139  __kmp_free(old_stack);
2140  }
2141  // Store primary thread's task_state on stack
2142  master_th->th
2143  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2144  master_th->th.th_task_state;
2145  master_th->th.th_task_state_top++;
2146 #if KMP_NESTED_HOT_TEAMS
2147  if (master_th->th.th_hot_teams &&
2148  active_level < __kmp_hot_teams_max_level &&
2149  team == master_th->th.th_hot_teams[active_level].hot_team) {
2150  // Restore primary thread's nested state if nested hot team
2151  master_th->th.th_task_state =
2152  master_th->th
2153  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2154  } else {
2155 #endif
2156  master_th->th.th_task_state = 0;
2157 #if KMP_NESTED_HOT_TEAMS
2158  }
2159 #endif
2160  }
2161 #if !KMP_NESTED_HOT_TEAMS
2162  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2163  (team == root->r.r_hot_team));
2164 #endif
2165  }
2166 
2167  KA_TRACE(
2168  20,
2169  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2170  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2171  team->t.t_nproc));
2172  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2173  (team->t.t_master_tid == 0 &&
2174  (team->t.t_parent == root->r.r_root_team ||
2175  team->t.t_parent->t.t_serialized)));
2176  KMP_MB();
2177 
2178  /* now, setup the arguments */
2179  argv = (void **)team->t.t_argv;
2180  if (ap) {
2181  for (i = argc - 1; i >= 0; --i) {
2182  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2183  KMP_CHECK_UPDATE(*argv, new_argv);
2184  argv++;
2185  }
2186  } else {
2187  for (i = 0; i < argc; ++i) {
2188  // Get args from parent team for teams construct
2189  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2190  }
2191  }
2192 
2193  /* now actually fork the threads */
2194  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2195  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2196  root->r.r_active = TRUE;
2197 
2198  __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2199  __kmp_setup_icv_copy(team, nthreads,
2200  &master_th->th.th_current_task->td_icvs, loc);
2201 
2202 #if OMPT_SUPPORT
2203  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2204 #endif
2205 
2206  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2207 
2208 #if USE_ITT_BUILD
2209  if (team->t.t_active_level == 1 // only report frames at level 1
2210  && !master_th->th.th_teams_microtask) { // not in teams construct
2211 #if USE_ITT_NOTIFY
2212  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2213  (__kmp_forkjoin_frames_mode == 3 ||
2214  __kmp_forkjoin_frames_mode == 1)) {
2215  kmp_uint64 tmp_time = 0;
2216  if (__itt_get_timestamp_ptr)
2217  tmp_time = __itt_get_timestamp();
2218  // Internal fork - report frame begin
2219  master_th->th.th_frame_time = tmp_time;
2220  if (__kmp_forkjoin_frames_mode == 3)
2221  team->t.t_region_time = tmp_time;
2222  } else
2223 // only one notification scheme (either "submit" or "forking/joined", not both)
2224 #endif /* USE_ITT_NOTIFY */
2225  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2226  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2227  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2228  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2229  }
2230  }
2231 #endif /* USE_ITT_BUILD */
2232 
2233  /* now go on and do the work */
2234  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2235  KMP_MB();
2236  KF_TRACE(10,
2237  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2238  root, team, master_th, gtid));
2239 
2240 #if USE_ITT_BUILD
2241  if (__itt_stack_caller_create_ptr) {
2242  // create new stack stitching id before entering fork barrier
2243  if (!enter_teams) {
2244  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2245  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2246  } else if (parent_team->t.t_serialized) {
2247  // keep stack stitching id in the serialized parent_team;
2248  // current team will be used for parallel inside the teams;
2249  // if parent_team is active, then it already keeps stack stitching id
2250  // for the league of teams
2251  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2252  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2253  }
2254  }
2255 #endif /* USE_ITT_BUILD */
2256 
2257  // AC: skip __kmp_internal_fork at teams construct, let only primary
2258  // threads execute
2259  if (ap) {
2260  __kmp_internal_fork(loc, gtid, team);
2261  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2262  "master_th=%p, gtid=%d\n",
2263  root, team, master_th, gtid));
2264  }
2265 
2266  if (call_context == fork_context_gnu) {
2267  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2268  return TRUE;
2269  }
2270 
2271  /* Invoke microtask for PRIMARY thread */
2272  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2273  team->t.t_id, team->t.t_pkfn));
2274  } // END of timer KMP_fork_call block
2275 
2276 #if KMP_STATS_ENABLED
2277  // If beginning a teams construct, then change thread state
2278  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2279  if (!ap) {
2280  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2281  }
2282 #endif
2283 
2284  if (!team->t.t_invoke(gtid)) {
2285  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2286  }
2287 
2288 #if KMP_STATS_ENABLED
2289  // If was beginning of a teams construct, then reset thread state
2290  if (!ap) {
2291  KMP_SET_THREAD_STATE(previous_state);
2292  }
2293 #endif
2294 
2295  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2296  team->t.t_id, team->t.t_pkfn));
2297  KMP_MB(); /* Flush all pending memory write invalidates. */
2298 
2299  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2300 #if OMPT_SUPPORT
2301  if (ompt_enabled.enabled) {
2302  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2303  }
2304 #endif
2305 
2306  return TRUE;
2307 }
2308 
2309 #if OMPT_SUPPORT
2310 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2311  kmp_team_t *team) {
2312  // restore state outside the region
2313  thread->th.ompt_thread_info.state =
2314  ((team->t.t_serialized) ? ompt_state_work_serial
2315  : ompt_state_work_parallel);
2316 }
2317 
2318 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2319  kmp_team_t *team, ompt_data_t *parallel_data,
2320  int flags, void *codeptr) {
2321  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2322  if (ompt_enabled.ompt_callback_parallel_end) {
2323  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2324  parallel_data, &(task_info->task_data), flags, codeptr);
2325  }
2326 
2327  task_info->frame.enter_frame = ompt_data_none;
2328  __kmp_join_restore_state(thread, team);
2329 }
2330 #endif
2331 
2332 void __kmp_join_call(ident_t *loc, int gtid
2333 #if OMPT_SUPPORT
2334  ,
2335  enum fork_context_e fork_context
2336 #endif
2337  ,
2338  int exit_teams) {
2339  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2340  kmp_team_t *team;
2341  kmp_team_t *parent_team;
2342  kmp_info_t *master_th;
2343  kmp_root_t *root;
2344  int master_active;
2345 
2346  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2347 
2348  /* setup current data */
2349  master_th = __kmp_threads[gtid];
2350  root = master_th->th.th_root;
2351  team = master_th->th.th_team;
2352  parent_team = team->t.t_parent;
2353 
2354  master_th->th.th_ident = loc;
2355 
2356 #if OMPT_SUPPORT
2357  void *team_microtask = (void *)team->t.t_pkfn;
2358  // For the GOMP interface with a serialized parallel, we need
2359  // __kmpc_end_serialized_parallel to call the hooks for the OMPT
2360  // end-implicit-task and end-parallel events.
2361  if (ompt_enabled.enabled &&
2362  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2363  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2364  }
2365 #endif
2366 
2367 #if KMP_DEBUG
2368  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2369  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2370  "th_task_team = %p\n",
2371  __kmp_gtid_from_thread(master_th), team,
2372  team->t.t_task_team[master_th->th.th_task_state],
2373  master_th->th.th_task_team));
2374  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2375  team->t.t_task_team[master_th->th.th_task_state]);
2376  }
2377 #endif
2378 
2379  if (team->t.t_serialized) {
2380  if (master_th->th.th_teams_microtask) {
2381  // We are in teams construct
2382  int level = team->t.t_level;
2383  int tlevel = master_th->th.th_teams_level;
2384  if (level == tlevel) {
2385  // AC: we haven't incremented it earlier at start of teams construct,
2386  // so do it here - at the end of teams construct
2387  team->t.t_level++;
2388  } else if (level == tlevel + 1) {
2389  // AC: we are exiting parallel inside teams, need to increment
2390  // serialization in order to restore it in the next call to
2391  // __kmpc_end_serialized_parallel
2392  team->t.t_serialized++;
2393  }
2394  }
2395  __kmpc_end_serialized_parallel(loc, gtid);
2396 
2397 #if OMPT_SUPPORT
2398  if (ompt_enabled.enabled) {
2399  __kmp_join_restore_state(master_th, parent_team);
2400  }
2401 #endif
2402 
2403  return;
2404  }
2405 
2406  master_active = team->t.t_master_active;
2407 
2408  if (!exit_teams) {
2409  // AC: No barrier for internal teams at exit from the teams construct,
2410  // but there is a barrier for the external team (league).
2411  __kmp_internal_join(loc, gtid, team);
2412 #if USE_ITT_BUILD
2413  if (__itt_stack_caller_create_ptr) {
2414  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2415  // destroy the stack stitching id after join barrier
2416  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2417  team->t.t_stack_id = NULL;
2418  }
2419 #endif
2420  } else {
2421  master_th->th.th_task_state =
2422  0; // AC: no tasking in teams (out of any parallel)
2423 #if USE_ITT_BUILD
2424  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2425  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2426  // destroy the stack stitching id on exit from the teams construct;
2427  // if parent_team is active, then the id will be destroyed later on
2428  // by the master of the league of teams
2429  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2430  parent_team->t.t_stack_id = NULL;
2431  }
2432 #endif
2433 
2434  if (team->t.t_nproc > 1 &&
2435  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2436  team->t.b->update_num_threads(team->t.t_nproc);
2437  __kmp_add_threads_to_team(team, team->t.t_nproc);
2438  }
2439  }
2440 
2441  KMP_MB();
2442 
2443 #if OMPT_SUPPORT
2444  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2445  void *codeptr = team->t.ompt_team_info.master_return_address;
2446 #endif
2447 
2448 #if USE_ITT_BUILD
2449  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2450  if (team->t.t_active_level == 1 &&
2451  (!master_th->th.th_teams_microtask || /* not in teams construct */
2452  master_th->th.th_teams_size.nteams == 1)) {
2453  master_th->th.th_ident = loc;
2454  // only one notification scheme (either "submit" or "forking/joined", not
2455  // both)
2456  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2457  __kmp_forkjoin_frames_mode == 3)
2458  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2459  master_th->th.th_frame_time, 0, loc,
2460  master_th->th.th_team_nproc, 1);
2461  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2462  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2463  __kmp_itt_region_joined(gtid);
2464  } // active_level == 1
2465 #endif /* USE_ITT_BUILD */
2466 
2467 #if KMP_AFFINITY_SUPPORTED
2468  if (!exit_teams) {
2469  // Restore master thread's partition.
2470  master_th->th.th_first_place = team->t.t_first_place;
2471  master_th->th.th_last_place = team->t.t_last_place;
2472  }
2473 #endif // KMP_AFFINITY_SUPPORTED
2474 
2475  if (master_th->th.th_teams_microtask && !exit_teams &&
2476  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2477  team->t.t_level == master_th->th.th_teams_level + 1) {
2478 // AC: We need to leave the team structure intact at the end of a parallel
2479 // inside the teams construct, so that the next parallel reuses the same (hot)
2480 // team; only the nesting levels are adjusted
2481 #if OMPT_SUPPORT
2482  ompt_data_t ompt_parallel_data = ompt_data_none;
2483  if (ompt_enabled.enabled) {
2484  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2485  if (ompt_enabled.ompt_callback_implicit_task) {
2486  int ompt_team_size = team->t.t_nproc;
2487  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2488  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2489  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2490  }
2491  task_info->frame.exit_frame = ompt_data_none;
2492  task_info->task_data = ompt_data_none;
2493  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2494  __ompt_lw_taskteam_unlink(master_th);
2495  }
2496 #endif
2497  /* Decrement our nested depth level */
2498  team->t.t_level--;
2499  team->t.t_active_level--;
2500  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2501 
2502  // Restore number of threads in the team if needed. This code relies on
2503  // the proper adjustment of th_teams_size.nth after the fork in
2504  // __kmp_teams_master on each teams primary thread in the case that
2505  // __kmp_reserve_threads reduced it.
2506  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2507  int old_num = master_th->th.th_team_nproc;
2508  int new_num = master_th->th.th_teams_size.nth;
2509  kmp_info_t **other_threads = team->t.t_threads;
2510  team->t.t_nproc = new_num;
2511  for (int i = 0; i < old_num; ++i) {
2512  other_threads[i]->th.th_team_nproc = new_num;
2513  }
2514  // Adjust states of non-used threads of the team
2515  for (int i = old_num; i < new_num; ++i) {
2516  // Re-initialize thread's barrier data.
2517  KMP_DEBUG_ASSERT(other_threads[i]);
2518  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2519  for (int b = 0; b < bs_last_barrier; ++b) {
2520  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2521  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2522 #if USE_DEBUGGER
2523  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2524 #endif
2525  }
2526  if (__kmp_tasking_mode != tskm_immediate_exec) {
2527  // Synchronize thread's task state
2528  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2529  }
2530  }
2531  }
2532 
2533 #if OMPT_SUPPORT
2534  if (ompt_enabled.enabled) {
2535  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2536  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2537  }
2538 #endif
2539 
2540  return;
2541  }
2542 
2543  /* do cleanup and restore the parent team */
2544  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2545  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2546 
2547  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2548 
2549  /* jc: The following lock has instructions with REL and ACQ semantics,
2550  separating the parallel user code called in this parallel region
2551  from the serial user code called after this function returns. */
2552  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2553 
2554  if (!master_th->th.th_teams_microtask ||
2555  team->t.t_level > master_th->th.th_teams_level) {
2556  /* Decrement our nested depth level */
2557  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2558  }
2559  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2560 
2561 #if OMPT_SUPPORT
2562  if (ompt_enabled.enabled) {
2563  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2564  if (ompt_enabled.ompt_callback_implicit_task) {
2565  int flags = (team_microtask == (void *)__kmp_teams_master)
2566  ? ompt_task_initial
2567  : ompt_task_implicit;
2568  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2569  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2570  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2571  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2572  }
2573  task_info->frame.exit_frame = ompt_data_none;
2574  task_info->task_data = ompt_data_none;
2575  }
2576 #endif
2577 
2578  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2579  master_th, team));
2580  __kmp_pop_current_task_from_thread(master_th);
2581 
2582  master_th->th.th_def_allocator = team->t.t_def_allocator;
2583 
2584 #if OMPD_SUPPORT
2585  if (ompd_state & OMPD_ENABLE_BP)
2586  ompd_bp_parallel_end();
2587 #endif
2588  updateHWFPControl(team);
2589 
2590  if (root->r.r_active != master_active)
2591  root->r.r_active = master_active;
2592 
2593  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2594  master_th)); // this will free worker threads
2595 
2596  /* This race was fun to find. Make sure the following stays in the critical
2597  region; otherwise assertions may fail occasionally since the old team may be
2598  reallocated and the hierarchy appears inconsistent. It is actually safe to
2599  run and won't cause any bugs, but it will trigger those assertion failures.
2600  It's only one deref & assign, so it might as well stay in the critical region */
2601  master_th->th.th_team = parent_team;
2602  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2603  master_th->th.th_team_master = parent_team->t.t_threads[0];
2604  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2605 
2606  /* restore serialized team, if need be */
2607  if (parent_team->t.t_serialized &&
2608  parent_team != master_th->th.th_serial_team &&
2609  parent_team != root->r.r_root_team) {
2610  __kmp_free_team(root,
2611  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2612  master_th->th.th_serial_team = parent_team;
2613  }
2614 
2615  if (__kmp_tasking_mode != tskm_immediate_exec) {
2616  if (master_th->th.th_task_state_top >
2617  0) { // Restore task state from memo stack
2618  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2619  // Remember primary thread's state if we re-use this nested hot team
2620  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2621  master_th->th.th_task_state;
2622  --master_th->th.th_task_state_top; // pop
2623  // Now restore state at this level
2624  master_th->th.th_task_state =
2625  master_th->th
2626  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2627  }
2628  // Copy the task team from the parent team to the primary thread
2629  master_th->th.th_task_team =
2630  parent_team->t.t_task_team[master_th->th.th_task_state];
2631  KA_TRACE(20,
2632  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2633  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2634  parent_team));
2635  }
2636 
2637  // TODO: GEH - cannot do this assertion because root thread not set up as
2638  // executing
2639  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2640  master_th->th.th_current_task->td_flags.executing = 1;
2641 
2642  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2643 
2644 #if OMPT_SUPPORT
2645  int flags =
2646  OMPT_INVOKER(fork_context) |
2647  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2648  : ompt_parallel_team);
2649  if (ompt_enabled.enabled) {
2650  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2651  codeptr);
2652  }
2653 #endif
2654 
2655  KMP_MB();
2656  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2657 }
2658 
2659 /* Check whether we should push an internal control record onto the
2660  serial team stack. If so, do it. */
2661 void __kmp_save_internal_controls(kmp_info_t *thread) {
2662 
2663  if (thread->th.th_team != thread->th.th_serial_team) {
2664  return;
2665  }
2666  if (thread->th.th_team->t.t_serialized > 1) {
2667  int push = 0;
2668 
2669  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2670  push = 1;
2671  } else {
2672  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2673  thread->th.th_team->t.t_serialized) {
2674  push = 1;
2675  }
2676  }
2677  if (push) { /* push a record on the serial team's stack */
2678  kmp_internal_control_t *control =
2679  (kmp_internal_control_t *)__kmp_allocate(
2680  sizeof(kmp_internal_control_t));
2681 
2682  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2683 
2684  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2685 
2686  control->next = thread->th.th_team->t.t_control_stack_top;
2687  thread->th.th_team->t.t_control_stack_top = control;
2688  }
2689  }
2690 }
2691 
2692 /* Changes set_nproc */
2693 void __kmp_set_num_threads(int new_nth, int gtid) {
2694  kmp_info_t *thread;
2695  kmp_root_t *root;
2696 
2697  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2698  KMP_DEBUG_ASSERT(__kmp_init_serial);
2699 
2700  if (new_nth < 1)
2701  new_nth = 1;
2702  else if (new_nth > __kmp_max_nth)
2703  new_nth = __kmp_max_nth;
2704 
2705  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2706  thread = __kmp_threads[gtid];
2707  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2708  return; // nothing to do
2709 
2710  __kmp_save_internal_controls(thread);
2711 
2712  set__nproc(thread, new_nth);
2713 
2714  // If this omp_set_num_threads() call will cause the hot team size to be
2715  // reduced (in the absence of a num_threads clause), then reduce it now,
2716  // rather than waiting for the next parallel region.
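2716  // For example (illustrative numbers): if the hot team currently holds 8
2716  // threads and the application calls omp_set_num_threads(2) while the root
2716  // is inactive, the 6 surplus workers are freed below rather than lingering
2716  // until the next parallel region.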
2717  root = thread->th.th_root;
2718  if (__kmp_init_parallel && (!root->r.r_active) &&
2719  (root->r.r_hot_team->t.t_nproc > new_nth)
2720 #if KMP_NESTED_HOT_TEAMS
2721  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2722 #endif
2723  ) {
2724  kmp_team_t *hot_team = root->r.r_hot_team;
2725  int f;
2726 
2727  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2728 
2729  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2730  __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2731  }
2732  // Release the extra threads we don't need any more.
2733  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2734  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2735  if (__kmp_tasking_mode != tskm_immediate_exec) {
2736  // When decreasing the team size, threads no longer in the team should
2737  // drop their reference to the task team.
2738  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2739  }
2740  __kmp_free_thread(hot_team->t.t_threads[f]);
2741  hot_team->t.t_threads[f] = NULL;
2742  }
2743  hot_team->t.t_nproc = new_nth;
2744 #if KMP_NESTED_HOT_TEAMS
2745  if (thread->th.th_hot_teams) {
2746  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2747  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2748  }
2749 #endif
2750 
2751  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2752  hot_team->t.b->update_num_threads(new_nth);
2753  __kmp_add_threads_to_team(hot_team, new_nth);
2754  }
2755 
2756  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2757 
2758  // Update the th_team_nproc field in the threads that are still active.
2759  for (f = 0; f < new_nth; f++) {
2760  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2761  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2762  }
2763  // Special flag to record that omp_set_num_threads() changed the team size
2764  hot_team->t.t_size_changed = -1;
2765  }
2766 }
2767 
2768 /* Changes max_active_levels */
2769 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2770  kmp_info_t *thread;
2771 
2772  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2773  "%d = (%d)\n",
2774  gtid, max_active_levels));
2775  KMP_DEBUG_ASSERT(__kmp_init_serial);
2776 
2777  // validate max_active_levels
2778  if (max_active_levels < 0) {
2779  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2780  // We ignore this call if the user has specified a negative value.
2781  // The current setting won't be changed. The last valid setting will be
2782  // used. A warning will be issued (if warnings are allowed as controlled by
2783  // the KMP_WARNINGS env var).
2784  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2785  "max_active_levels for thread %d = (%d)\n",
2786  gtid, max_active_levels));
2787  return;
2788  }
2789  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2790  // it's OK, the max_active_levels is within the valid range: [ 0;
2791  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2792  // We allow a zero value. (implementation defined behavior)
2793  } else {
2794  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2795  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2796  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2797  // Current upper limit is MAX_INT. (implementation defined behavior)
2798  // If the input exceeds the upper limit, we correct the input to be the
2799  // upper limit. (implementation defined behavior)
2800  // In practice, control should never reach this point while the limit is MAX_INT.
2801  }
2802  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2803  "max_active_levels for thread %d = (%d)\n",
2804  gtid, max_active_levels));
2805 
2806  thread = __kmp_threads[gtid];
2807 
2808  __kmp_save_internal_controls(thread);
2809 
2810  set__max_active_levels(thread, max_active_levels);
2811 }
2812 
2813 /* Gets max_active_levels */
2814 int __kmp_get_max_active_levels(int gtid) {
2815  kmp_info_t *thread;
2816 
2817  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2818  KMP_DEBUG_ASSERT(__kmp_init_serial);
2819 
2820  thread = __kmp_threads[gtid];
2821  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2822  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2823  "curtask_maxaclevel=%d\n",
2824  gtid, thread->th.th_current_task,
2825  thread->th.th_current_task->td_icvs.max_active_levels));
2826  return thread->th.th_current_task->td_icvs.max_active_levels;
2827 }
2828 
2829 // nteams-var per-device ICV
2830 void __kmp_set_num_teams(int num_teams) {
2831  if (num_teams > 0)
2832  __kmp_nteams = num_teams;
2833 }
2834 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2835 // teams-thread-limit-var per-device ICV
2836 void __kmp_set_teams_thread_limit(int limit) {
2837  if (limit > 0)
2838  __kmp_teams_thread_limit = limit;
2839 }
2840 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2841 
2842 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2843 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2844 
2845 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2846 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2847  kmp_info_t *thread;
2848  kmp_sched_t orig_kind;
2849  // kmp_team_t *team;
2850 
2851  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2852  gtid, (int)kind, chunk));
2853  KMP_DEBUG_ASSERT(__kmp_init_serial);
2854 
2855  // Check whether the kind parameter is valid, and correct it if needed.
2856  // Valid parameters should fit in one of two intervals - standard or extended:
2857  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2858  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2859  orig_kind = kind;
2860  kind = __kmp_sched_without_mods(kind);
2861 
2862  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2863  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2864  // TODO: Hint needs attention in case we change the default schedule.
2865  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2866  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2867  __kmp_msg_null);
2868  kind = kmp_sched_default;
2869  chunk = 0; // ignore chunk value in case of bad kind
2870  }
2871 
2872  thread = __kmp_threads[gtid];
2873 
2874  __kmp_save_internal_controls(thread);
2875 
2876  if (kind < kmp_sched_upper_std) {
2877  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2878  // differentiate static chunked vs. unchunked: the chunk should be invalid
2879  // to indicate an unchunked schedule (which is the default)
2880  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2881  } else {
2882  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2883  __kmp_sch_map[kind - kmp_sched_lower - 1];
2884  }
2885  } else {
2886  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2887  // kmp_sched_lower - 2 ];
2888  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2889  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2890  kmp_sched_lower - 2];
2891  }
2892  __kmp_sched_apply_mods_intkind(
2893  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2894  if (kind == kmp_sched_auto || chunk < 1) {
2895  // ignore parameter chunk for schedule auto
2896  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2897  } else {
2898  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2899  }
2900 }
2901 
2902 /* Gets def_sched_var ICV values */
2903 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2904  kmp_info_t *thread;
2905  enum sched_type th_type;
2906 
2907  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2908  KMP_DEBUG_ASSERT(__kmp_init_serial);
2909 
2910  thread = __kmp_threads[gtid];
2911 
2912  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2913  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2914  case kmp_sch_static:
2915  case kmp_sch_static_greedy:
2916  case kmp_sch_static_balanced:
2917  *kind = kmp_sched_static;
2918  __kmp_sched_apply_mods_stdkind(kind, th_type);
2919  *chunk = 0; // chunk was not set; indicate this fact with a zero value
2920  return;
2921  case kmp_sch_static_chunked:
2922  *kind = kmp_sched_static;
2923  break;
2924  case kmp_sch_dynamic_chunked:
2925  *kind = kmp_sched_dynamic;
2926  break;
2927  case kmp_sch_guided_chunked:
2928  case kmp_sch_guided_iterative_chunked:
2929  case kmp_sch_guided_analytical_chunked:
2930  *kind = kmp_sched_guided;
2931  break;
2932  case kmp_sch_auto:
2933  *kind = kmp_sched_auto;
2934  break;
2935  case kmp_sch_trapezoidal:
2936  *kind = kmp_sched_trapezoidal;
2937  break;
2938 #if KMP_STATIC_STEAL_ENABLED
2939  case kmp_sch_static_steal:
2940  *kind = kmp_sched_static_steal;
2941  break;
2942 #endif
2943  default:
2944  KMP_FATAL(UnknownSchedulingType, th_type);
2945  }
2946 
2947  __kmp_sched_apply_mods_stdkind(kind, th_type);
2948  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2949 }
2950 
2951 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2952 
2953  int ii, dd;
2954  kmp_team_t *team;
2955  kmp_info_t *thr;
2956 
2957  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2958  KMP_DEBUG_ASSERT(__kmp_init_serial);
2959 
2960  // validate level
2961  if (level == 0)
2962  return 0;
2963  if (level < 0)
2964  return -1;
2965  thr = __kmp_threads[gtid];
2966  team = thr->th.th_team;
2967  ii = team->t.t_level;
2968  if (level > ii)
2969  return -1;
2970 
2971  if (thr->th.th_teams_microtask) {
2972  // AC: we are in teams region where multiple nested teams have same level
2973  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2974  if (level <=
2975  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2976  KMP_DEBUG_ASSERT(ii >= tlevel);
2977  // AC: As we need to pass by the teams league, we need to artificially
2978  // increase ii
2979  if (ii == tlevel) {
2980  ii += 2; // three teams have same level
2981  } else {
2982  ii++; // two teams have same level
2983  }
2984  }
2985  }
2986 
2987  if (ii == level)
2988  return __kmp_tid_from_gtid(gtid);
2989 
2990  dd = team->t.t_serialized;
2991  level++;
2992  while (ii > level) {
2993  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2994  }
2995  if ((team->t.t_serialized) && (!dd)) {
2996  team = team->t.t_parent;
2997  continue;
2998  }
2999  if (ii > level) {
3000  team = team->t.t_parent;
3001  dd = team->t.t_serialized;
3002  ii--;
3003  }
3004  }
3005 
3006  return (dd > 1) ? (0) : (team->t.t_master_tid);
3007 }
3008 
3009 int __kmp_get_team_size(int gtid, int level) {
3010 
3011  int ii, dd;
3012  kmp_team_t *team;
3013  kmp_info_t *thr;
3014 
3015  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3016  KMP_DEBUG_ASSERT(__kmp_init_serial);
3017 
3018  // validate level
3019  if (level == 0)
3020  return 1;
3021  if (level < 0)
3022  return -1;
3023  thr = __kmp_threads[gtid];
3024  team = thr->th.th_team;
3025  ii = team->t.t_level;
3026  if (level > ii)
3027  return -1;
3028 
3029  if (thr->th.th_teams_microtask) {
3030  // AC: we are in teams region where multiple nested teams have same level
3031  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3032  if (level <=
3033  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3034  KMP_DEBUG_ASSERT(ii >= tlevel);
3035  // AC: Since we need to pass through the teams league, we artificially
3036  // increase ii
3037  if (ii == tlevel) {
3038  ii += 2; // three teams have same level
3039  } else {
3040  ii++; // two teams have same level
3041  }
3042  }
3043  }
3044 
3045  while (ii > level) {
3046  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3047  }
3048  if (team->t.t_serialized && (!dd)) {
3049  team = team->t.t_parent;
3050  continue;
3051  }
3052  if (ii > level) {
3053  team = team->t.t_parent;
3054  ii--;
3055  }
3056  }
3057 
3058  return team->t.t_nproc;
3059 }
3060 
3061 kmp_r_sched_t __kmp_get_schedule_global() {
3062  // This routine exists because the pairs (__kmp_sched, __kmp_chunk) and
3063  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3064  // independently, so the updated schedule can be obtained here.
3065 
3066  kmp_r_sched_t r_sched;
3067 
3068  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3069  // __kmp_guided. __kmp_sched should keep original value, so that user can set
3070  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3071  // different roots (even in OMP 2.5)
3072  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3073  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3074  if (s == kmp_sch_static) {
3075  // replace STATIC with more detailed schedule (balanced or greedy)
3076  r_sched.r_sched_type = __kmp_static;
3077  } else if (s == kmp_sch_guided_chunked) {
3078  // replace GUIDED with more detailed schedule (iterative or analytical)
3079  r_sched.r_sched_type = __kmp_guided;
3080  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3081  r_sched.r_sched_type = __kmp_sched;
3082  }
3083  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3084 
3085  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3086  // __kmp_chunk may be wrong here (if it was not ever set)
3087  r_sched.chunk = KMP_DEFAULT_CHUNK;
3088  } else {
3089  r_sched.chunk = __kmp_chunk;
3090  }
3091 
3092  return r_sched;
3093 }
3094 
3095 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE) at least
3096  argc *t_argv entries for the requested team. */
3097 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3098 
3099  KMP_DEBUG_ASSERT(team);
3100  if (!realloc || argc > team->t.t_max_argc) {
3101 
3102  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3103  "current entries=%d\n",
3104  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3105  /* if heap space was previously allocated for args, free it */
3106  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3107  __kmp_free((void *)team->t.t_argv);
3108 
3109  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3110  /* use unused space in the cache line for arguments */
3111  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3112  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3113  "argv entries\n",
3114  team->t.t_id, team->t.t_max_argc));
3115  team->t.t_argv = &team->t.t_inline_argv[0];
3116  if (__kmp_storage_map) {
3117  __kmp_print_storage_map_gtid(
3118  -1, &team->t.t_inline_argv[0],
3119  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3120  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3121  team->t.t_id);
3122  }
3123  } else {
3124  /* allocate space for arguments in the heap */
3125  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3126  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3127  : 2 * argc;
3128  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3129  "argv entries\n",
3130  team->t.t_id, team->t.t_max_argc));
3131  team->t.t_argv =
3132  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3133  if (__kmp_storage_map) {
3134  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3135  &team->t.t_argv[team->t.t_max_argc],
3136  sizeof(void *) * team->t.t_max_argc,
3137  "team_%d.t_argv", team->t.t_id);
3138  }
3139  }
3140  }
3141 }
3142 
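// Allocate the per-team arrays (thread pointers, shared dispatch buffers,
// per-thread dispatch structures, implicit task data) sized for max_nth
// threads, and initialize the dispatch buffer indices.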
3143 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3144  int i;
3145  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3146  team->t.t_threads =
3147  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3148  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3149  sizeof(dispatch_shared_info_t) * num_disp_buff);
3150  team->t.t_dispatch =
3151  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3152  team->t.t_implicit_task_taskdata =
3153  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3154  team->t.t_max_nproc = max_nth;
3155 
3156  /* setup dispatch buffers */
3157  for (i = 0; i < num_disp_buff; ++i) {
3158  team->t.t_disp_buffer[i].buffer_index = i;
3159  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3160  }
3161 }
3162 
3163 static void __kmp_free_team_arrays(kmp_team_t *team) {
3164  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3165  int i;
3166  for (i = 0; i < team->t.t_max_nproc; ++i) {
3167  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3168  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3169  team->t.t_dispatch[i].th_disp_buffer = NULL;
3170  }
3171  }
3172 #if KMP_USE_HIER_SCHED
3173  __kmp_dispatch_free_hierarchies(team);
3174 #endif
3175  __kmp_free(team->t.t_threads);
3176  __kmp_free(team->t.t_disp_buffer);
3177  __kmp_free(team->t.t_dispatch);
3178  __kmp_free(team->t.t_implicit_task_taskdata);
3179  team->t.t_threads = NULL;
3180  team->t.t_disp_buffer = NULL;
3181  team->t.t_dispatch = NULL;
3182  team->t.t_implicit_task_taskdata = 0;
3183 }
3184 
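// Grow the team arrays to hold max_nth threads. Only the existing t_threads
// pointers are preserved; the other arrays are reallocated from scratch.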
3185 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3186  kmp_info_t **oldThreads = team->t.t_threads;
3187 
3188  __kmp_free(team->t.t_disp_buffer);
3189  __kmp_free(team->t.t_dispatch);
3190  __kmp_free(team->t.t_implicit_task_taskdata);
3191  __kmp_allocate_team_arrays(team, max_nth);
3192 
3193  KMP_MEMCPY(team->t.t_threads, oldThreads,
3194  team->t.t_nproc * sizeof(kmp_info_t *));
3195 
3196  __kmp_free(oldThreads);
3197 }
3198 
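// Assemble the default set of internal control variables (ICVs) for a new
// root from the current global settings.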
3199 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3200 
3201  kmp_r_sched_t r_sched =
3202  __kmp_get_schedule_global(); // get current state of scheduling globals
3203 
3204  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3205 
3206  kmp_internal_control_t g_icvs = {
3207  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3208  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3209  // adjustment of threads (per thread)
3210  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3211  // whether blocktime is explicitly set
3212  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3213 #if KMP_USE_MONITOR
3214  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3215 // intervals
3216 #endif
3217  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3218  // next parallel region (per thread)
3219  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3220  __kmp_cg_max_nth, // int thread_limit;
3221  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3222  // for max_active_levels
3223  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3224  // {sched,chunk} pair
3225  __kmp_nested_proc_bind.bind_types[0],
3226  __kmp_default_device,
3227  NULL // struct kmp_internal_control *next;
3228  };
3229 
3230  return g_icvs;
3231 }
3232 
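// Build an ICV set for a thread joining the given team by copying the ICVs of
// the team's primary thread (thread 0).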
3233 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3234 
3235  kmp_internal_control_t gx_icvs;
3236  gx_icvs.serial_nesting_level =
3237  0; // probably =team->t.t_serial like in save_inter_controls
3238  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3239  gx_icvs.next = NULL;
3240 
3241  return gx_icvs;
3242 }
3243 
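// First-time setup of a root: initialize its locks and flags, then allocate
// its root team (size 1) and hot team, seeding both with the current global
// ICVs and schedule.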
3244 static void __kmp_initialize_root(kmp_root_t *root) {
3245  int f;
3246  kmp_team_t *root_team;
3247  kmp_team_t *hot_team;
3248  int hot_team_max_nth;
3249  kmp_r_sched_t r_sched =
3250  __kmp_get_schedule_global(); // get current state of scheduling globals
3251  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3252  KMP_DEBUG_ASSERT(root);
3253  KMP_ASSERT(!root->r.r_begin);
3254 
3255  /* setup the root state structure */
3256  __kmp_init_lock(&root->r.r_begin_lock);
3257  root->r.r_begin = FALSE;
3258  root->r.r_active = FALSE;
3259  root->r.r_in_parallel = 0;
3260  root->r.r_blocktime = __kmp_dflt_blocktime;
3261 #if KMP_AFFINITY_SUPPORTED
3262  root->r.r_affinity_assigned = FALSE;
3263 #endif
3264 
3265  /* setup the root team for this task */
3266  /* allocate the root team structure */
3267  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3268 
3269  root_team =
3270  __kmp_allocate_team(root,
3271  1, // new_nproc
3272  1, // max_nproc
3273 #if OMPT_SUPPORT
3274  ompt_data_none, // root parallel id
3275 #endif
3276  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3277  0 // argc
3278  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3279  );
3280 #if USE_DEBUGGER
3281  // Non-NULL value should be assigned to make the debugger display the root
3282  // team.
3283  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3284 #endif
3285 
3286  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3287 
3288  root->r.r_root_team = root_team;
3289  root_team->t.t_control_stack_top = NULL;
3290 
3291  /* initialize root team */
3292  root_team->t.t_threads[0] = NULL;
3293  root_team->t.t_nproc = 1;
3294  root_team->t.t_serialized = 1;
3295  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3296  root_team->t.t_sched.sched = r_sched.sched;
3297  KA_TRACE(
3298  20,
3299  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3300  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3301 
3302  /* setup the hot team for this task */
3303  /* allocate the hot team structure */
3304  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3305 
3306  hot_team =
3307  __kmp_allocate_team(root,
3308  1, // new_nproc
3309  __kmp_dflt_team_nth_ub * 2, // max_nproc
3310 #if OMPT_SUPPORT
3311  ompt_data_none, // root parallel id
3312 #endif
3313  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3314  0 // argc
3315  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3316  );
3317  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3318 
3319  root->r.r_hot_team = hot_team;
3320  root_team->t.t_control_stack_top = NULL;
3321 
3322  /* first-time initialization */
3323  hot_team->t.t_parent = root_team;
3324 
3325  /* initialize hot team */
3326  hot_team_max_nth = hot_team->t.t_max_nproc;
3327  for (f = 0; f < hot_team_max_nth; ++f) {
3328  hot_team->t.t_threads[f] = NULL;
3329  }
3330  hot_team->t.t_nproc = 1;
3331  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3332  hot_team->t.t_sched.sched = r_sched.sched;
3333  hot_team->t.t_size_changed = 0;
3334 }
3335 
3336 #ifdef KMP_DEBUG
3337 
3338 typedef struct kmp_team_list_item {
3339  kmp_team_p const *entry;
3340  struct kmp_team_list_item *next;
3341 } kmp_team_list_item_t;
3342 typedef kmp_team_list_item_t *kmp_team_list_t;
3343 
3344 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3345  kmp_team_list_t list, // List of teams.
3346  kmp_team_p const *team // Team to add.
3347 ) {
3348 
3349  // List must terminate with item where both entry and next are NULL.
3350  // Team is added to the list only once.
3351  // List is sorted in ascending order by team id.
3352  // Team id is *not* a key.
3353 
3354  kmp_team_list_t l;
3355 
3356  KMP_DEBUG_ASSERT(list != NULL);
3357  if (team == NULL) {
3358  return;
3359  }
3360 
3361  __kmp_print_structure_team_accum(list, team->t.t_parent);
3362  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3363 
3364  // Search list for the team.
3365  l = list;
3366  while (l->next != NULL && l->entry != team) {
3367  l = l->next;
3368  }
3369  if (l->next != NULL) {
3370  return; // Team has been added before, exit.
3371  }
3372 
3373  // Team is not found. Search list again for insertion point.
3374  l = list;
3375  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3376  l = l->next;
3377  }
3378 
3379  // Insert team.
3380  {
3381  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3382  sizeof(kmp_team_list_item_t));
3383  *item = *l;
3384  l->entry = team;
3385  l->next = item;
3386  }
3387 }
3388 
3389 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3390 
3391 ) {
3392  __kmp_printf("%s", title);
3393  if (team != NULL) {
3394  __kmp_printf("%2x %p\n", team->t.t_id, team);
3395  } else {
3396  __kmp_printf(" - (nil)\n");
3397  }
3398 }
3399 
3400 static void __kmp_print_structure_thread(char const *title,
3401  kmp_info_p const *thread) {
3402  __kmp_printf("%s", title);
3403  if (thread != NULL) {
3404  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3405  } else {
3406  __kmp_printf(" - (nil)\n");
3407  }
3408 }
3409 
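// Debug dump of the runtime structure: the global thread table, every live
// thread and uber root, the accumulated list of teams, and the thread/team
// pools.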
3410 void __kmp_print_structure(void) {
3411 
3412  kmp_team_list_t list;
3413 
3414  // Initialize list of teams.
3415  list =
3416  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3417  list->entry = NULL;
3418  list->next = NULL;
3419 
3420  __kmp_printf("\n------------------------------\nGlobal Thread "
3421  "Table\n------------------------------\n");
3422  {
3423  int gtid;
3424  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3425  __kmp_printf("%2d", gtid);
3426  if (__kmp_threads != NULL) {
3427  __kmp_printf(" %p", __kmp_threads[gtid]);
3428  }
3429  if (__kmp_root != NULL) {
3430  __kmp_printf(" %p", __kmp_root[gtid]);
3431  }
3432  __kmp_printf("\n");
3433  }
3434  }
3435 
3436  // Print out __kmp_threads array.
3437  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3438  "----------\n");
3439  if (__kmp_threads != NULL) {
3440  int gtid;
3441  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3442  kmp_info_t const *thread = __kmp_threads[gtid];
3443  if (thread != NULL) {
3444  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3445  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3446  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3447  __kmp_print_structure_team(" Serial Team: ",
3448  thread->th.th_serial_team);
3449  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3450  __kmp_print_structure_thread(" Primary: ",
3451  thread->th.th_team_master);
3452  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3453  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3454  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3455  __kmp_print_structure_thread(" Next in pool: ",
3456  thread->th.th_next_pool);
3457  __kmp_printf("\n");
3458  __kmp_print_structure_team_accum(list, thread->th.th_team);
3459  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3460  }
3461  }
3462  } else {
3463  __kmp_printf("Threads array is not allocated.\n");
3464  }
3465 
3466  // Print out __kmp_root array.
3467  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3468  "--------\n");
3469  if (__kmp_root != NULL) {
3470  int gtid;
3471  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3472  kmp_root_t const *root = __kmp_root[gtid];
3473  if (root != NULL) {
3474  __kmp_printf("GTID %2d %p:\n", gtid, root);
3475  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3476  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3477  __kmp_print_structure_thread(" Uber Thread: ",
3478  root->r.r_uber_thread);
3479  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3480  __kmp_printf(" In Parallel: %2d\n",
3481  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3482  __kmp_printf("\n");
3483  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3484  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3485  }
3486  }
3487  } else {
3488  __kmp_printf("Ubers array is not allocated.\n");
3489  }
3490 
3491  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3492  "--------\n");
3493  while (list->next != NULL) {
3494  kmp_team_p const *team = list->entry;
3495  int i;
3496  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3497  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3498  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3499  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3500  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3501  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3502  for (i = 0; i < team->t.t_nproc; ++i) {
3503  __kmp_printf(" Thread %2d: ", i);
3504  __kmp_print_structure_thread("", team->t.t_threads[i]);
3505  }
3506  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3507  __kmp_printf("\n");
3508  list = list->next;
3509  }
3510 
3511  // Print out __kmp_thread_pool and __kmp_team_pool.
3512  __kmp_printf("\n------------------------------\nPools\n----------------------"
3513  "--------\n");
3514  __kmp_print_structure_thread("Thread pool: ",
3515  CCAST(kmp_info_t *, __kmp_thread_pool));
3516  __kmp_print_structure_team("Team pool: ",
3517  CCAST(kmp_team_t *, __kmp_team_pool));
3518  __kmp_printf("\n");
3519 
3520  // Free team list.
3521  while (list != NULL) {
3522  kmp_team_list_item_t *item = list;
3523  list = list->next;
3524  KMP_INTERNAL_FREE(item);
3525  }
3526 }
3527 
3528 #endif
3529 
3530 //---------------------------------------------------------------------------
3531 // Stuff for per-thread fast random number generator
3532 // Table of primes
3533 static const unsigned __kmp_primes[] = {
3534  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3535  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3536  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3537  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3538  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3539  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3540  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3541  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3542  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3543  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3544  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3545 
3546 //---------------------------------------------------------------------------
3547 // __kmp_get_random: Get a random number using a linear congruential method.
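// State update: x_{n+1} = a * x_n + 1 (mod 2^32), with the per-thread
// multiplier `a` taken from __kmp_primes; the high 16 bits of the current
// state are returned before the state is advanced.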
3548 unsigned short __kmp_get_random(kmp_info_t *thread) {
3549  unsigned x = thread->th.th_x;
3550  unsigned short r = (unsigned short)(x >> 16);
3551 
3552  thread->th.th_x = x * thread->th.th_a + 1;
3553 
3554  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3555  thread->th.th_info.ds.ds_tid, r));
3556 
3557  return r;
3558 }
3559 //--------------------------------------------------------
3560 // __kmp_init_random: Initialize a random number generator
3561 void __kmp_init_random(kmp_info_t *thread) {
3562  unsigned seed = thread->th.th_info.ds.ds_tid;
3563 
3564  thread->th.th_a =
3565  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3566  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3567  KA_TRACE(30,
3568  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3569 }
3570 
3571 #if KMP_OS_WINDOWS
3572 /* reclaim array entries for root threads that are already dead, returns number
3573  * reclaimed */
3574 static int __kmp_reclaim_dead_roots(void) {
3575  int i, r = 0;
3576 
3577  for (i = 0; i < __kmp_threads_capacity; ++i) {
3578  if (KMP_UBER_GTID(i) &&
3579  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3580  !__kmp_root[i]
3581  ->r.r_active) { // AC: reclaim only roots died in non-active state
3582  r += __kmp_unregister_root_other_thread(i);
3583  }
3584  }
3585  return r;
3586 }
3587 #endif
3588 
3589 /* This function attempts to create free entries in __kmp_threads and
3590  __kmp_root, and returns the number of free entries generated.
3591 
3592  For Windows* OS static library, the first mechanism used is to reclaim array
3593  entries for root threads that are already dead.
3594 
3595  On all platforms, expansion is attempted on the arrays __kmp_threads and
3596  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3597  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3598  threadprivate cache array has been created. Synchronization with
3599  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3600 
3601  After any dead root reclamation, if the clipping value allows array expansion
3602  to result in the generation of a total of nNeed free slots, the function does
3603  that expansion. If not, nothing is done beyond the possible initial root
3604  thread reclamation.
3605 
3606  If any argument is negative, the behavior is undefined. */
3607 static int __kmp_expand_threads(int nNeed) {
3608  int added = 0;
3609  int minimumRequiredCapacity;
3610  int newCapacity;
3611  kmp_info_t **newThreads;
3612  kmp_root_t **newRoot;
3613 
3614  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3615  // resizing __kmp_threads does not need additional protection if foreign
3616  // threads are present
3617 
3618 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3619  /* only for Windows static library */
3620  /* reclaim array entries for root threads that are already dead */
3621  added = __kmp_reclaim_dead_roots();
3622 
3623  if (nNeed) {
3624  nNeed -= added;
3625  if (nNeed < 0)
3626  nNeed = 0;
3627  }
3628 #endif
3629  if (nNeed <= 0)
3630  return added;
3631 
3632  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3633  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3634  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3635  // > __kmp_max_nth in one of two ways:
3636  //
3637  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3638  // may not be reused by another thread, so we may need to increase
3639  // __kmp_threads_capacity to __kmp_max_nth + 1.
3640  //
3641  // 2) New foreign root(s) are encountered. We always register new foreign
3642  // roots. This may cause a smaller # of threads to be allocated at
3643  // subsequent parallel regions, but the worker threads hang around (and
3644  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3645  //
3646  // Anyway, that is the reason for moving the check to see if
3647  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3648  // instead of having it performed here. -BB
3649 
3650  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3651 
3652  /* compute expansion headroom to check if we can expand */
3653  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3654  /* possible expansion too small -- give up */
3655  return added;
3656  }
3657  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3658 
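  // Grow the capacity by repeated doubling, clipped to __kmp_sys_max_nth,
  // until it covers the required minimum.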
3659  newCapacity = __kmp_threads_capacity;
3660  do {
3661  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3662  : __kmp_sys_max_nth;
3663  } while (newCapacity < minimumRequiredCapacity);
3664  newThreads = (kmp_info_t **)__kmp_allocate(
3665  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3666  newRoot =
3667  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3668  KMP_MEMCPY(newThreads, __kmp_threads,
3669  __kmp_threads_capacity * sizeof(kmp_info_t *));
3670  KMP_MEMCPY(newRoot, __kmp_root,
3671  __kmp_threads_capacity * sizeof(kmp_root_t *));
3672  // Put the old __kmp_threads array on a list. Any ongoing references to the
3673  // old array remain valid. The list is cleaned up at library shutdown.
3674  kmp_old_threads_list_t *node =
3675  (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3676  node->threads = __kmp_threads;
3677  node->next = __kmp_old_threads_list;
3678  __kmp_old_threads_list = node;
3679 
3680  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3681  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3682  added += newCapacity - __kmp_threads_capacity;
3683  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3684 
3685  if (newCapacity > __kmp_tp_capacity) {
3686  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3687  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3688  __kmp_threadprivate_resize_cache(newCapacity);
3689  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3690  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3691  }
3692  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3693  }
3694 
3695  return added;
3696 }
3697 
3698 /* Register the current thread as a root thread and obtain our gtid. We must
3699  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3700  thread that calls from __kmp_do_serial_initialize() */
3701 int __kmp_register_root(int initial_thread) {
3702  kmp_info_t *root_thread;
3703  kmp_root_t *root;
3704  int gtid;
3705  int capacity;
3706  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3707  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3708  KMP_MB();
3709 
3710  /* 2007-03-02:
3711  If the initial thread did not invoke the OpenMP RTL yet, and this thread is
3712  not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3713  does not work as expected -- it may return false (meaning there is at least
3714  one empty slot in the __kmp_threads array), but it is possible the only free
3715  slot is #0, which is reserved for the initial thread and so cannot be used
3716  for this one. The following code works around this bug.
3717 
3718  However, the right solution seems to be not reserving slot #0 for the
3719  initial thread because:
3720  (1) there is no magic in slot #0,
3721  (2) we cannot detect the initial thread reliably (the first thread which
3722  does serial initialization may not be a real initial thread).
3723  */
3724  capacity = __kmp_threads_capacity;
3725  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3726  --capacity;
3727  }
3728 
3729  // If it is not for initializing the hidden helper team, we need to take
3730  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3731  // in __kmp_threads_capacity.
3732  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3733  capacity -= __kmp_hidden_helper_threads_num;
3734  }
3735 
3736  /* see if there are too many threads */
3737  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3738  if (__kmp_tp_cached) {
3739  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3740  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3741  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3742  } else {
3743  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3744  __kmp_msg_null);
3745  }
3746  }
3747 
3748  // When hidden helper tasks are enabled, __kmp_threads is organized as follows:
3749  // 0: initial thread, also a regular OpenMP thread.
3750  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3751  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3752  // regular OpenMP threads.
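  // For example, with __kmp_hidden_helper_threads_num == 8, gtid 0 is the
  // initial thread, gtids 1..8 are hidden helper threads, and regular roots
  // and workers occupy gtids 9 and above.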
3753  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3754  // Find an available thread slot for hidden helper thread. Slots for hidden
3755  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3756  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3757  gtid <= __kmp_hidden_helper_threads_num;
3758  gtid++)
3759  ;
3760  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3761  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3762  "hidden helper thread: T#%d\n",
3763  gtid));
3764  } else {
3765  /* find an available thread slot */
3766  // Don't reassign the zero slot since we need that to only be used by
3767  // initial thread. Slots for hidden helper threads should also be skipped.
3768  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3769  gtid = 0;
3770  } else {
3771  for (gtid = __kmp_hidden_helper_threads_num + 1;
3772  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3773  ;
3774  }
3775  KA_TRACE(
3776  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3777  KMP_ASSERT(gtid < __kmp_threads_capacity);
3778  }
3779 
3780  /* update global accounting */
3781  __kmp_all_nth++;
3782  TCW_4(__kmp_nth, __kmp_nth + 1);
3783 
3784  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3785  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3786  if (__kmp_adjust_gtid_mode) {
3787  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3788  if (TCR_4(__kmp_gtid_mode) != 2) {
3789  TCW_4(__kmp_gtid_mode, 2);
3790  }
3791  } else {
3792  if (TCR_4(__kmp_gtid_mode) != 1) {
3793  TCW_4(__kmp_gtid_mode, 1);
3794  }
3795  }
3796  }
3797 
3798 #ifdef KMP_ADJUST_BLOCKTIME
3799  /* Adjust blocktime to zero if necessary */
3800  /* Middle initialization might not have occurred yet */
3801  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3802  if (__kmp_nth > __kmp_avail_proc) {
3803  __kmp_zero_bt = TRUE;
3804  }
3805  }
3806 #endif /* KMP_ADJUST_BLOCKTIME */
3807 
3808  /* setup this new hierarchy */
3809  if (!(root = __kmp_root[gtid])) {
3810  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3811  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3812  }
3813 
3814 #if KMP_STATS_ENABLED
3815  // Initialize stats as soon as possible (right after gtid assignment).
3816  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3817  __kmp_stats_thread_ptr->startLife();
3818  KMP_SET_THREAD_STATE(SERIAL_REGION);
3819  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3820 #endif
3821  __kmp_initialize_root(root);
3822 
3823  /* setup new root thread structure */
3824  if (root->r.r_uber_thread) {
3825  root_thread = root->r.r_uber_thread;
3826  } else {
3827  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3828  if (__kmp_storage_map) {
3829  __kmp_print_thread_storage_map(root_thread, gtid);
3830  }
3831  root_thread->th.th_info.ds.ds_gtid = gtid;
3832 #if OMPT_SUPPORT
3833  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3834 #endif
3835  root_thread->th.th_root = root;
3836  if (__kmp_env_consistency_check) {
3837  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3838  }
3839 #if USE_FAST_MEMORY
3840  __kmp_initialize_fast_memory(root_thread);
3841 #endif /* USE_FAST_MEMORY */
3842 
3843 #if KMP_USE_BGET
3844  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3845  __kmp_initialize_bget(root_thread);
3846 #endif
3847  __kmp_init_random(root_thread); // Initialize random number generator
3848  }
3849 
3850  /* setup the serial team held in reserve by the root thread */
3851  if (!root_thread->th.th_serial_team) {
3852  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3853  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3854  root_thread->th.th_serial_team = __kmp_allocate_team(
3855  root, 1, 1,
3856 #if OMPT_SUPPORT
3857  ompt_data_none, // root parallel id
3858 #endif
3859  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3860  }
3861  KMP_ASSERT(root_thread->th.th_serial_team);
3862  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3863  root_thread->th.th_serial_team));
3864 
3865  /* drop root_thread into place */
3866  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3867 
3868  root->r.r_root_team->t.t_threads[0] = root_thread;
3869  root->r.r_hot_team->t.t_threads[0] = root_thread;
3870  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3871  // AC: the team is created in reserve, not for execution (it is unused for now).
3872  root_thread->th.th_serial_team->t.t_serialized = 0;
3873  root->r.r_uber_thread = root_thread;
3874 
3875  /* initialize the thread, get it ready to go */
3876  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3877  TCW_4(__kmp_init_gtid, TRUE);
3878 
3879  /* prepare the primary thread for get_gtid() */
3880  __kmp_gtid_set_specific(gtid);
3881 
3882 #if USE_ITT_BUILD
3883  __kmp_itt_thread_name(gtid);
3884 #endif /* USE_ITT_BUILD */
3885 
3886 #ifdef KMP_TDATA_GTID
3887  __kmp_gtid = gtid;
3888 #endif
3889  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3890  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3891 
3892  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3893  "plain=%u\n",
3894  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3895  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3896  KMP_INIT_BARRIER_STATE));
3897  { // Initialize barrier data.
3898  int b;
3899  for (b = 0; b < bs_last_barrier; ++b) {
3900  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3901 #if USE_DEBUGGER
3902  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3903 #endif
3904  }
3905  }
3906  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3907  KMP_INIT_BARRIER_STATE);
3908 
3909 #if KMP_AFFINITY_SUPPORTED
3910  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3911  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3912  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3913  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3914 #endif /* KMP_AFFINITY_SUPPORTED */
3915  root_thread->th.th_def_allocator = __kmp_def_allocator;
3916  root_thread->th.th_prev_level = 0;
3917  root_thread->th.th_prev_num_threads = 1;
3918 
3919  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3920  tmp->cg_root = root_thread;
3921  tmp->cg_thread_limit = __kmp_cg_max_nth;
3922  tmp->cg_nthreads = 1;
3923  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3924  " cg_nthreads init to 1\n",
3925  root_thread, tmp));
3926  tmp->up = NULL;
3927  root_thread->th.th_cg_roots = tmp;
3928 
3929  __kmp_root_counter++;
3930 
3931 #if OMPT_SUPPORT
3932  if (!initial_thread && ompt_enabled.enabled) {
3933 
3934  kmp_info_t *root_thread = ompt_get_thread();
3935 
3936  ompt_set_thread_state(root_thread, ompt_state_overhead);
3937 
3938  if (ompt_enabled.ompt_callback_thread_begin) {
3939  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3940  ompt_thread_initial, __ompt_get_thread_data_internal());
3941  }
3942  ompt_data_t *task_data;
3943  ompt_data_t *parallel_data;
3944  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3945  NULL);
3946  if (ompt_enabled.ompt_callback_implicit_task) {
3947  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3948  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3949  }
3950 
3951  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3952  }
3953 #endif
3954 #if OMPD_SUPPORT
3955  if (ompd_state & OMPD_ENABLE_BP)
3956  ompd_bp_thread_begin();
3957 #endif
3958 
3959  KMP_MB();
3960  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3961 
3962  return gtid;
3963 }
3964 
3965 #if KMP_NESTED_HOT_TEAMS
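// Recursively free the nested hot teams kept by thr at this nesting level and
// below; returns the number of __kmp_threads entries released (a team's
// primary thread is not freed and is not counted).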
3966 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3967  const int max_level) {
3968  int i, n, nth;
3969  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3970  if (!hot_teams || !hot_teams[level].hot_team) {
3971  return 0;
3972  }
3973  KMP_DEBUG_ASSERT(level < max_level);
3974  kmp_team_t *team = hot_teams[level].hot_team;
3975  nth = hot_teams[level].hot_team_nth;
3976  n = nth - 1; // primary thread is not freed
3977  if (level < max_level - 1) {
3978  for (i = 0; i < nth; ++i) {
3979  kmp_info_t *th = team->t.t_threads[i];
3980  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3981  if (i > 0 && th->th.th_hot_teams) {
3982  __kmp_free(th->th.th_hot_teams);
3983  th->th.th_hot_teams = NULL;
3984  }
3985  }
3986  }
3987  __kmp_free_team(root, team, NULL);
3988  return n;
3989 }
3990 #endif
3991 
3992 // Resets a root thread and clears its root and hot teams.
3993 // Returns the number of __kmp_threads entries directly and indirectly freed.
3994 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3995  kmp_team_t *root_team = root->r.r_root_team;
3996  kmp_team_t *hot_team = root->r.r_hot_team;
3997  int n = hot_team->t.t_nproc;
3998  int i;
3999 
4000  KMP_DEBUG_ASSERT(!root->r.r_active);
4001 
4002  root->r.r_root_team = NULL;
4003  root->r.r_hot_team = NULL;
4004  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4005  // before call to __kmp_free_team().
4006  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4007 #if KMP_NESTED_HOT_TEAMS
4008  if (__kmp_hot_teams_max_level >
4009  0) { // need to free nested hot teams and their threads if any
4010  for (i = 0; i < hot_team->t.t_nproc; ++i) {
4011  kmp_info_t *th = hot_team->t.t_threads[i];
4012  if (__kmp_hot_teams_max_level > 1) {
4013  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4014  }
4015  if (th->th.th_hot_teams) {
4016  __kmp_free(th->th.th_hot_teams);
4017  th->th.th_hot_teams = NULL;
4018  }
4019  }
4020  }
4021 #endif
4022  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4023 
4024  // Before we can reap the thread, we need to make certain that all other
4025  // threads in the teams that had this root as ancestor have stopped trying to
4026  // steal tasks.
4027  if (__kmp_tasking_mode != tskm_immediate_exec) {
4028  __kmp_wait_to_unref_task_teams();
4029  }
4030 
4031 #if KMP_OS_WINDOWS
4032  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4033  KA_TRACE(
4034  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4035  "\n",
4036  (LPVOID) & (root->r.r_uber_thread->th),
4037  root->r.r_uber_thread->th.th_info.ds.ds_thread));
4038  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4039 #endif /* KMP_OS_WINDOWS */
4040 
4041 #if OMPD_SUPPORT
4042  if (ompd_state & OMPD_ENABLE_BP)
4043  ompd_bp_thread_end();
4044 #endif
4045 
4046 #if OMPT_SUPPORT
4047  ompt_data_t *task_data;
4048  ompt_data_t *parallel_data;
4049  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4050  NULL);
4051  if (ompt_enabled.ompt_callback_implicit_task) {
4052  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4053  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4054  }
4055  if (ompt_enabled.ompt_callback_thread_end) {
4056  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4057  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4058  }
4059 #endif
4060 
4061  TCW_4(__kmp_nth,
4062  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4063  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4064  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4065  " to %d\n",
4066  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4067  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4068  if (i == 1) {
4069  // need to free contention group structure
4070  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4071  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4072  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4073  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4074  root->r.r_uber_thread->th.th_cg_roots = NULL;
4075  }
4076  __kmp_reap_thread(root->r.r_uber_thread, 1);
4077 
4078  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4079  // it instead of freeing it.
4080  root->r.r_uber_thread = NULL;
4081  /* mark root as no longer in use */
4082  root->r.r_begin = FALSE;
4083 
4084  return n;
4085 }
4086 
4087 void __kmp_unregister_root_current_thread(int gtid) {
4088  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4089  /* this lock should be ok, since unregister_root_current_thread is never
4090  called during an abort, only during a normal close. furthermore, if you
4091  have the forkjoin lock, you should never try to get the initz lock */
4092  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4093  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4094  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4095  "exiting T#%d\n",
4096  gtid));
4097  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4098  return;
4099  }
4100  kmp_root_t *root = __kmp_root[gtid];
4101 
4102  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4103  KMP_ASSERT(KMP_UBER_GTID(gtid));
4104  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4105  KMP_ASSERT(root->r.r_active == FALSE);
4106 
4107  KMP_MB();
4108 
4109  kmp_info_t *thread = __kmp_threads[gtid];
4110  kmp_team_t *team = thread->th.th_team;
4111  kmp_task_team_t *task_team = thread->th.th_task_team;
4112 
4113  // we need to wait for the proxy tasks before finishing the thread
4114  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4115  task_team->tt.tt_hidden_helper_task_encountered)) {
4116 #if OMPT_SUPPORT
4117  // the runtime is shutting down so we won't report any events
4118  thread->th.ompt_thread_info.state = ompt_state_undefined;
4119 #endif
4120  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4121  }
4122 
4123  __kmp_reset_root(gtid, root);
4124 
4125  KMP_MB();
4126  KC_TRACE(10,
4127  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4128 
4129  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4130 }
4131 
4132 #if KMP_OS_WINDOWS
4133 /* __kmp_forkjoin_lock must be already held
4134  Unregisters a root thread that is not the current thread. Returns the number
4135  of __kmp_threads entries freed as a result. */
4136 static int __kmp_unregister_root_other_thread(int gtid) {
4137  kmp_root_t *root = __kmp_root[gtid];
4138  int r;
4139 
4140  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4141  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4142  KMP_ASSERT(KMP_UBER_GTID(gtid));
4143  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4144  KMP_ASSERT(root->r.r_active == FALSE);
4145 
4146  r = __kmp_reset_root(gtid, root);
4147  KC_TRACE(10,
4148  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4149  return r;
4150 }
4151 #endif
4152 
4153 #if KMP_DEBUG
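// Debug helper: print the calling thread's gtid, tid, thread/team/serial-team
// pointers, its current task, and the parent of its implicit task.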
4154 void __kmp_task_info() {
4155 
4156  kmp_int32 gtid = __kmp_entry_gtid();
4157  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4158  kmp_info_t *this_thr = __kmp_threads[gtid];
4159  kmp_team_t *steam = this_thr->th.th_serial_team;
4160  kmp_team_t *team = this_thr->th.th_team;
4161 
4162  __kmp_printf(
4163  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4164  "ptask=%p\n",
4165  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4166  team->t.t_implicit_task_taskdata[tid].td_parent);
4167 }
4168 #endif // KMP_DEBUG
4169 
4170 /* TODO optimize with one big memclr, take out what isn't needed, split
4171  responsibility to workers as much as possible, and delay initialization of
4172  features as much as possible */
4173 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4174  int tid, int gtid) {
4175  /* this_thr->th.th_info.ds.ds_gtid is setup in
4176  kmp_allocate_thread/create_worker.
4177  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4178  KMP_DEBUG_ASSERT(this_thr != NULL);
4179  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4180  KMP_DEBUG_ASSERT(team);
4181  KMP_DEBUG_ASSERT(team->t.t_threads);
4182  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4183  kmp_info_t *master = team->t.t_threads[0];
4184  KMP_DEBUG_ASSERT(master);
4185  KMP_DEBUG_ASSERT(master->th.th_root);
4186 
4187  KMP_MB();
4188 
4189  TCW_SYNC_PTR(this_thr->th.th_team, team);
4190 
4191  this_thr->th.th_info.ds.ds_tid = tid;
4192  this_thr->th.th_set_nproc = 0;
4193  if (__kmp_tasking_mode != tskm_immediate_exec)
4194  // When tasking is possible, threads are not safe to reap until they are
4195  // done tasking; this will be set when tasking code is exited in wait
4196  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4197  else // no tasking --> always safe to reap
4198  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4199  this_thr->th.th_set_proc_bind = proc_bind_default;
4200 #if KMP_AFFINITY_SUPPORTED
4201  this_thr->th.th_new_place = this_thr->th.th_current_place;
4202 #endif
4203  this_thr->th.th_root = master->th.th_root;
4204 
4205  /* setup the thread's cache of the team structure */
4206  this_thr->th.th_team_nproc = team->t.t_nproc;
4207  this_thr->th.th_team_master = master;
4208  this_thr->th.th_team_serialized = team->t.t_serialized;
4209 
4210  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4211 
4212  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4213  tid, gtid, this_thr, this_thr->th.th_current_task));
4214 
4215  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4216  team, tid, TRUE);
4217 
4218  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4219  tid, gtid, this_thr, this_thr->th.th_current_task));
4220  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4221  // __kmp_initialize_team()?
4222 
4223  /* TODO no worksharing in speculative threads */
4224  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4225 
4226  this_thr->th.th_local.this_construct = 0;
4227 
4228  if (!this_thr->th.th_pri_common) {
4229  this_thr->th.th_pri_common =
4230  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4231  if (__kmp_storage_map) {
4232  __kmp_print_storage_map_gtid(
4233  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4234  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4235  }
4236  this_thr->th.th_pri_head = NULL;
4237  }
4238 
4239  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4240  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4241  // Make new thread's CG root same as primary thread's
4242  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4243  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4244  if (tmp) {
4245  // worker changes CG, need to check if old CG should be freed
4246  int i = tmp->cg_nthreads--;
4247  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4248  " on node %p of thread %p to %d\n",
4249  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4250  if (i == 1) {
4251  __kmp_free(tmp); // last thread left CG --> free it
4252  }
4253  }
4254  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4255  // Increment new thread's CG root's counter to add the new thread
4256  this_thr->th.th_cg_roots->cg_nthreads++;
4257  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4258  " node %p of thread %p to %d\n",
4259  this_thr, this_thr->th.th_cg_roots,
4260  this_thr->th.th_cg_roots->cg_root,
4261  this_thr->th.th_cg_roots->cg_nthreads));
4262  this_thr->th.th_current_task->td_icvs.thread_limit =
4263  this_thr->th.th_cg_roots->cg_thread_limit;
4264  }
4265 
4266  /* Initialize dynamic dispatch */
4267  {
4268  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4269  // Use team max_nproc since this will never change for the team.
4270  size_t disp_size =
4271  sizeof(dispatch_private_info_t) *
4272  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4273  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4274  team->t.t_max_nproc));
4275  KMP_ASSERT(dispatch);
4276  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4277  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4278 
4279  dispatch->th_disp_index = 0;
4280  dispatch->th_doacross_buf_idx = 0;
4281  if (!dispatch->th_disp_buffer) {
4282  dispatch->th_disp_buffer =
4283  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4284 
4285  if (__kmp_storage_map) {
4286  __kmp_print_storage_map_gtid(
4287  gtid, &dispatch->th_disp_buffer[0],
4288  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4289  ? 1
4290  : __kmp_dispatch_num_buffers],
4291  disp_size,
4292  "th_%d.th_dispatch.th_disp_buffer "
4293  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4294  gtid, team->t.t_id, gtid);
4295  }
4296  } else {
4297  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4298  }
4299 
4300  dispatch->th_dispatch_pr_current = 0;
4301  dispatch->th_dispatch_sh_current = 0;
4302 
4303  dispatch->th_deo_fcn = 0; /* ORDERED */
4304  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4305  }
4306 
4307  this_thr->th.th_next_pool = NULL;
4308 
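  // Lazily allocate the task-state memo stack (initial depth of 4 entries)
  // and zero-initialize it.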
4309  if (!this_thr->th.th_task_state_memo_stack) {
4310  size_t i;
4311  this_thr->th.th_task_state_memo_stack =
4312  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4313  this_thr->th.th_task_state_top = 0;
4314  this_thr->th.th_task_state_stack_sz = 4;
4315  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4316  ++i) // zero init the stack
4317  this_thr->th.th_task_state_memo_stack[i] = 0;
4318  }
4319 
4320  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4321  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4322 
4323  KMP_MB();
4324 }
4325 
4326 /* allocate a new thread for the requesting team. this is only called from
4327  within a forkjoin critical section. we will first try to get an available
4328  thread from the thread pool. if none is available, we will fork a new one
4329  assuming we are able to create a new one. this should be assured, as the
4330  caller should check on this first. */
4331 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4332  int new_tid) {
4333  kmp_team_t *serial_team;
4334  kmp_info_t *new_thr;
4335  int new_gtid;
4336 
4337  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4338  KMP_DEBUG_ASSERT(root && team);
4339 #if !KMP_NESTED_HOT_TEAMS
4340  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4341 #endif
4342  KMP_MB();
4343 
4344  /* first, try to get one from the thread pool */
4345  if (__kmp_thread_pool) {
4346  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4347  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4348  if (new_thr == __kmp_thread_pool_insert_pt) {
4349  __kmp_thread_pool_insert_pt = NULL;
4350  }
4351  TCW_4(new_thr->th.th_in_pool, FALSE);
4352  __kmp_suspend_initialize_thread(new_thr);
4353  __kmp_lock_suspend_mx(new_thr);
4354  if (new_thr->th.th_active_in_pool == TRUE) {
4355  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4356  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4357  new_thr->th.th_active_in_pool = FALSE;
4358  }
4359  __kmp_unlock_suspend_mx(new_thr);
4360 
4361  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4362  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4363  KMP_ASSERT(!new_thr->th.th_team);
4364  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4365 
4366  /* setup the thread structure */
4367  __kmp_initialize_info(new_thr, team, new_tid,
4368  new_thr->th.th_info.ds.ds_gtid);
4369  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4370 
4371  TCW_4(__kmp_nth, __kmp_nth + 1);
4372 
4373  new_thr->th.th_task_state = 0;
4374  new_thr->th.th_task_state_top = 0;
4375  new_thr->th.th_task_state_stack_sz = 4;
4376 
4377  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4378  // Make sure pool thread has transitioned to waiting on own thread struct
4379  KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4380  // Thread activated in __kmp_allocate_team when increasing team size
4381  }
4382 
4383 #ifdef KMP_ADJUST_BLOCKTIME
4384  /* Adjust blocktime back to zero if necessary */
4385  /* Middle initialization might not have occurred yet */
4386  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4387  if (__kmp_nth > __kmp_avail_proc) {
4388  __kmp_zero_bt = TRUE;
4389  }
4390  }
4391 #endif /* KMP_ADJUST_BLOCKTIME */
4392 
4393 #if KMP_DEBUG
4394  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4395  // KMP_BARRIER_PARENT_FLAG.
4396  int b;
4397  kmp_balign_t *balign = new_thr->th.th_bar;
4398  for (b = 0; b < bs_last_barrier; ++b)
4399  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4400 #endif
4401 
4402  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4403  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4404 
4405  KMP_MB();
4406  return new_thr;
4407  }
4408 
4409  /* no, we'll fork a new one */
4410  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4411  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4412 
4413 #if KMP_USE_MONITOR
4414  // If this is the first worker thread the RTL is creating, then also
4415  // launch the monitor thread. We try to do this as early as possible.
4416  if (!TCR_4(__kmp_init_monitor)) {
4417  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4418  if (!TCR_4(__kmp_init_monitor)) {
4419  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4420  TCW_4(__kmp_init_monitor, 1);
4421  __kmp_create_monitor(&__kmp_monitor);
4422  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4423 #if KMP_OS_WINDOWS
4424  // AC: wait until monitor has started. This is a fix for CQ232808.
4425  // The reason is that if the library is loaded/unloaded in a loop with
4426  // small (parallel) work in between, then there is a high probability that
4427  // the monitor thread starts after the library shutdown. At shutdown it is
4428  // too late to cope with the problem, because when the primary thread is
4429  // in DllMain (process detach) the monitor has no chances to start (it is
4430  // blocked), and primary thread has no means to inform the monitor that
4431  // the library has gone, because all the memory which the monitor can
4432  // access is going to be released/reset.
4433  while (TCR_4(__kmp_init_monitor) < 2) {
4434  KMP_YIELD(TRUE);
4435  }
4436  KF_TRACE(10, ("after monitor thread has started\n"));
4437 #endif
4438  }
4439  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4440  }
4441 #endif
4442 
4443  KMP_MB();
4444 
4445  {
4446  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4447  ? 1
4448  : __kmp_hidden_helper_threads_num + 1;
4449 
4450  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4451  ++new_gtid) {
4452  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4453  }
4454 
4455  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4456  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4457  }
4458  }
4459 
4460  /* allocate space for it. */
4461  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4462 
4463  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4464 
4465 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4466  // suppress race conditions detection on synchronization flags in debug mode
4467  // this helps to analyze library internals eliminating false positives
4468  __itt_suppress_mark_range(
4469  __itt_suppress_range, __itt_suppress_threading_errors,
4470  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4471  __itt_suppress_mark_range(
4472  __itt_suppress_range, __itt_suppress_threading_errors,
4473  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4474 #if KMP_OS_WINDOWS
4475  __itt_suppress_mark_range(
4476  __itt_suppress_range, __itt_suppress_threading_errors,
4477  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4478 #else
4479  __itt_suppress_mark_range(__itt_suppress_range,
4480  __itt_suppress_threading_errors,
4481  &new_thr->th.th_suspend_init_count,
4482  sizeof(new_thr->th.th_suspend_init_count));
4483 #endif
4484  // TODO: check if we need to also suppress b_arrived flags
4485  __itt_suppress_mark_range(__itt_suppress_range,
4486  __itt_suppress_threading_errors,
4487  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4488  sizeof(new_thr->th.th_bar[0].bb.b_go));
4489  __itt_suppress_mark_range(__itt_suppress_range,
4490  __itt_suppress_threading_errors,
4491  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4492  sizeof(new_thr->th.th_bar[1].bb.b_go));
4493  __itt_suppress_mark_range(__itt_suppress_range,
4494  __itt_suppress_threading_errors,
4495  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4496  sizeof(new_thr->th.th_bar[2].bb.b_go));
4497 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4498  if (__kmp_storage_map) {
4499  __kmp_print_thread_storage_map(new_thr, new_gtid);
4500  }
4501 
4502  // add the reserve serialized team, initialized from the team's primary thread
4503  {
4504  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4505  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4506  new_thr->th.th_serial_team = serial_team =
4507  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4508 #if OMPT_SUPPORT
4509  ompt_data_none, // root parallel id
4510 #endif
4511  proc_bind_default, &r_icvs,
4512  0 USE_NESTED_HOT_ARG(NULL));
4513  }
4514  KMP_ASSERT(serial_team);
4515  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not for
4516  // execution (it is unused for now).
4517  serial_team->t.t_threads[0] = new_thr;
4518  KF_TRACE(10,
4519  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4520  new_thr));
4521 
4522  /* setup the thread structures */
4523  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4524 
4525 #if USE_FAST_MEMORY
4526  __kmp_initialize_fast_memory(new_thr);
4527 #endif /* USE_FAST_MEMORY */
4528 
4529 #if KMP_USE_BGET
4530  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4531  __kmp_initialize_bget(new_thr);
4532 #endif
4533 
4534  __kmp_init_random(new_thr); // Initialize random number generator
4535 
4536  /* Initialize these only once when thread is grabbed for a team allocation */
4537  KA_TRACE(20,
4538  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4539  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4540 
4541  int b;
4542  kmp_balign_t *balign = new_thr->th.th_bar;
4543  for (b = 0; b < bs_last_barrier; ++b) {
4544  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4545  balign[b].bb.team = NULL;
4546  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4547  balign[b].bb.use_oncore_barrier = 0;
4548  }
4549 
4550  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4551  new_thr->th.th_sleep_loc_type = flag_unset;
4552 
4553  new_thr->th.th_spin_here = FALSE;
4554  new_thr->th.th_next_waiting = 0;
4555 #if KMP_OS_UNIX
4556  new_thr->th.th_blocking = false;
4557 #endif
4558 
4559 #if KMP_AFFINITY_SUPPORTED
4560  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4561  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4562  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4563  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4564 #endif
4565  new_thr->th.th_def_allocator = __kmp_def_allocator;
4566  new_thr->th.th_prev_level = 0;
4567  new_thr->th.th_prev_num_threads = 1;
4568 
4569  TCW_4(new_thr->th.th_in_pool, FALSE);
4570  new_thr->th.th_active_in_pool = FALSE;
4571  TCW_4(new_thr->th.th_active, TRUE);
4572 
4573  /* adjust the global counters */
4574  __kmp_all_nth++;
4575  __kmp_nth++;
4576 
4577  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4578  // numbers of procs, and method #2 (keyed API call) for higher numbers.
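  // Illustrative example (assumed threshold): if __kmp_tls_gtid_min is 20,
  // the runtime keeps using the stack-search method (gtid mode 1) until the
  // 20th thread is registered, then switches every thread to the keyed-TLS
  // lookup (gtid mode 2), which is exactly what the code below selects.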
4579  if (__kmp_adjust_gtid_mode) {
4580  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4581  if (TCR_4(__kmp_gtid_mode) != 2) {
4582  TCW_4(__kmp_gtid_mode, 2);
4583  }
4584  } else {
4585  if (TCR_4(__kmp_gtid_mode) != 1) {
4586  TCW_4(__kmp_gtid_mode, 1);
4587  }
4588  }
4589  }
4590 
4591 #ifdef KMP_ADJUST_BLOCKTIME
4592  /* Adjust blocktime back to zero if necessary */
4593  /* Middle initialization might not have occurred yet */
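  // Rationale sketch (assumption, not stated here): once more runtime threads
  // exist than available processors, spin-waiting only wastes cycles, so the
  // effective blocktime is forced to zero and idle threads sleep immediately.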
4594  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4595  if (__kmp_nth > __kmp_avail_proc) {
4596  __kmp_zero_bt = TRUE;
4597  }
4598  }
4599 #endif /* KMP_ADJUST_BLOCKTIME */
4600 
4601  /* actually fork it and create the new worker thread */
4602  KF_TRACE(
4603  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4604  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4605  KF_TRACE(10,
4606  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4607 
4608  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4609  new_gtid));
4610  KMP_MB();
4611  return new_thr;
4612 }
4613 
4614 /* Reinitialize team for reuse.
4615  The hot team code calls this routine at every fork barrier, so the EPCC barrier
4616  tests are extremely sensitive to changes in it, esp. writes to the team
4617  struct, which cause a cache invalidation in all threads.
4618  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4619 static void __kmp_reinitialize_team(kmp_team_t *team,
4620  kmp_internal_control_t *new_icvs,
4621  ident_t *loc) {
4622  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4623  team->t.t_threads[0], team));
4624  KMP_DEBUG_ASSERT(team && new_icvs);
4625  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4626  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4627 
4628  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4629  // Copy ICVs to the primary thread's implicit taskdata
4630  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4631  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4632 
4633  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4634  team->t.t_threads[0], team));
4635 }
4636 
4637 /* Initialize the team data structure.
4638  This assumes the t_threads and t_max_nproc are already set.
4639  Also, we don't touch the arguments */
4640 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4641  kmp_internal_control_t *new_icvs,
4642  ident_t *loc) {
4643  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4644 
4645  /* verify */
4646  KMP_DEBUG_ASSERT(team);
4647  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4648  KMP_DEBUG_ASSERT(team->t.t_threads);
4649  KMP_MB();
4650 
4651  team->t.t_master_tid = 0; /* not needed */
4652  /* team->t.t_master_bar; not needed */
4653  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4654  team->t.t_nproc = new_nproc;
4655 
4656  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4657  team->t.t_next_pool = NULL;
4658  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4659  * up hot team */
4660 
4661  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4662  team->t.t_invoke = NULL; /* not needed */
4663 
4664  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4665  team->t.t_sched.sched = new_icvs->sched.sched;
4666 
4667 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4668  team->t.t_fp_control_saved = FALSE; /* not needed */
4669  team->t.t_x87_fpu_control_word = 0; /* not needed */
4670  team->t.t_mxcsr = 0; /* not needed */
4671 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4672 
4673  team->t.t_construct = 0;
4674 
4675  team->t.t_ordered.dt.t_value = 0;
4676  team->t.t_master_active = FALSE;
4677 
4678 #ifdef KMP_DEBUG
4679  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4680 #endif
4681 #if KMP_OS_WINDOWS
4682  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4683 #endif
4684 
4685  team->t.t_control_stack_top = NULL;
4686 
4687  __kmp_reinitialize_team(team, new_icvs, loc);
4688 
4689  KMP_MB();
4690  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4691 }
4692 
4693 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4694 /* Sets full mask for thread and returns old mask, no changes to structures. */
4695 static void
4696 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4697  if (KMP_AFFINITY_CAPABLE()) {
4698  int status;
4699  if (old_mask != NULL) {
4700  status = __kmp_get_system_affinity(old_mask, TRUE);
4701  int error = errno;
4702  if (status != 0) {
4703  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4704  __kmp_msg_null);
4705  }
4706  }
4707  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4708  }
4709 }
4710 #endif
4711 
4712 #if KMP_AFFINITY_SUPPORTED
4713 
4714 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4715 // It calculates the worker + primary thread's partition based upon the parent
4716 // thread's partition, and binds each worker to a thread in their partition.
4717 // The primary thread's partition should already include its current binding.
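// Illustrative example (assumed numbers): if the primary thread's partition is
// places [0,3], it currently sits on place 0, and the team has 4 threads, then
// proc_bind_primary puts every worker on place 0, proc_bind_close packs the
// workers onto places 1, 2, 3, and proc_bind_spread gives each thread its own
// single-place sub-partition [f,f].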
4718 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4719  // Do not partition places for the hidden helper team
4720  if (KMP_HIDDEN_HELPER_TEAM(team))
4721  return;
4722  // Copy the primary thread's place partition to the team struct
4723  kmp_info_t *master_th = team->t.t_threads[0];
4724  KMP_DEBUG_ASSERT(master_th != NULL);
4725  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4726  int first_place = master_th->th.th_first_place;
4727  int last_place = master_th->th.th_last_place;
4728  int masters_place = master_th->th.th_current_place;
4729  team->t.t_first_place = first_place;
4730  team->t.t_last_place = last_place;
4731 
4732  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4733  "bound to place %d partition = [%d,%d]\n",
4734  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4735  team->t.t_id, masters_place, first_place, last_place));
4736 
4737  switch (proc_bind) {
4738 
4739  case proc_bind_default:
4740  // Serial teams might have the proc_bind policy set to proc_bind_default.
4741  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4742  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4743  break;
4744 
4745  case proc_bind_primary: {
4746  int f;
4747  int n_th = team->t.t_nproc;
4748  for (f = 1; f < n_th; f++) {
4749  kmp_info_t *th = team->t.t_threads[f];
4750  KMP_DEBUG_ASSERT(th != NULL);
4751  th->th.th_first_place = first_place;
4752  th->th.th_last_place = last_place;
4753  th->th.th_new_place = masters_place;
4754  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4755  team->t.t_display_affinity != 1) {
4756  team->t.t_display_affinity = 1;
4757  }
4758 
4759  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4760  "partition = [%d,%d]\n",
4761  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4762  f, masters_place, first_place, last_place));
4763  }
4764  } break;
4765 
4766  case proc_bind_close: {
4767  int f;
4768  int n_th = team->t.t_nproc;
4769  int n_places;
4770  if (first_place <= last_place) {
4771  n_places = last_place - first_place + 1;
4772  } else {
4773  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4774  }
4775  if (n_th <= n_places) {
4776  int place = masters_place;
4777  for (f = 1; f < n_th; f++) {
4778  kmp_info_t *th = team->t.t_threads[f];
4779  KMP_DEBUG_ASSERT(th != NULL);
4780 
4781  if (place == last_place) {
4782  place = first_place;
4783  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4784  place = 0;
4785  } else {
4786  place++;
4787  }
4788  th->th.th_first_place = first_place;
4789  th->th.th_last_place = last_place;
4790  th->th.th_new_place = place;
4791  if (__kmp_display_affinity && place != th->th.th_current_place &&
4792  team->t.t_display_affinity != 1) {
4793  team->t.t_display_affinity = 1;
4794  }
4795 
4796  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4797  "partition = [%d,%d]\n",
4798  __kmp_gtid_from_thread(team->t.t_threads[f]),
4799  team->t.t_id, f, place, first_place, last_place));
4800  }
4801  } else {
4802  int S, rem, gap, s_count;
4803  S = n_th / n_places;
4804  s_count = 0;
4805  rem = n_th - (S * n_places);
4806  gap = rem > 0 ? n_places / rem : n_places;
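  // Worked example (assumed numbers): 10 threads on a 4-place partition give
  // S = 2, rem = 2, gap = 2, and the loop below places 3, 2, 3, 2 threads on
  // consecutive places, starting at (and finally wrapping back to) the
  // primary thread's place.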
4807  int place = masters_place;
4808  int gap_ct = gap;
4809  for (f = 0; f < n_th; f++) {
4810  kmp_info_t *th = team->t.t_threads[f];
4811  KMP_DEBUG_ASSERT(th != NULL);
4812 
4813  th->th.th_first_place = first_place;
4814  th->th.th_last_place = last_place;
4815  th->th.th_new_place = place;
4816  if (__kmp_display_affinity && place != th->th.th_current_place &&
4817  team->t.t_display_affinity != 1) {
4818  team->t.t_display_affinity = 1;
4819  }
4820  s_count++;
4821 
4822  if ((s_count == S) && rem && (gap_ct == gap)) {
4823  // do nothing, add an extra thread to place on next iteration
4824  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4825  // we added an extra thread to this place; move to next place
4826  if (place == last_place) {
4827  place = first_place;
4828  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4829  place = 0;
4830  } else {
4831  place++;
4832  }
4833  s_count = 0;
4834  gap_ct = 1;
4835  rem--;
4836  } else if (s_count == S) { // place full; don't add extra
4837  if (place == last_place) {
4838  place = first_place;
4839  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4840  place = 0;
4841  } else {
4842  place++;
4843  }
4844  gap_ct++;
4845  s_count = 0;
4846  }
4847 
4848  KA_TRACE(100,
4849  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4850  "partition = [%d,%d]\n",
4851  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4852  th->th.th_new_place, first_place, last_place));
4853  }
4854  KMP_DEBUG_ASSERT(place == masters_place);
4855  }
4856  } break;
4857 
4858  case proc_bind_spread: {
4859  int f;
4860  int n_th = team->t.t_nproc;
4861  int n_places;
4862  int thidx;
4863  if (first_place <= last_place) {
4864  n_places = last_place - first_place + 1;
4865  } else {
4866  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4867  }
4868  if (n_th <= n_places) {
4869  int place = -1;
4870 
4871  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4872  int S = n_places / n_th;
4873  int s_count, rem, gap, gap_ct;
4874 
4875  place = masters_place;
4876  rem = n_places - n_th * S;
4877  gap = rem ? n_th / rem : 1;
4878  gap_ct = gap;
4879  thidx = n_th;
4880  if (update_master_only == 1)
4881  thidx = 1;
4882  for (f = 0; f < thidx; f++) {
4883  kmp_info_t *th = team->t.t_threads[f];
4884  KMP_DEBUG_ASSERT(th != NULL);
4885 
4886  th->th.th_first_place = place;
4887  th->th.th_new_place = place;
4888  if (__kmp_display_affinity && place != th->th.th_current_place &&
4889  team->t.t_display_affinity != 1) {
4890  team->t.t_display_affinity = 1;
4891  }
4892  s_count = 1;
4893  while (s_count < S) {
4894  if (place == last_place) {
4895  place = first_place;
4896  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4897  place = 0;
4898  } else {
4899  place++;
4900  }
4901  s_count++;
4902  }
4903  if (rem && (gap_ct == gap)) {
4904  if (place == last_place) {
4905  place = first_place;
4906  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4907  place = 0;
4908  } else {
4909  place++;
4910  }
4911  rem--;
4912  gap_ct = 0;
4913  }
4914  th->th.th_last_place = place;
4915  gap_ct++;
4916 
4917  if (place == last_place) {
4918  place = first_place;
4919  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4920  place = 0;
4921  } else {
4922  place++;
4923  }
4924 
4925  KA_TRACE(100,
4926  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4927  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4928  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4929  f, th->th.th_new_place, th->th.th_first_place,
4930  th->th.th_last_place, __kmp_affinity_num_masks));
4931  }
4932  } else {
4933  /* With a uniform space of available computation places we can create
4934  T partitions of roughly P/T places each and put each thread into the
4935  first place of its partition. */
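  // Worked example (assumed numbers): masters_place = 0, n_places = 8 and
  // n_th = 4 give spacing = 9/4 = 2.25, so the threads get the sub-partitions
  // [0,1], [2,3], [4,5], [6,7] and are bound to places 0, 2, 4, 6 (the first
  // place of each sub-partition).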
4936  double current = static_cast<double>(masters_place);
4937  double spacing =
4938  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4939  int first, last;
4940  kmp_info_t *th;
4941 
4942  thidx = n_th + 1;
4943  if (update_master_only == 1)
4944  thidx = 1;
4945  for (f = 0; f < thidx; f++) {
4946  first = static_cast<int>(current);
4947  last = static_cast<int>(current + spacing) - 1;
4948  KMP_DEBUG_ASSERT(last >= first);
4949  if (first >= n_places) {
4950  if (masters_place) {
4951  first -= n_places;
4952  last -= n_places;
4953  if (first == (masters_place + 1)) {
4954  KMP_DEBUG_ASSERT(f == n_th);
4955  first--;
4956  }
4957  if (last == masters_place) {
4958  KMP_DEBUG_ASSERT(f == (n_th - 1));
4959  last--;
4960  }
4961  } else {
4962  KMP_DEBUG_ASSERT(f == n_th);
4963  first = 0;
4964  last = 0;
4965  }
4966  }
4967  if (last >= n_places) {
4968  last = (n_places - 1);
4969  }
4970  place = first;
4971  current += spacing;
4972  if (f < n_th) {
4973  KMP_DEBUG_ASSERT(0 <= first);
4974  KMP_DEBUG_ASSERT(n_places > first);
4975  KMP_DEBUG_ASSERT(0 <= last);
4976  KMP_DEBUG_ASSERT(n_places > last);
4977  KMP_DEBUG_ASSERT(last_place >= first_place);
4978  th = team->t.t_threads[f];
4979  KMP_DEBUG_ASSERT(th);
4980  th->th.th_first_place = first;
4981  th->th.th_new_place = place;
4982  th->th.th_last_place = last;
4983  if (__kmp_display_affinity && place != th->th.th_current_place &&
4984  team->t.t_display_affinity != 1) {
4985  team->t.t_display_affinity = 1;
4986  }
4987  KA_TRACE(100,
4988  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4989  "partition = [%d,%d], spacing = %.4f\n",
4990  __kmp_gtid_from_thread(team->t.t_threads[f]),
4991  team->t.t_id, f, th->th.th_new_place,
4992  th->th.th_first_place, th->th.th_last_place, spacing));
4993  }
4994  }
4995  }
4996  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4997  } else {
4998  int S, rem, gap, s_count;
4999  S = n_th / n_places;
5000  s_count = 0;
5001  rem = n_th - (S * n_places);
5002  gap = rem > 0 ? n_places / rem : n_places;
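  // Same distribution math as the oversubscribed proc_bind_close case above;
  // e.g. (assumed numbers) 10 threads on a 4-place partition again yield
  // per-place counts of 3, 2, 3, 2, except that here each thread's partition
  // is narrowed to its single place (th_first_place == th_last_place).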
5003  int place = masters_place;
5004  int gap_ct = gap;
5005  thidx = n_th;
5006  if (update_master_only == 1)
5007  thidx = 1;
5008  for (f = 0; f < thidx; f++) {
5009  kmp_info_t *th = team->t.t_threads[f];
5010  KMP_DEBUG_ASSERT(th != NULL);
5011 
5012  th->th.th_first_place = place;
5013  th->th.th_last_place = place;
5014  th->th.th_new_place = place;
5015  if (__kmp_display_affinity && place != th->th.th_current_place &&
5016  team->t.t_display_affinity != 1) {
5017  team->t.t_display_affinity = 1;
5018  }
5019  s_count++;
5020 
5021  if ((s_count == S) && rem && (gap_ct == gap)) {
5022  // do nothing, add an extra thread to place on next iteration
5023  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5024  // we added an extra thread to this place; move on to next place
5025  if (place == last_place) {
5026  place = first_place;
5027  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5028  place = 0;
5029  } else {
5030  place++;
5031  }
5032  s_count = 0;
5033  gap_ct = 1;
5034  rem--;
5035  } else if (s_count == S) { // place is full; don't add extra thread
5036  if (place == last_place) {
5037  place = first_place;
5038  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5039  place = 0;
5040  } else {
5041  place++;
5042  }
5043  gap_ct++;
5044  s_count = 0;
5045  }
5046 
5047  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5048  "partition = [%d,%d]\n",
5049  __kmp_gtid_from_thread(team->t.t_threads[f]),
5050  team->t.t_id, f, th->th.th_new_place,
5051  th->th.th_first_place, th->th.th_last_place));
5052  }
5053  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5054  }
5055  } break;
5056 
5057  default:
5058  break;
5059  }
5060 
5061  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5062 }
5063 
5064 #endif // KMP_AFFINITY_SUPPORTED
5065 
5066 /* allocate a new team data structure to use. take one off of the free pool if
5067  available */
5068 kmp_team_t *
5069 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5070 #if OMPT_SUPPORT
5071  ompt_data_t ompt_parallel_data,
5072 #endif
5073  kmp_proc_bind_t new_proc_bind,
5074  kmp_internal_control_t *new_icvs,
5075  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5076  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5077  int f;
5078  kmp_team_t *team;
5079  int use_hot_team = !root->r.r_active;
5080  int level = 0;
5081  int do_place_partition = 1;
5082 
5083  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5084  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5085  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5086  KMP_MB();
5087 
5088 #if KMP_NESTED_HOT_TEAMS
5089  kmp_hot_team_ptr_t *hot_teams;
5090  if (master) {
5091  team = master->th.th_team;
5092  level = team->t.t_active_level;
5093  if (master->th.th_teams_microtask) { // in teams construct?
5094  if (master->th.th_teams_size.nteams > 1 &&
5095  ( // #teams > 1
5096  team->t.t_pkfn ==
5097  (microtask_t)__kmp_teams_master || // inner fork of the teams
5098  master->th.th_teams_level <
5099  team->t.t_level)) { // or nested parallel inside the teams
5100  ++level; // not increment if #teams==1, or for outer fork of the teams;
5101  // increment otherwise
5102  }
5103  // Do not perform the place partition if inner fork of the teams
5104  // Wait until nested parallel region encountered inside teams construct
5105  if ((master->th.th_teams_size.nteams == 1 &&
5106  master->th.th_teams_level >= team->t.t_level) ||
5107  (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5108  do_place_partition = 0;
5109  }
5110  hot_teams = master->th.th_hot_teams;
5111  if (level < __kmp_hot_teams_max_level && hot_teams &&
5112  hot_teams[level].hot_team) {
5113  // hot team has already been allocated for given level
5114  use_hot_team = 1;
5115  } else {
5116  use_hot_team = 0;
5117  }
5118  } else {
5119  // check we won't access uninitialized hot_teams, just in case
5120  KMP_DEBUG_ASSERT(new_nproc == 1);
5121  }
5122 #endif
5123  // Optimization to use a "hot" team
5124  if (use_hot_team && new_nproc > 1) {
5125  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5126 #if KMP_NESTED_HOT_TEAMS
5127  team = hot_teams[level].hot_team;
5128 #else
5129  team = root->r.r_hot_team;
5130 #endif
5131 #if KMP_DEBUG
5132  if (__kmp_tasking_mode != tskm_immediate_exec) {
5133  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5134  "task_team[1] = %p before reinit\n",
5135  team->t.t_task_team[0], team->t.t_task_team[1]));
5136  }
5137 #endif
5138 
5139  if (team->t.t_nproc != new_nproc &&
5140  __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5141  // Distributed barrier may need a resize
5142  int old_nthr = team->t.t_nproc;
5143  __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5144  }
5145 
5146  // If not doing the place partition, then reset the team's proc bind
5147  // to indicate that partitioning of all threads still needs to take place
5148  if (do_place_partition == 0)
5149  team->t.t_proc_bind = proc_bind_default;
5150  // Has the number of threads changed?
5151  /* Let's assume the most common case is that the number of threads is
5152  unchanged, and put that case first. */
5153  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5154  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5155  // This case can mean that omp_set_num_threads() was called and the hot
5156  // team size was already reduced, so we check the special flag
5157  if (team->t.t_size_changed == -1) {
5158  team->t.t_size_changed = 1;
5159  } else {
5160  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5161  }
5162 
5163  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5164  kmp_r_sched_t new_sched = new_icvs->sched;
5165  // set primary thread's schedule as new run-time schedule
5166  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5167 
5168  __kmp_reinitialize_team(team, new_icvs,
5169  root->r.r_uber_thread->th.th_ident);
5170 
5171  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5172  team->t.t_threads[0], team));
5173  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5174 
5175 #if KMP_AFFINITY_SUPPORTED
5176  if ((team->t.t_size_changed == 0) &&
5177  (team->t.t_proc_bind == new_proc_bind)) {
5178  if (new_proc_bind == proc_bind_spread) {
5179  if (do_place_partition) {
5180  // add flag to update only master for spread
5181  __kmp_partition_places(team, 1);
5182  }
5183  }
5184  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5185  "proc_bind = %d, partition = [%d,%d]\n",
5186  team->t.t_id, new_proc_bind, team->t.t_first_place,
5187  team->t.t_last_place));
5188  } else {
5189  if (do_place_partition) {
5190  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5191  __kmp_partition_places(team);
5192  }
5193  }
5194 #else
5195  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5196 #endif /* KMP_AFFINITY_SUPPORTED */
5197  } else if (team->t.t_nproc > new_nproc) {
5198  KA_TRACE(20,
5199  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5200  new_nproc));
5201 
5202  team->t.t_size_changed = 1;
5203  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5204  // Barrier size already reduced earlier in this function
5205  // Activate team threads via th_used_in_team
5206  __kmp_add_threads_to_team(team, new_nproc);
5207  }
5208 #if KMP_NESTED_HOT_TEAMS
5209  if (__kmp_hot_teams_mode == 0) {
5210  // AC: saved number of threads should correspond to team's value in this
5211  // mode, can be bigger in mode 1, when hot team has threads in reserve
5212  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5213  hot_teams[level].hot_team_nth = new_nproc;
5214 #endif // KMP_NESTED_HOT_TEAMS
5215  /* release the extra threads we don't need any more */
5216  for (f = new_nproc; f < team->t.t_nproc; f++) {
5217  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5218  if (__kmp_tasking_mode != tskm_immediate_exec) {
5219  // When decreasing team size, threads no longer in the team should
5220  // unref task team.
5221  team->t.t_threads[f]->th.th_task_team = NULL;
5222  }
5223  __kmp_free_thread(team->t.t_threads[f]);
5224  team->t.t_threads[f] = NULL;
5225  }
5226 #if KMP_NESTED_HOT_TEAMS
5227  } // (__kmp_hot_teams_mode == 0)
5228  else {
5229  // When keeping extra threads in team, switch threads to wait on own
5230  // b_go flag
5231  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5232  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5233  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5234  for (int b = 0; b < bs_last_barrier; ++b) {
5235  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5236  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5237  }
5238  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5239  }
5240  }
5241  }
5242 #endif // KMP_NESTED_HOT_TEAMS
5243  team->t.t_nproc = new_nproc;
5244  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5245  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5246  __kmp_reinitialize_team(team, new_icvs,
5247  root->r.r_uber_thread->th.th_ident);
5248 
5249  // Update remaining threads
5250  for (f = 0; f < new_nproc; ++f) {
5251  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5252  }
5253 
5254  // restore the current task state of the primary thread: should be the
5255  // implicit task
5256  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5257  team->t.t_threads[0], team));
5258 
5259  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5260 
5261 #ifdef KMP_DEBUG
5262  for (f = 0; f < team->t.t_nproc; f++) {
5263  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5264  team->t.t_threads[f]->th.th_team_nproc ==
5265  team->t.t_nproc);
5266  }
5267 #endif
5268 
5269  if (do_place_partition) {
5270  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5271 #if KMP_AFFINITY_SUPPORTED
5272  __kmp_partition_places(team);
5273 #endif
5274  }
5275  } else { // team->t.t_nproc < new_nproc
5276 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5277  kmp_affin_mask_t *old_mask;
5278  if (KMP_AFFINITY_CAPABLE()) {
5279  KMP_CPU_ALLOC(old_mask);
5280  }
5281 #endif
5282 
5283  KA_TRACE(20,
5284  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5285  new_nproc));
5286  int old_nproc = team->t.t_nproc; // save old value and use to update only
5287  team->t.t_size_changed = 1;
5288 
5289 #if KMP_NESTED_HOT_TEAMS
5290  int avail_threads = hot_teams[level].hot_team_nth;
5291  if (new_nproc < avail_threads)
5292  avail_threads = new_nproc;
5293  kmp_info_t **other_threads = team->t.t_threads;
5294  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5295  // Adjust barrier data of reserved threads (if any) of the team
5296  // Other data will be set in __kmp_initialize_info() below.
5297  int b;
5298  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5299  for (b = 0; b < bs_last_barrier; ++b) {
5300  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5301  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5302 #if USE_DEBUGGER
5303  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5304 #endif
5305  }
5306  }
5307  if (hot_teams[level].hot_team_nth >= new_nproc) {
5308  // we have all needed threads in reserve, no need to allocate any
5309  // this is only possible in mode 1; there can be no reserved threads in mode 0
5310  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5311  team->t.t_nproc = new_nproc; // just get reserved threads involved
5312  } else {
5313  // We may have some threads in reserve, but not enough;
5314  // get reserved threads involved if any.
5315  team->t.t_nproc = hot_teams[level].hot_team_nth;
5316  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5317 #endif // KMP_NESTED_HOT_TEAMS
5318  if (team->t.t_max_nproc < new_nproc) {
5319  /* reallocate larger arrays */
5320  __kmp_reallocate_team_arrays(team, new_nproc);
5321  __kmp_reinitialize_team(team, new_icvs, NULL);
5322  }
5323 
5324 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5325  /* Temporarily set full mask for primary thread before creation of
5326  workers. The reason is that workers inherit the affinity from the
5327  primary thread, so if a lot of workers are created on a single
5328  core quickly, they don't get a chance to set their own affinity for
5329  a long time. */
5330  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5331 #endif
5332 
5333  /* allocate new threads for the hot team */
5334  for (f = team->t.t_nproc; f < new_nproc; f++) {
5335  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5336  KMP_DEBUG_ASSERT(new_worker);
5337  team->t.t_threads[f] = new_worker;
5338 
5339  KA_TRACE(20,
5340  ("__kmp_allocate_team: team %d init T#%d arrived: "
5341  "join=%llu, plain=%llu\n",
5342  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5343  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5344  team->t.t_bar[bs_plain_barrier].b_arrived));
5345 
5346  { // Initialize barrier data for new threads.
5347  int b;
5348  kmp_balign_t *balign = new_worker->th.th_bar;
5349  for (b = 0; b < bs_last_barrier; ++b) {
5350  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5351  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5352  KMP_BARRIER_PARENT_FLAG);
5353 #if USE_DEBUGGER
5354  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5355 #endif
5356  }
5357  }
5358  }
5359 
5360 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5361  if (KMP_AFFINITY_CAPABLE()) {
5362  /* Restore initial primary thread's affinity mask */
5363  __kmp_set_system_affinity(old_mask, TRUE);
5364  KMP_CPU_FREE(old_mask);
5365  }
5366 #endif
5367 #if KMP_NESTED_HOT_TEAMS
5368  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5369 #endif // KMP_NESTED_HOT_TEAMS
5370  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5371  // Barrier size already increased earlier in this function
5372  // Activate team threads via th_used_in_team
5373  __kmp_add_threads_to_team(team, new_nproc);
5374  }
5375  /* make sure everyone is synchronized */
5376  // new threads below
5377  __kmp_initialize_team(team, new_nproc, new_icvs,
5378  root->r.r_uber_thread->th.th_ident);
5379 
5380  /* reinitialize the threads */
5381  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5382  for (f = 0; f < team->t.t_nproc; ++f)
5383  __kmp_initialize_info(team->t.t_threads[f], team, f,
5384  __kmp_gtid_from_tid(f, team));
5385 
5386  if (level) { // set th_task_state for new threads in nested hot team
5387  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5388  // only need to set the th_task_state for the new threads. th_task_state
5389  // for primary thread will not be accurate until after this in
5390  // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5391  // get the correct value.
5392  for (f = old_nproc; f < team->t.t_nproc; ++f)
5393  team->t.t_threads[f]->th.th_task_state =
5394  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5395  } else { // set th_task_state for new threads in non-nested hot team
5396  // copy primary thread's state
5397  kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5398  for (f = old_nproc; f < team->t.t_nproc; ++f)
5399  team->t.t_threads[f]->th.th_task_state = old_state;
5400  }
5401 
5402 #ifdef KMP_DEBUG
5403  for (f = 0; f < team->t.t_nproc; ++f) {
5404  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5405  team->t.t_threads[f]->th.th_team_nproc ==
5406  team->t.t_nproc);
5407  }
5408 #endif
5409 
5410  if (do_place_partition) {
5411  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5412 #if KMP_AFFINITY_SUPPORTED
5413  __kmp_partition_places(team);
5414 #endif
5415  }
5416  } // Check changes in number of threads
5417 
5418  kmp_info_t *master = team->t.t_threads[0];
5419  if (master->th.th_teams_microtask) {
5420  for (f = 1; f < new_nproc; ++f) {
5421  // propagate teams construct specific info to workers
5422  kmp_info_t *thr = team->t.t_threads[f];
5423  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5424  thr->th.th_teams_level = master->th.th_teams_level;
5425  thr->th.th_teams_size = master->th.th_teams_size;
5426  }
5427  }
5428 #if KMP_NESTED_HOT_TEAMS
5429  if (level) {
5430  // Sync barrier state for nested hot teams, not needed for outermost hot
5431  // team.
5432  for (f = 1; f < new_nproc; ++f) {
5433  kmp_info_t *thr = team->t.t_threads[f];
5434  int b;
5435  kmp_balign_t *balign = thr->th.th_bar;
5436  for (b = 0; b < bs_last_barrier; ++b) {
5437  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5438  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5439 #if USE_DEBUGGER
5440  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5441 #endif
5442  }
5443  }
5444  }
5445 #endif // KMP_NESTED_HOT_TEAMS
5446 
5447  /* reallocate space for arguments if necessary */
5448  __kmp_alloc_argv_entries(argc, team, TRUE);
5449  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5450  // The hot team re-uses the previous task team,
5451  // if untouched during the previous release->gather phase.
5452 
5453  KF_TRACE(10, (" hot_team = %p\n", team));
5454 
5455 #if KMP_DEBUG
5456  if (__kmp_tasking_mode != tskm_immediate_exec) {
5457  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5458  "task_team[1] = %p after reinit\n",
5459  team->t.t_task_team[0], team->t.t_task_team[1]));
5460  }
5461 #endif
5462 
5463 #if OMPT_SUPPORT
5464  __ompt_team_assign_id(team, ompt_parallel_data);
5465 #endif
5466 
5467  KMP_MB();
5468 
5469  return team;
5470  }
5471 
5472  /* next, let's try to take one from the team pool */
5473  KMP_MB();
5474  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5475  /* TODO: consider resizing undersized teams instead of reaping them, now
5476  that we have a resizing mechanism */
5477  if (team->t.t_max_nproc >= max_nproc) {
5478  /* take this team from the team pool */
5479  __kmp_team_pool = team->t.t_next_pool;
5480 
5481  if (max_nproc > 1 &&
5482  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5483  if (!team->t.b) { // Allocate barrier structure
5484  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5485  }
5486  }
5487 
5488  /* setup the team for fresh use */
5489  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5490 
5491  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5492  "task_team[1] %p to NULL\n",
5493  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5494  team->t.t_task_team[0] = NULL;
5495  team->t.t_task_team[1] = NULL;
5496 
5497  /* reallocate space for arguments if necessary */
5498  __kmp_alloc_argv_entries(argc, team, TRUE);
5499  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5500 
5501  KA_TRACE(
5502  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5503  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5504  { // Initialize barrier data.
5505  int b;
5506  for (b = 0; b < bs_last_barrier; ++b) {
5507  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5508 #if USE_DEBUGGER
5509  team->t.t_bar[b].b_master_arrived = 0;
5510  team->t.t_bar[b].b_team_arrived = 0;
5511 #endif
5512  }
5513  }
5514 
5515  team->t.t_proc_bind = new_proc_bind;
5516 
5517  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5518  team->t.t_id));
5519 
5520 #if OMPT_SUPPORT
5521  __ompt_team_assign_id(team, ompt_parallel_data);
5522 #endif
5523 
5524  KMP_MB();
5525 
5526  return team;
5527  }
5528 
5529  /* reap team if it is too small, then loop back and check the next one */
5530  // not sure if this is wise, but it will be redone during the hot-teams
5531  // rewrite.
5532  /* TODO: Use technique to find the right size hot-team, don't reap them */
5533  team = __kmp_reap_team(team);
5534  __kmp_team_pool = team;
5535  }
5536 
5537  /* nothing available in the pool, no matter, make a new team! */
5538  KMP_MB();
5539  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5540 
5541  /* and set it up */
5542  team->t.t_max_nproc = max_nproc;
5543  if (max_nproc > 1 &&
5544  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5545  // Allocate barrier structure
5546  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5547  }
5548 
5549  /* NOTE well, for some reason allocating one big buffer and dividing it up
5550  seems to really hurt performance a lot on the P4, so, let's not use this */
5551  __kmp_allocate_team_arrays(team, max_nproc);
5552 
5553  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5554  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5555 
5556  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5557  "%p to NULL\n",
5558  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5559  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5560  // memory, no need to duplicate
5561  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5562  // memory, no need to duplicate
5563 
5564  if (__kmp_storage_map) {
5565  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5566  }
5567 
5568  /* allocate space for arguments */
5569  __kmp_alloc_argv_entries(argc, team, FALSE);
5570  team->t.t_argc = argc;
5571 
5572  KA_TRACE(20,
5573  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5574  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5575  { // Initialize barrier data.
5576  int b;
5577  for (b = 0; b < bs_last_barrier; ++b) {
5578  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5579 #if USE_DEBUGGER
5580  team->t.t_bar[b].b_master_arrived = 0;
5581  team->t.t_bar[b].b_team_arrived = 0;
5582 #endif
5583  }
5584  }
5585 
5586  team->t.t_proc_bind = new_proc_bind;
5587 
5588 #if OMPT_SUPPORT
5589  __ompt_team_assign_id(team, ompt_parallel_data);
5590  team->t.ompt_serialized_team_info = NULL;
5591 #endif
5592 
5593  KMP_MB();
5594 
5595  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5596  team->t.t_id));
5597 
5598  return team;
5599 }
5600 
5601 /* TODO implement hot-teams at all levels */
5602 /* TODO implement lazy thread release on demand (disband request) */
5603 
5604 /* free the team. return it to the team pool. release all the threads
5605  * associated with it */
5606 void __kmp_free_team(kmp_root_t *root,
5607  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5608  int f;
5609  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5610  team->t.t_id));
5611 
5612  /* verify state */
5613  KMP_DEBUG_ASSERT(root);
5614  KMP_DEBUG_ASSERT(team);
5615  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5616  KMP_DEBUG_ASSERT(team->t.t_threads);
5617 
5618  int use_hot_team = team == root->r.r_hot_team;
5619 #if KMP_NESTED_HOT_TEAMS
5620  int level;
5621  if (master) {
5622  level = team->t.t_active_level - 1;
5623  if (master->th.th_teams_microtask) { // in teams construct?
5624  if (master->th.th_teams_size.nteams > 1) {
5625  ++level; // level was not increased in teams construct for
5626  // team_of_masters
5627  }
5628  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5629  master->th.th_teams_level == team->t.t_level) {
5630  ++level; // level was not increased in teams construct for
5631  // team_of_workers before the parallel
5632  } // team->t.t_level will be increased inside parallel
5633  }
5634 #if KMP_DEBUG
5635  kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5636 #endif
5637  if (level < __kmp_hot_teams_max_level) {
5638  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5639  use_hot_team = 1;
5640  }
5641  }
5642 #endif // KMP_NESTED_HOT_TEAMS
5643 
5644  /* team is done working */
5645  TCW_SYNC_PTR(team->t.t_pkfn,
5646  NULL); // Important for Debugging Support Library.
5647 #if KMP_OS_WINDOWS
5648  team->t.t_copyin_counter = 0; // init counter for possible reuse
5649 #endif
5650  // Do not reset pointer to parent team to NULL for hot teams.
5651 
5652  /* if we are non-hot team, release our threads */
5653  if (!use_hot_team) {
5654  if (__kmp_tasking_mode != tskm_immediate_exec) {
5655  // Wait for threads to reach reapable state
5656  for (f = 1; f < team->t.t_nproc; ++f) {
5657  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5658  kmp_info_t *th = team->t.t_threads[f];
5659  volatile kmp_uint32 *state = &th->th.th_reap_state;
5660  while (*state != KMP_SAFE_TO_REAP) {
5661 #if KMP_OS_WINDOWS
5662  // On Windows a thread can be killed at any time, check this
5663  DWORD ecode;
5664  if (!__kmp_is_thread_alive(th, &ecode)) {
5665  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5666  break;
5667  }
5668 #endif
5669  // first check if thread is sleeping
5670  kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5671  if (fl.is_sleeping())
5672  fl.resume(__kmp_gtid_from_thread(th));
5673  KMP_CPU_PAUSE();
5674  }
5675  }
5676 
5677  // Delete task teams
5678  int tt_idx;
5679  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5680  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5681  if (task_team != NULL) {
5682  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5683  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5684  team->t.t_threads[f]->th.th_task_team = NULL;
5685  }
5686  KA_TRACE(
5687  20,
5688  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5689  __kmp_get_gtid(), task_team, team->t.t_id));
5690 #if KMP_NESTED_HOT_TEAMS
5691  __kmp_free_task_team(master, task_team);
5692 #endif
5693  team->t.t_task_team[tt_idx] = NULL;
5694  }
5695  }
5696  }
5697 
5698  // Reset pointer to parent team only for non-hot teams.
5699  team->t.t_parent = NULL;
5700  team->t.t_level = 0;
5701  team->t.t_active_level = 0;
5702 
5703  /* free the worker threads */
5704  for (f = 1; f < team->t.t_nproc; ++f) {
5705  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5706  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5707  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5708  1, 2);
5709  }
5710  __kmp_free_thread(team->t.t_threads[f]);
5711  }
5712 
5713  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5714  if (team->t.b) {
5715  // wake up thread at old location
5716  team->t.b->go_release();
5717  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5718  for (f = 1; f < team->t.t_nproc; ++f) {
5719  if (team->t.b->sleep[f].sleep) {
5720  __kmp_atomic_resume_64(
5721  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5722  (kmp_atomic_flag_64<> *)NULL);
5723  }
5724  }
5725  }
5726  // Wait for threads to be removed from team
5727  for (int f = 1; f < team->t.t_nproc; ++f) {
5728  while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5729  KMP_CPU_PAUSE();
5730  }
5731  }
5732  }
5733 
5734  for (f = 1; f < team->t.t_nproc; ++f) {
5735  team->t.t_threads[f] = NULL;
5736  }
5737 
5738  if (team->t.t_max_nproc > 1 &&
5739  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5740  distributedBarrier::deallocate(team->t.b);
5741  team->t.b = NULL;
5742  }
5743  /* put the team back in the team pool */
5744  /* TODO limit size of team pool, call reap_team if pool too large */
5745  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5746  __kmp_team_pool = (volatile kmp_team_t *)team;
5747  } else { // Check if team was created for primary threads in teams construct
5748  // See if first worker is a CG root
5749  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5750  team->t.t_threads[1]->th.th_cg_roots);
5751  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5752  // Clean up the CG root nodes on workers so that this team can be re-used
5753  for (f = 1; f < team->t.t_nproc; ++f) {
5754  kmp_info_t *thr = team->t.t_threads[f];
5755  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5756  thr->th.th_cg_roots->cg_root == thr);
5757  // Pop current CG root off list
5758  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5759  thr->th.th_cg_roots = tmp->up;
5760  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5761  " up to node %p. cg_nthreads was %d\n",
5762  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5763  int i = tmp->cg_nthreads--;
5764  if (i == 1) {
5765  __kmp_free(tmp); // free CG if we are the last thread in it
5766  }
5767  // Restore current task's thread_limit from CG root
5768  if (thr->th.th_cg_roots)
5769  thr->th.th_current_task->td_icvs.thread_limit =
5770  thr->th.th_cg_roots->cg_thread_limit;
5771  }
5772  }
5773  }
5774 
5775  KMP_MB();
5776 }
5777 
5778 /* reap the team. destroy it, reclaim all its resources and free its memory */
5779 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5780  kmp_team_t *next_pool = team->t.t_next_pool;
5781 
5782  KMP_DEBUG_ASSERT(team);
5783  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5784  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5785  KMP_DEBUG_ASSERT(team->t.t_threads);
5786  KMP_DEBUG_ASSERT(team->t.t_argv);
5787 
5788  /* TODO clean the threads that are a part of this? */
5789 
5790  /* free stuff */
5791  __kmp_free_team_arrays(team);
5792  if (team->t.t_argv != &team->t.t_inline_argv[0])
5793  __kmp_free((void *)team->t.t_argv);
5794  __kmp_free(team);
5795 
5796  KMP_MB();
5797  return next_pool;
5798 }
5799 
5800 // Free the thread. Don't reap it, just place it on the pool of available
5801 // threads.
5802 //
5803 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5804 // binding for the affinity mechanism to be useful.
5805 //
5806 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5807 // However, we want to avoid a potential performance problem by always
5808 // scanning through the list to find the correct point at which to insert
5809 // the thread (potential N**2 behavior). To do this we keep track of the
5810 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5811 // With single-level parallelism, threads will always be added to the tail
5812 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5813 // parallelism, all bets are off and we may need to scan through the entire
5814 // free list.
5815 //
5816 // This change also has a potentially large performance benefit, for some
5817 // applications. Previously, as threads were freed from the hot team, they
5818 // would be placed back on the free list in inverse order. If the hot team
5819 // grew back to it's original size, then the freed thread would be placed
5820 // back on the hot team in reverse order. This could cause bad cache
5821 // locality problems on programs where the size of the hot team regularly
5822 // grew and shrunk.
5823 //
5824 // Now, for single-level parallelism, the OMP tid is always == gtid.
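// Illustrative example (assumed gtids): if the pool holds gtids {2, 3, 5} and
// __kmp_thread_pool_insert_pt points at 3, freeing gtid 4 starts the scan at
// 3's link, inserts 4 between 3 and 5, and moves the insert point to 4.
// Freeing gtid 1 instead resets the insert point (3 > 1) and rescans from the
// head of the pool.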
5825 void __kmp_free_thread(kmp_info_t *this_th) {
5826  int gtid;
5827  kmp_info_t **scan;
5828 
5829  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5830  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5831 
5832  KMP_DEBUG_ASSERT(this_th);
5833 
5834  // When moving a thread to the pool, switch it to wait on its own b_go flag,
5835  // and reset its team pointer to NULL (uninitialized).
5836  int b;
5837  kmp_balign_t *balign = this_th->th.th_bar;
5838  for (b = 0; b < bs_last_barrier; ++b) {
5839  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5840  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5841  balign[b].bb.team = NULL;
5842  balign[b].bb.leaf_kids = 0;
5843  }
5844  this_th->th.th_task_state = 0;
5845  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5846 
5847  /* put thread back on the free pool */
5848  TCW_PTR(this_th->th.th_team, NULL);
5849  TCW_PTR(this_th->th.th_root, NULL);
5850  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5851 
5852  while (this_th->th.th_cg_roots) {
5853  this_th->th.th_cg_roots->cg_nthreads--;
5854  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5855  " %p of thread %p to %d\n",
5856  this_th, this_th->th.th_cg_roots,
5857  this_th->th.th_cg_roots->cg_root,
5858  this_th->th.th_cg_roots->cg_nthreads));
5859  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5860  if (tmp->cg_root == this_th) { // Thread is a cg_root
5861  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5862  KA_TRACE(
5863  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5864  this_th->th.th_cg_roots = tmp->up;
5865  __kmp_free(tmp);
5866  } else { // Worker thread
5867  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5868  __kmp_free(tmp);
5869  }
5870  this_th->th.th_cg_roots = NULL;
5871  break;
5872  }
5873  }
5874 
5875  /* If the implicit task assigned to this thread can be used by other threads
5876  * -> multiple threads can share the data and try to free the task at
5877  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5878  * with higher probability when the hot team is disabled, but can occur even when
5879  * the hot team is enabled */
5880  __kmp_free_implicit_task(this_th);
5881  this_th->th.th_current_task = NULL;
5882 
5883  // If the __kmp_thread_pool_insert_pt is already past the new insert
5884  // point, then we need to re-scan the entire list.
5885  gtid = this_th->th.th_info.ds.ds_gtid;
5886  if (__kmp_thread_pool_insert_pt != NULL) {
5887  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5888  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5889  __kmp_thread_pool_insert_pt = NULL;
5890  }
5891  }
5892 
5893  // Scan down the list to find the place to insert the thread.
5894  // scan is the address of a link in the list, possibly the address of
5895  // __kmp_thread_pool itself.
5896  //
5897  // In the absence of nested parallelism, the for loop will have 0 iterations.
5898  if (__kmp_thread_pool_insert_pt != NULL) {
5899  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5900  } else {
5901  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5902  }
5903  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5904  scan = &((*scan)->th.th_next_pool))
5905  ;
5906 
5907  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5908  // to its address.
5909  TCW_PTR(this_th->th.th_next_pool, *scan);
5910  __kmp_thread_pool_insert_pt = *scan = this_th;
5911  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5912  (this_th->th.th_info.ds.ds_gtid <
5913  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5914  TCW_4(this_th->th.th_in_pool, TRUE);
5915  __kmp_suspend_initialize_thread(this_th);
5916  __kmp_lock_suspend_mx(this_th);
5917  if (this_th->th.th_active == TRUE) {
5918  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5919  this_th->th.th_active_in_pool = TRUE;
5920  }
5921 #if KMP_DEBUG
5922  else {
5923  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5924  }
5925 #endif
5926  __kmp_unlock_suspend_mx(this_th);
5927 
5928  TCW_4(__kmp_nth, __kmp_nth - 1);
5929 
5930 #ifdef KMP_ADJUST_BLOCKTIME
5931  /* Adjust blocktime back to user setting or default if necessary */
5932  /* Middle initialization might never have occurred */
5933  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5934  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5935  if (__kmp_nth <= __kmp_avail_proc) {
5936  __kmp_zero_bt = FALSE;
5937  }
5938  }
5939 #endif /* KMP_ADJUST_BLOCKTIME */
5940 
5941  KMP_MB();
5942 }
5943 
5944 /* ------------------------------------------------------------------------ */
5945 
5946 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5947 #if OMP_PROFILING_SUPPORT
5948  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5949  // TODO: add a configuration option for time granularity
5950  if (ProfileTraceFile)
5951  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5952 #endif
5953 
5954  int gtid = this_thr->th.th_info.ds.ds_gtid;
5955  /* void *stack_data;*/
5956  kmp_team_t **volatile pteam;
5957 
5958  KMP_MB();
5959  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5960 
5961  if (__kmp_env_consistency_check) {
5962  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5963  }
5964 
5965 #if OMPD_SUPPORT
5966  if (ompd_state & OMPD_ENABLE_BP)
5967  ompd_bp_thread_begin();
5968 #endif
5969 
5970 #if OMPT_SUPPORT
5971  ompt_data_t *thread_data = nullptr;
5972  if (ompt_enabled.enabled) {
5973  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5974  *thread_data = ompt_data_none;
5975 
5976  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5977  this_thr->th.ompt_thread_info.wait_id = 0;
5978  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5979  this_thr->th.ompt_thread_info.parallel_flags = 0;
5980  if (ompt_enabled.ompt_callback_thread_begin) {
5981  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5982  ompt_thread_worker, thread_data);
5983  }
5984  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5985  }
5986 #endif
5987 
5988  /* This is the place where threads wait for work */
5989  while (!TCR_4(__kmp_global.g.g_done)) {
5990  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5991  KMP_MB();
5992 
5993  /* wait for work to do */
5994  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5995 
5996  /* No tid yet since not part of a team */
5997  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5998 
5999 #if OMPT_SUPPORT
6000  if (ompt_enabled.enabled) {
6001  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6002  }
6003 #endif
6004 
6005  pteam = &this_thr->th.th_team;
6006 
6007  /* have we been allocated? */
6008  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6009  /* we were just woken up, so run our new task */
6010  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6011  int rc;
6012  KA_TRACE(20,
6013  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6014  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6015  (*pteam)->t.t_pkfn));
6016 
6017  updateHWFPControl(*pteam);
6018 
6019 #if OMPT_SUPPORT
6020  if (ompt_enabled.enabled) {
6021  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6022  }
6023 #endif
6024 
6025  rc = (*pteam)->t.t_invoke(gtid);
6026  KMP_ASSERT(rc);
6027 
6028  KMP_MB();
6029  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6030  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6031  (*pteam)->t.t_pkfn));
6032  }
6033 #if OMPT_SUPPORT
6034  if (ompt_enabled.enabled) {
6035  /* no frame set while outside task */
6036  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6037 
6038  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6039  }
6040 #endif
6041  /* join barrier after parallel region */
6042  __kmp_join_barrier(gtid);
6043  }
6044  }
6045  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
6046 
6047 #if OMPD_SUPPORT
6048  if (ompd_state & OMPD_ENABLE_BP)
6049  ompd_bp_thread_end();
6050 #endif
6051 
6052 #if OMPT_SUPPORT
6053  if (ompt_enabled.ompt_callback_thread_end) {
6054  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6055  }
6056 #endif
6057 
6058  this_thr->th.th_task_team = NULL;
6059  /* run the destructors for the threadprivate data for this thread */
6060  __kmp_common_destroy_gtid(gtid);
6061 
6062  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6063  KMP_MB();
6064 
6065 #if OMP_PROFILING_SUPPORT
6066  llvm::timeTraceProfilerFinishThread();
6067 #endif
6068  return this_thr;
6069 }
6070 
6071 /* ------------------------------------------------------------------------ */
6072 
6073 void __kmp_internal_end_dest(void *specific_gtid) {
6074  // Make sure no significant bits are lost
6075  int gtid;
6076  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6077 
6078  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6079  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
6080  * this is because 0 is reserved for the nothing-stored case */
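  /* Illustrative example: a stored value of 1 decodes to gtid 0 via the
     subtraction above, while a stored value of 0 means "no gtid registered". */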
6081 
6082  __kmp_internal_end_thread(gtid);
6083 }
6084 
6085 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6086 
6087 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6088  __kmp_internal_end_atexit();
6089 }
6090 
6091 #endif
6092 
6093 /* [Windows] josh: when the atexit handler is called, there may still be more
6094  than one thread alive */
6095 void __kmp_internal_end_atexit(void) {
6096  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6097  /* [Windows]
6098  josh: ideally, we want to completely shutdown the library in this atexit
6099  handler, but stat code that depends on thread specific data for gtid fails
6100  because that data becomes unavailable at some point during the shutdown, so
6101  we call __kmp_internal_end_thread instead. We should eventually remove the
6102  dependency on __kmp_get_specific_gtid in the stat code and use
6103  __kmp_internal_end_library to cleanly shutdown the library.
6104 
6105  // TODO: Can some of this comment about GVS be removed?
6106  I suspect that the offending stat code is executed when the calling thread
6107  tries to clean up a dead root thread's data structures, resulting in GVS
6108  code trying to close the GVS structures for that thread, but since the stat
6109  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6110  the calling thread is cleaning up itself instead of another thread, it gets
6111  confused. This happens because allowing a thread to unregister and clean up
6112  another thread is a recent modification for addressing an issue.
6113  Based on the current design (20050722), a thread may end up
6114  trying to unregister another thread only if thread death does not trigger
6115  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6116  thread specific data destructor function to detect thread death. For
6117  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6118  is nothing. Thus, the workaround is applicable only to the Windows static
6119  stat library. */
6120  __kmp_internal_end_library(-1);
6121 #if KMP_OS_WINDOWS
6122  __kmp_close_console();
6123 #endif
6124 }
6125 
6126 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6127  // It is assumed __kmp_forkjoin_lock is acquired.
6128 
6129  int gtid;
6130 
6131  KMP_DEBUG_ASSERT(thread != NULL);
6132 
6133  gtid = thread->th.th_info.ds.ds_gtid;
6134 
6135  if (!is_root) {
6136  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6137  /* Assume the threads are at the fork barrier here */
6138  KA_TRACE(
6139  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6140  gtid));
6141  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6142  while (
6143  !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6144  KMP_CPU_PAUSE();
6145  __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6146  } else {
6147  /* Need release fence here to prevent seg faults for tree forkjoin
6148  barrier (GEH) */
6149  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6150  thread);
6151  __kmp_release_64(&flag);
6152  }
6153  }
6154 
6155  // Terminate OS thread.
6156  __kmp_reap_worker(thread);
6157 
6158  // The thread was killed asynchronously. If it was actively
6159  // spinning in the thread pool, decrement the global count.
6160  //
6161  // There is a small timing hole here - if the worker thread was just waking
6162  // up after sleeping in the pool, had reset its th_active_in_pool flag but
6163  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6164  // the global counter might not get updated.
6165  //
6166  // Currently, this can only happen as the library is unloaded,
6167  // so there are no harmful side effects.
6168  if (thread->th.th_active_in_pool) {
6169  thread->th.th_active_in_pool = FALSE;
6170  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6171  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6172  }
6173  }
6174 
6175  __kmp_free_implicit_task(thread);
6176 
6177 // Free the fast memory for tasking
6178 #if USE_FAST_MEMORY
6179  __kmp_free_fast_memory(thread);
6180 #endif /* USE_FAST_MEMORY */
6181 
6182  __kmp_suspend_uninitialize_thread(thread);
6183 
6184  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6185  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6186 
6187  --__kmp_all_nth;
6188  // __kmp_nth was decremented when thread is added to the pool.
6189 
6190 #ifdef KMP_ADJUST_BLOCKTIME
6191  /* Adjust blocktime back to user setting or default if necessary */
6192  /* Middle initialization might never have occurred */
6193  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6194  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6195  if (__kmp_nth <= __kmp_avail_proc) {
6196  __kmp_zero_bt = FALSE;
6197  }
6198  }
6199 #endif /* KMP_ADJUST_BLOCKTIME */
6200 
6201  /* free the memory being used */
6202  if (__kmp_env_consistency_check) {
6203  if (thread->th.th_cons) {
6204  __kmp_free_cons_stack(thread->th.th_cons);
6205  thread->th.th_cons = NULL;
6206  }
6207  }
6208 
6209  if (thread->th.th_pri_common != NULL) {
6210  __kmp_free(thread->th.th_pri_common);
6211  thread->th.th_pri_common = NULL;
6212  }
6213 
6214  if (thread->th.th_task_state_memo_stack != NULL) {
6215  __kmp_free(thread->th.th_task_state_memo_stack);
6216  thread->th.th_task_state_memo_stack = NULL;
6217  }
6218 
6219 #if KMP_USE_BGET
6220  if (thread->th.th_local.bget_data != NULL) {
6221  __kmp_finalize_bget(thread);
6222  }
6223 #endif
6224 
6225 #if KMP_AFFINITY_SUPPORTED
6226  if (thread->th.th_affin_mask != NULL) {
6227  KMP_CPU_FREE(thread->th.th_affin_mask);
6228  thread->th.th_affin_mask = NULL;
6229  }
6230 #endif /* KMP_AFFINITY_SUPPORTED */
6231 
6232 #if KMP_USE_HIER_SCHED
6233  if (thread->th.th_hier_bar_data != NULL) {
6234  __kmp_free(thread->th.th_hier_bar_data);
6235  thread->th.th_hier_bar_data = NULL;
6236  }
6237 #endif
6238 
6239  __kmp_reap_team(thread->th.th_serial_team);
6240  thread->th.th_serial_team = NULL;
6241  __kmp_free(thread);
6242 
6243  KMP_MB();
6244 
6245 } // __kmp_reap_thread
6246 
6247 static void __kmp_itthash_clean(kmp_info_t *th) {
6248 #if USE_ITT_NOTIFY
6249  if (__kmp_itt_region_domains.count > 0) {
6250  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6251  kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6252  while (bucket) {
6253  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6254  __kmp_thread_free(th, bucket);
6255  bucket = next;
6256  }
6257  }
6258  }
6259  if (__kmp_itt_barrier_domains.count > 0) {
6260  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6261  kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6262  while (bucket) {
6263  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6264  __kmp_thread_free(th, bucket);
6265  bucket = next;
6266  }
6267  }
6268  }
6269 #endif
6270 }
6271 
6272 static void __kmp_internal_end(void) {
6273  int i;
6274 
6275  /* First, unregister the library */
6276  __kmp_unregister_library();
6277 
6278 #if KMP_OS_WINDOWS
6279  /* In Win static library, we can't tell when a root actually dies, so we
6280  reclaim the data structures for any root threads that have died but not
6281  unregistered themselves, in order to shut down cleanly.
6282  In Win dynamic library we also can't tell when a thread dies. */
6283  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6284 // dead roots
6285 #endif
6286 
6287  for (i = 0; i < __kmp_threads_capacity; i++)
6288  if (__kmp_root[i])
6289  if (__kmp_root[i]->r.r_active)
6290  break;
6291  KMP_MB(); /* Flush all pending memory write invalidates. */
6292  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6293 
6294  if (i < __kmp_threads_capacity) {
6295 #if KMP_USE_MONITOR
6296  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6297  KMP_MB(); /* Flush all pending memory write invalidates. */
6298 
6299  // Need to check that monitor was initialized before reaping it. If we are
6300  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6301  // __kmp_monitor will appear to contain valid data, but it is only valid in
6302  // the parent process, not the child.
6303  // New behavior (201008): instead of keying off of the flag
6304  // __kmp_init_parallel, the monitor thread creation is keyed off
6305  // of the new flag __kmp_init_monitor.
6306  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6307  if (TCR_4(__kmp_init_monitor)) {
6308  __kmp_reap_monitor(&__kmp_monitor);
6309  TCW_4(__kmp_init_monitor, 0);
6310  }
6311  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6312  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6313 #endif // KMP_USE_MONITOR
6314  } else {
6315 /* TODO move this to cleanup code */
6316 #ifdef KMP_DEBUG
6317  /* make sure that everything has properly ended */
6318  for (i = 0; i < __kmp_threads_capacity; i++) {
6319  if (__kmp_root[i]) {
6320  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6321  // there can be uber threads alive here
6322  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6323  }
6324  }
6325 #endif
6326 
6327  KMP_MB();
6328 
6329  // Reap the worker threads.
6330  // This is valid for now, but be careful if threads are reaped sooner.
6331  while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6332  // Get the next thread from the pool.
6333  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6334  __kmp_thread_pool = thread->th.th_next_pool;
6335  // Reap it.
6336  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6337  thread->th.th_next_pool = NULL;
6338  thread->th.th_in_pool = FALSE;
6339  __kmp_reap_thread(thread, 0);
6340  }
6341  __kmp_thread_pool_insert_pt = NULL;
6342 
6343  // Reap teams.
6344  while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6345  // Get the next team from the pool.
6346  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6347  __kmp_team_pool = team->t.t_next_pool;
6348  // Reap it.
6349  team->t.t_next_pool = NULL;
6350  __kmp_reap_team(team);
6351  }
6352 
6353  __kmp_reap_task_teams();
6354 
6355 #if KMP_OS_UNIX
6356  // Threads that are not reaped should not access any resources since they
6357  // are going to be deallocated soon, so the shutdown sequence should wait
6358  // until all threads either exit the final spin-waiting loop or begin
6359  // sleeping after the given blocktime.
6360  for (i = 0; i < __kmp_threads_capacity; i++) {
6361  kmp_info_t *thr = __kmp_threads[i];
6362  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6363  KMP_CPU_PAUSE();
6364  }
6365 #endif
6366 
6367  for (i = 0; i < __kmp_threads_capacity; ++i) {
6368  // TBD: Add some checking...
6369  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6370  }
6371 
6372  /* Make sure all threadprivate destructors get run by joining with all
6373  worker threads before resetting this flag */
6374  TCW_SYNC_4(__kmp_init_common, FALSE);
6375 
6376  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6377  KMP_MB();
6378 
6379 #if KMP_USE_MONITOR
6380  // See note above: One of the possible fixes for CQ138434 / CQ140126
6381  //
6382  // FIXME: push both code fragments down and CSE them?
6383  // push them into __kmp_cleanup() ?
6384  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6385  if (TCR_4(__kmp_init_monitor)) {
6386  __kmp_reap_monitor(&__kmp_monitor);
6387  TCW_4(__kmp_init_monitor, 0);
6388  }
6389  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6390  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6391 #endif
6392  } /* else !__kmp_global.t_active */
6393  TCW_4(__kmp_init_gtid, FALSE);
6394  KMP_MB(); /* Flush all pending memory write invalidates. */
6395 
6396  __kmp_cleanup();
6397 #if OMPT_SUPPORT
6398  ompt_fini();
6399 #endif
6400 }
6401 
6402 void __kmp_internal_end_library(int gtid_req) {
6403  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6404  /* this shouldn't be a race condition because __kmp_internal_end() is the
6405  only place to clear __kmp_serial_init */
6406  /* we'll check this later too, after we get the lock */
6407  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6408  // redundant, because the next check will work in any case.
6409  if (__kmp_global.g.g_abort) {
6410  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6411  /* TODO abort? */
6412  return;
6413  }
6414  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6415  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6416  return;
6417  }
6418 
6419  // If hidden helper team has been initialized, we need to deinit it
6420  if (TCR_4(__kmp_init_hidden_helper) &&
6421  !TCR_4(__kmp_hidden_helper_team_done)) {
6422  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6423  // First release the main thread to let it continue its work
6424  __kmp_hidden_helper_main_thread_release();
6425  // Wait until the hidden helper team has been destroyed
6426  __kmp_hidden_helper_threads_deinitz_wait();
6427  }
6428 
6429  KMP_MB(); /* Flush all pending memory write invalidates. */
6430  /* find out who we are and what we should do */
6431  {
6432  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6433  KA_TRACE(
6434  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6435  if (gtid == KMP_GTID_SHUTDOWN) {
6436  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6437  "already shutdown\n"));
6438  return;
6439  } else if (gtid == KMP_GTID_MONITOR) {
6440  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6441  "registered, or system shutdown\n"));
6442  return;
6443  } else if (gtid == KMP_GTID_DNE) {
6444  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6445  "shutdown\n"));
6446  /* we don't know who we are, but we may still shutdown the library */
6447  } else if (KMP_UBER_GTID(gtid)) {
6448  /* unregister ourselves as an uber thread. gtid is no longer valid */
6449  if (__kmp_root[gtid]->r.r_active) {
6450  __kmp_global.g.g_abort = -1;
6451  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6452  __kmp_unregister_library();
6453  KA_TRACE(10,
6454  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6455  gtid));
6456  return;
6457  } else {
6458  __kmp_itthash_clean(__kmp_threads[gtid]);
6459  KA_TRACE(
6460  10,
6461  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6462  __kmp_unregister_root_current_thread(gtid);
6463  }
6464  } else {
6465 /* worker threads may call this function through the atexit handler, if they
6466  * call exit() */
6467 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6468  TODO: do a thorough shutdown instead */
6469 #ifdef DUMP_DEBUG_ON_EXIT
6470  if (__kmp_debug_buf)
6471  __kmp_dump_debug_buffer();
6472 #endif
6473  // Unregister the library here now that shared memory is used on Linux;
6474  // otherwise stale registration files would be left behind in /dev/shm.
6475  // Clean up the shared memory file before exiting.
6476  __kmp_unregister_library();
6477  return;
6478  }
6479  }
6480  /* synchronize the termination process */
6481  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6482 
6483  /* have we already finished */
6484  if (__kmp_global.g.g_abort) {
6485  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6486  /* TODO abort? */
6487  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6488  return;
6489  }
6490  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6491  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6492  return;
6493  }
6494 
6495  /* We need this lock to enforce mutex between this reading of
6496  __kmp_threads_capacity and the writing by __kmp_register_root.
6497  Alternatively, we can use a counter of roots that is atomically updated by
6498  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6499  __kmp_internal_end_*. */
6500  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6501 
6502  /* now we can safely conduct the actual termination */
6503  __kmp_internal_end();
6504 
6505  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6506  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6507 
6508  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6509 
6510 #ifdef DUMP_DEBUG_ON_EXIT
6511  if (__kmp_debug_buf)
6512  __kmp_dump_debug_buffer();
6513 #endif
6514 
6515 #if KMP_OS_WINDOWS
6516  __kmp_close_console();
6517 #endif
6518 
6519  __kmp_fini_allocator();
6520 
6521 } // __kmp_internal_end_library
6522 
6523 void __kmp_internal_end_thread(int gtid_req) {
6524  int i;
6525 
6526  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6527  /* this shouldn't be a race condition because __kmp_internal_end() is the
6528  * only place to clear __kmp_serial_init */
6529  /* we'll check this later too, after we get the lock */
6530  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6531  // redundant, because the next check will work in any case.
6532  if (__kmp_global.g.g_abort) {
6533  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6534  /* TODO abort? */
6535  return;
6536  }
6537  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6538  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6539  return;
6540  }
6541 
6542  // If hidden helper team has been initialized, we need to deinit it
6543  if (TCR_4(__kmp_init_hidden_helper) &&
6544  !TCR_4(__kmp_hidden_helper_team_done)) {
6545  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6546  // First release the main thread to let it continue its work
6547  __kmp_hidden_helper_main_thread_release();
6548  // Wait until the hidden helper team has been destroyed
6549  __kmp_hidden_helper_threads_deinitz_wait();
6550  }
6551 
6552  KMP_MB(); /* Flush all pending memory write invalidates. */
6553 
6554  /* find out who we are and what we should do */
6555  {
6556  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6557  KA_TRACE(10,
6558  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6559  if (gtid == KMP_GTID_SHUTDOWN) {
6560  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6561  "already shutdown\n"));
6562  return;
6563  } else if (gtid == KMP_GTID_MONITOR) {
6564  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6565  "registered, or system shutdown\n"));
6566  return;
6567  } else if (gtid == KMP_GTID_DNE) {
6568  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6569  "shutdown\n"));
6570  return;
6571  /* we don't know who we are */
6572  } else if (KMP_UBER_GTID(gtid)) {
6573  /* unregister ourselves as an uber thread. gtid is no longer valid */
6574  if (__kmp_root[gtid]->r.r_active) {
6575  __kmp_global.g.g_abort = -1;
6576  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6577  KA_TRACE(10,
6578  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6579  gtid));
6580  return;
6581  } else {
6582  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6583  gtid));
6584  __kmp_unregister_root_current_thread(gtid);
6585  }
6586  } else {
6587  /* just a worker thread, let's leave */
6588  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6589 
6590  if (gtid >= 0) {
6591  __kmp_threads[gtid]->th.th_task_team = NULL;
6592  }
6593 
6594  KA_TRACE(10,
6595  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6596  gtid));
6597  return;
6598  }
6599  }
6600 #if KMP_DYNAMIC_LIB
6601  if (__kmp_pause_status != kmp_hard_paused)
6602  // AC: let's not shut down the dynamic library at the exit of an uber thread;
6603  // it is better to shut down later, in the library destructor.
6604  {
6605  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6606  return;
6607  }
6608 #endif
6609  /* synchronize the termination process */
6610  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6611 
6612  /* have we already finished */
6613  if (__kmp_global.g.g_abort) {
6614  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6615  /* TODO abort? */
6616  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6617  return;
6618  }
6619  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6620  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6621  return;
6622  }
6623 
6624  /* We need this lock to enforce mutex between this reading of
6625  __kmp_threads_capacity and the writing by __kmp_register_root.
6626  Alternatively, we can use a counter of roots that is atomically updated by
6627  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6628  __kmp_internal_end_*. */
6629 
6630  /* should we finish the run-time? are all siblings done? */
6631  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6632 
6633  for (i = 0; i < __kmp_threads_capacity; ++i) {
6634  if (KMP_UBER_GTID(i)) {
6635  KA_TRACE(
6636  10,
6637  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6638  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6639  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6640  return;
6641  }
6642  }
6643 
6644  /* now we can safely conduct the actual termination */
6645 
6646  __kmp_internal_end();
6647 
6648  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6649  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6650 
6651  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6652 
6653 #ifdef DUMP_DEBUG_ON_EXIT
6654  if (__kmp_debug_buf)
6655  __kmp_dump_debug_buffer();
6656 #endif
6657 } // __kmp_internal_end_thread
6658 
6659 // -----------------------------------------------------------------------------
6660 // Library registration stuff.
6661 
6662 static long __kmp_registration_flag = 0;
6663 // Random value used to indicate library initialization.
6664 static char *__kmp_registration_str = NULL;
6665 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6666 
6667 static inline char *__kmp_reg_status_name() {
6668 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6669  each thread. If registration and unregistration go in different threads
6670  (omp_misc_other_root_exit.cpp test case), the name of the registered_lib_env
6671  env var cannot be found, because the name will contain a different pid. */
6672 // macOS* complains that the name is too long when getuid() is also appended.
6673 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6674  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6675  (int)getuid());
6676 #else
6677  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6678 #endif
6679 } // __kmp_reg_status_name
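// Illustrative only: given the format strings above, the resulting name looks
// like "__KMP_REGISTERED_LIB_<pid>_<uid>" on Unix dynamic builds (for example,
// "__KMP_REGISTERED_LIB_12345_1000" for a hypothetical pid 12345 and uid 1000)
// and "__KMP_REGISTERED_LIB_<pid>" otherwise.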
6680 
6681 void __kmp_register_library_startup(void) {
6682 
6683  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6684  int done = 0;
6685  union {
6686  double dtime;
6687  long ltime;
6688  } time;
6689 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6690  __kmp_initialize_system_tick();
6691 #endif
6692  __kmp_read_system_time(&time.dtime);
6693  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6694  __kmp_registration_str =
6695  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6696  __kmp_registration_flag, KMP_LIBRARY_FILE);
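  // Illustrative only: the string encodes the address of the registration
  // flag, its value, and the library file name, e.g. (hypothetical values)
  // "0x7f1234567890-cafe1a2b-libomp.so". The parsing code further down splits
  // a value of this form back into those three fields when it examines a
  // pre-existing registration.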
6697 
6698  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6699  __kmp_registration_str));
6700 
6701  while (!done) {
6702 
6703  char *value = NULL; // Actual value of the environment variable.
6704 
6705 #if defined(KMP_USE_SHM)
6706  char *shm_name = __kmp_str_format("/%s", name);
6707  int shm_preexist = 0;
6708  char *data1;
6709  int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6710  if ((fd1 == -1) && (errno == EEXIST)) {
6711  // file didn't open because it already exists.
6712  // try opening existing file
6713  fd1 = shm_open(shm_name, O_RDWR, 0666);
6714  if (fd1 == -1) { // file didn't open
6715  // error out here
6716  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6717  __kmp_msg_null);
6718  } else {
6719  // able to open existing file
6720  shm_preexist = 1;
6721  }
6722  } else if (fd1 == -1) { // SHM didn't open due to an error other than
6723  // "already exists".
6724  // Error out here.
6725  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6726  __kmp_msg_null);
6727  }
6728  if (shm_preexist == 0) {
6729  // we created the SHM; now set its size
6730  if (ftruncate(fd1, SHM_SIZE) == -1) {
6731  // error occurred setting size;
6732  __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6733  KMP_ERR(errno), __kmp_msg_null);
6734  }
6735  }
6736  data1 =
6737  (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6738  if (data1 == MAP_FAILED) {
6739  // failed to map shared memory
6740  __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6741  __kmp_msg_null);
6742  }
6743  if (shm_preexist == 0) { // set data to SHM, set value
6744  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6745  }
6746  // Read value from either what we just wrote or existing file.
6747  value = __kmp_str_format("%s", data1); // read value from SHM
6748  munmap(data1, SHM_SIZE);
6749  close(fd1);
6750 #else // Windows and Unix with static library
6751  // Set the environment variable, but do not overwrite it if it already exists.
6752  __kmp_env_set(name, __kmp_registration_str, 0);
6753  // read value to see if it got set
6754  value = __kmp_env_get(name);
6755 #endif
6756 
6757  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6758  done = 1; // Ok, environment variable set successfully, exit the loop.
6759  } else {
6760  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6761  // Check whether it is alive or dead.
6762  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6763  char *tail = value;
6764  char *flag_addr_str = NULL;
6765  char *flag_val_str = NULL;
6766  char const *file_name = NULL;
6767  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6768  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6769  file_name = tail;
6770  if (tail != NULL) {
6771  unsigned long *flag_addr = 0;
6772  unsigned long flag_val = 0;
6773  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6774  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6775  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6776  // First, check whether environment-encoded address is mapped into
6777  // addr space.
6778  // If so, dereference it to see if it still has the right value.
6779  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6780  neighbor = 1;
6781  } else {
6782  // If not, then we know the other copy of the library is no longer
6783  // running.
6784  neighbor = 2;
6785  }
6786  }
6787  }
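  // If the other copy appears to be alive, duplicate registration is fatal
  // unless the user has explicitly allowed it; illustratively, running the
  // application with KMP_DUPLICATE_LIB_OK=TRUE in the environment satisfies
  // the check in case 1 below.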
6788  switch (neighbor) {
6789  case 0: // Cannot parse environment variable -- neighbor status unknown.
6790  // Assume it is the incompatible format of a future version of the
6791  // library. Assume the other library is alive.
6792  // WARN( ... ); // TODO: Issue a warning.
6793  file_name = "unknown library";
6794  KMP_FALLTHROUGH();
6795  // Attention! Falling through to the next case. That's intentional.
6796  case 1: { // Neighbor is alive.
6797  // Check it is allowed.
6798  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6799  if (!__kmp_str_match_true(duplicate_ok)) {
6800  // That's not allowed. Issue fatal error.
6801  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6802  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6803  }
6804  KMP_INTERNAL_FREE(duplicate_ok);
6805  __kmp_duplicate_library_ok = 1;
6806  done = 1; // Exit the loop.
6807  } break;
6808  case 2: { // Neighbor is dead.
6809 
6810 #if defined(KMP_USE_SHM)
6811  // close shared memory.
6812  shm_unlink(shm_name); // this removes file in /dev/shm
6813 #else
6814  // Clear the variable and try to register library again.
6815  __kmp_env_unset(name);
6816 #endif
6817  } break;
6818  default: {
6819  KMP_DEBUG_ASSERT(0);
6820  } break;
6821  }
6822  }
6823  KMP_INTERNAL_FREE((void *)value);
6824 #if defined(KMP_USE_SHM)
6825  KMP_INTERNAL_FREE((void *)shm_name);
6826 #endif
6827  } // while
6828  KMP_INTERNAL_FREE((void *)name);
6829 
6830 } // func __kmp_register_library_startup
6831 
6832 void __kmp_unregister_library(void) {
6833 
6834  char *name = __kmp_reg_status_name();
6835  char *value = NULL;
6836 
6837 #if defined(KMP_USE_SHM)
6838  char *shm_name = __kmp_str_format("/%s", name);
6839  int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6840  if (fd1 == -1) {
6841  // file did not open. return.
6842  return;
6843  }
6844  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6845  if (data1 != MAP_FAILED) {
6846  value = __kmp_str_format("%s", data1); // read value from SHM
6847  munmap(data1, SHM_SIZE);
6848  }
6849  close(fd1);
6850 #else
6851  value = __kmp_env_get(name);
6852 #endif
6853 
6854  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6855  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6856  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6857 // Ok, this is our variable. Delete it.
6858 #if defined(KMP_USE_SHM)
6859  shm_unlink(shm_name); // this removes file in /dev/shm
6860 #else
6861  __kmp_env_unset(name);
6862 #endif
6863  }
6864 
6865 #if defined(KMP_USE_SHM)
6866  KMP_INTERNAL_FREE(shm_name);
6867 #endif
6868 
6869  KMP_INTERNAL_FREE(__kmp_registration_str);
6870  KMP_INTERNAL_FREE(value);
6871  KMP_INTERNAL_FREE(name);
6872 
6873  __kmp_registration_flag = 0;
6874  __kmp_registration_str = NULL;
6875 
6876 } // __kmp_unregister_library
6877 
6878 // End of Library registration stuff.
6879 // -----------------------------------------------------------------------------
6880 
6881 #if KMP_MIC_SUPPORTED
6882 
6883 static void __kmp_check_mic_type() {
6884  kmp_cpuid_t cpuid_state = {0};
6885  kmp_cpuid_t *cs_p = &cpuid_state;
6886  __kmp_x86_cpuid(1, 0, cs_p);
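  // CPUID leaf 1 reports family/model/stepping in EAX; the masks below keep
  // only the family and model bits in order to distinguish KNC (mic2) from
  // KNL (mic3).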
6887  // We don't support mic1 at the moment
6888  if ((cs_p->eax & 0xff0) == 0xB10) {
6889  __kmp_mic_type = mic2;
6890  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6891  __kmp_mic_type = mic3;
6892  } else {
6893  __kmp_mic_type = non_mic;
6894  }
6895 }
6896 
6897 #endif /* KMP_MIC_SUPPORTED */
6898 
6899 #if KMP_HAVE_UMWAIT
6900 static void __kmp_user_level_mwait_init() {
6901  struct kmp_cpuid buf;
6902  __kmp_x86_cpuid(7, 0, &buf);
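  // CPUID leaf 7, sub-leaf 0 reports the WAITPKG feature (umwait/tpause) in
  // ECX bit 5, which is what the shift below extracts.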
6903  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
6904  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6905  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6906  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6907  __kmp_umwait_enabled));
6908 }
6909 #elif KMP_HAVE_MWAIT
6910 #ifndef AT_INTELPHIUSERMWAIT
6911 // Spurious, non-existent value that should always fail to return anything.
6912 // Will be replaced with the correct value when we know that.
6913 #define AT_INTELPHIUSERMWAIT 10000
6914 #endif
6915 // getauxval() function is available in RHEL7 and SLES12. If a system with an
6916 // earlier OS is used to build the RTL, we'll use the following internal
6917 // function when the entry is not found.
6918 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6919 unsigned long getauxval(unsigned long) { return 0; }
6920 
6921 static void __kmp_user_level_mwait_init() {
6922  // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6923  // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6924  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6925  // KMP_USER_LEVEL_MWAIT was set to TRUE.
6926  if (__kmp_mic_type == mic3) {
6927  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6928  if ((res & 0x1) || __kmp_user_level_mwait) {
6929  __kmp_mwait_enabled = TRUE;
6930  if (__kmp_user_level_mwait) {
6931  KMP_INFORM(EnvMwaitWarn);
6932  }
6933  } else {
6934  __kmp_mwait_enabled = FALSE;
6935  }
6936  }
6937  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6938  "__kmp_mwait_enabled = %d\n",
6939  __kmp_mic_type, __kmp_mwait_enabled));
6940 }
6941 #endif /* KMP_HAVE_UMWAIT */
6942 
6943 static void __kmp_do_serial_initialize(void) {
6944  int i, gtid;
6945  size_t size;
6946 
6947  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6948 
6949  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6950  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6951  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6952  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6953  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6954 
6955 #if OMPT_SUPPORT
6956  ompt_pre_init();
6957 #endif
6958 #if OMPD_SUPPORT
6959  __kmp_env_dump();
6960  ompd_init();
6961 #endif
6962 
6963  __kmp_validate_locks();
6964 
6965  /* Initialize internal memory allocator */
6966  __kmp_init_allocator();
6967 
6968  /* Register the library startup via an environment variable and check to see
6969  whether another copy of the library is already registered. */
6970 
6971  __kmp_register_library_startup();
6972 
6973  /* TODO reinitialization of library */
6974  if (TCR_4(__kmp_global.g.g_done)) {
6975  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6976  }
6977 
6978  __kmp_global.g.g_abort = 0;
6979  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6980 
6981 /* initialize the locks */
6982 #if KMP_USE_ADAPTIVE_LOCKS
6983 #if KMP_DEBUG_ADAPTIVE_LOCKS
6984  __kmp_init_speculative_stats();
6985 #endif
6986 #endif
6987 #if KMP_STATS_ENABLED
6988  __kmp_stats_init();
6989 #endif
6990  __kmp_init_lock(&__kmp_global_lock);
6991  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6992  __kmp_init_lock(&__kmp_debug_lock);
6993  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6994  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6995  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6996  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6997  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6998  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6999  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7000  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7001  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7002  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7003  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7004  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7005  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7006  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7007  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7008 #if KMP_USE_MONITOR
7009  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7010 #endif
7011  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7012 
7013  /* conduct initialization and initial setup of configuration */
7014 
7015  __kmp_runtime_initialize();
7016 
7017 #if KMP_MIC_SUPPORTED
7018  __kmp_check_mic_type();
7019 #endif
7020 
7021 // Some global variable initialization moved here from kmp_env_initialize()
7022 #ifdef KMP_DEBUG
7023  kmp_diag = 0;
7024 #endif
7025  __kmp_abort_delay = 0;
7026 
7027  // From __kmp_init_dflt_team_nth()
7028  /* assume the entire machine will be used */
7029  __kmp_dflt_team_nth_ub = __kmp_xproc;
7030  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7031  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7032  }
7033  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7034  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7035  }
7036  __kmp_max_nth = __kmp_sys_max_nth;
7037  __kmp_cg_max_nth = __kmp_sys_max_nth;
7038  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7039  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7040  __kmp_teams_max_nth = __kmp_sys_max_nth;
7041  }
7042 
7043  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7044  // part
7045  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7046 #if KMP_USE_MONITOR
7047  __kmp_monitor_wakeups =
7048  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7049  __kmp_bt_intervals =
7050  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7051 #endif
7052  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7053  __kmp_library = library_throughput;
7054  // From KMP_SCHEDULE initialization
7055  __kmp_static = kmp_sch_static_balanced;
7056 // AC: do not use analytical here, because it is non-monotonic
7057 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7058 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7059 // need to repeat assignment
7060 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7061 // bit control and barrier method control parts
7062 #if KMP_FAST_REDUCTION_BARRIER
7063 #define kmp_reduction_barrier_gather_bb ((int)1)
7064 #define kmp_reduction_barrier_release_bb ((int)1)
7065 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7066 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7067 #endif // KMP_FAST_REDUCTION_BARRIER
7068  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7069  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7070  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7071  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7072  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7073 #if KMP_FAST_REDUCTION_BARRIER
7074  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7075  // lin_64 ): hyper,1
7076  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7077  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7078  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7079  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7080  }
7081 #endif // KMP_FAST_REDUCTION_BARRIER
7082  }
7083 #if KMP_FAST_REDUCTION_BARRIER
7084 #undef kmp_reduction_barrier_release_pat
7085 #undef kmp_reduction_barrier_gather_pat
7086 #undef kmp_reduction_barrier_release_bb
7087 #undef kmp_reduction_barrier_gather_bb
7088 #endif // KMP_FAST_REDUCTION_BARRIER
7089 #if KMP_MIC_SUPPORTED
7090  if (__kmp_mic_type == mic2) { // KNC
7091  // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7092  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7093  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7094  1; // forkjoin release
7095  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7096  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7097  }
7098 #if KMP_FAST_REDUCTION_BARRIER
7099  if (__kmp_mic_type == mic2) { // KNC
7100  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7101  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7102  }
7103 #endif // KMP_FAST_REDUCTION_BARRIER
7104 #endif // KMP_MIC_SUPPORTED
7105 
7106 // From KMP_CHECKS initialization
7107 #ifdef KMP_DEBUG
7108  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7109 #else
7110  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7111 #endif
7112 
7113  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7114  __kmp_foreign_tp = TRUE;
7115 
7116  __kmp_global.g.g_dynamic = FALSE;
7117  __kmp_global.g.g_dynamic_mode = dynamic_default;
7118 
7119  __kmp_init_nesting_mode();
7120 
7121  __kmp_env_initialize(NULL);
7122 
7123 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7124  __kmp_user_level_mwait_init();
7125 #endif
7126 // Print all messages in message catalog for testing purposes.
7127 #ifdef KMP_DEBUG
7128  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7129  if (__kmp_str_match_true(val)) {
7130  kmp_str_buf_t buffer;
7131  __kmp_str_buf_init(&buffer);
7132  __kmp_i18n_dump_catalog(&buffer);
7133  __kmp_printf("%s", buffer.str);
7134  __kmp_str_buf_free(&buffer);
7135  }
7136  __kmp_env_free(&val);
7137 #endif
7138 
7139  __kmp_threads_capacity =
7140  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7141  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7142  __kmp_tp_capacity = __kmp_default_tp_capacity(
7143  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7144 
7145  // If the library is shut down properly, both pools must be NULL. Just in
7146  // case, set them to NULL -- some memory may leak, but subsequent code will
7147  // work even if pools are not freed.
7148  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7149  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7150  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7151  __kmp_thread_pool = NULL;
7152  __kmp_thread_pool_insert_pt = NULL;
7153  __kmp_team_pool = NULL;
7154 
7155  /* Allocate all of the variable sized records */
7156  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7157  * expandable */
7158  /* Since allocation is cache-aligned, just add extra padding at the end */
7159  size =
7160  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7161  CACHE_LINE;
7162  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7163  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7164  sizeof(kmp_info_t *) * __kmp_threads_capacity);
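  // Layout note: a single cache-aligned allocation holds both arrays;
  // __kmp_threads occupies the first __kmp_threads_capacity pointers and
  // __kmp_root starts immediately after it.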
7165 
7166  /* init thread counts */
7167  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7168  0); // Asserts fail if the library is reinitializing and
7169  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7170  __kmp_all_nth = 0;
7171  __kmp_nth = 0;
7172 
7173  /* setup the uber master thread and hierarchy */
7174  gtid = __kmp_register_root(TRUE);
7175  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7176  KMP_ASSERT(KMP_UBER_GTID(gtid));
7177  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7178 
7179  KMP_MB(); /* Flush all pending memory write invalidates. */
7180 
7181  __kmp_common_initialize();
7182 
7183 #if KMP_OS_UNIX
7184  /* invoke the child fork handler */
7185  __kmp_register_atfork();
7186 #endif
7187 
7188 #if !KMP_DYNAMIC_LIB
7189  {
7190  /* Invoke the exit handler when the program finishes, only for static
7191  library. For dynamic library, we already have _fini and DllMain. */
7192  int rc = atexit(__kmp_internal_end_atexit);
7193  if (rc != 0) {
7194  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7195  __kmp_msg_null);
7196  }
7197  }
7198 #endif
7199 
7200 #if KMP_HANDLE_SIGNALS
7201 #if KMP_OS_UNIX
7202  /* NOTE: make sure that this is called before the user installs their own
7203  signal handlers so that the user handlers are called first. This way they
7204  can return false, not call our handler, avoid terminating the library, and
7205  continue execution where they left off. */
7206  __kmp_install_signals(FALSE);
7207 #endif /* KMP_OS_UNIX */
7208 #if KMP_OS_WINDOWS
7209  __kmp_install_signals(TRUE);
7210 #endif /* KMP_OS_WINDOWS */
7211 #endif
7212 
7213  /* we have finished the serial initialization */
7214  __kmp_init_counter++;
7215 
7216  __kmp_init_serial = TRUE;
7217 
7218  if (__kmp_settings) {
7219  __kmp_env_print();
7220  }
7221 
7222  if (__kmp_display_env || __kmp_display_env_verbose) {
7223  __kmp_env_print_2();
7224  }
7225 
7226 #if OMPT_SUPPORT
7227  ompt_post_init();
7228 #endif
7229 
7230  KMP_MB();
7231 
7232  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7233 }
7234 
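/* __kmp_serial_initialize and the other *_initialize entry points below follow
   the same double-checked pattern: test the init flag without the lock as a
   fast path, then re-test it under __kmp_initz_lock so that concurrent callers
   perform the initialization at most once. */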
7235 void __kmp_serial_initialize(void) {
7236  if (__kmp_init_serial) {
7237  return;
7238  }
7239  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7240  if (__kmp_init_serial) {
7241  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7242  return;
7243  }
7244  __kmp_do_serial_initialize();
7245  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7246 }
7247 
7248 static void __kmp_do_middle_initialize(void) {
7249  int i, j;
7250  int prev_dflt_team_nth;
7251 
7252  if (!__kmp_init_serial) {
7253  __kmp_do_serial_initialize();
7254  }
7255 
7256  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7257 
7258  // Save the previous value for the __kmp_dflt_team_nth so that
7259  // we can avoid some reinitialization if it hasn't changed.
7260  prev_dflt_team_nth = __kmp_dflt_team_nth;
7261 
7262 #if KMP_AFFINITY_SUPPORTED
7263  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7264  // number of cores on the machine.
7265  __kmp_affinity_initialize();
7266 
7267 #endif /* KMP_AFFINITY_SUPPORTED */
7268 
7269  KMP_ASSERT(__kmp_xproc > 0);
7270  if (__kmp_avail_proc == 0) {
7271  __kmp_avail_proc = __kmp_xproc;
7272  }
7273 
7274  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7275  // correct them now
7276  j = 0;
7277  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7278  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7279  __kmp_avail_proc;
7280  j++;
7281  }
7282 
7283  if (__kmp_dflt_team_nth == 0) {
7284 #ifdef KMP_DFLT_NTH_CORES
7285  // Default #threads = #cores
7286  __kmp_dflt_team_nth = __kmp_ncores;
7287  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7288  "__kmp_ncores (%d)\n",
7289  __kmp_dflt_team_nth));
7290 #else
7291  // Default #threads = #available OS procs
7292  __kmp_dflt_team_nth = __kmp_avail_proc;
7293  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7294  "__kmp_avail_proc(%d)\n",
7295  __kmp_dflt_team_nth));
7296 #endif /* KMP_DFLT_NTH_CORES */
7297  }
7298 
7299  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7300  __kmp_dflt_team_nth = KMP_MIN_NTH;
7301  }
7302  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7303  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7304  }
7305 
7306  if (__kmp_nesting_mode > 0)
7307  __kmp_set_nesting_mode_threads();
7308 
7309  // There's no harm in continuing if the following check fails,
7310  // but it indicates an error in the previous logic.
7311  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7312 
7313  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7314  // Run through the __kmp_threads array and set the num threads icv for each
7315  // root thread that is currently registered with the RTL (which has not
7316  // already explicitly set its nthreads-var with a call to
7317  // omp_set_num_threads()).
7318  for (i = 0; i < __kmp_threads_capacity; i++) {
7319  kmp_info_t *thread = __kmp_threads[i];
7320  if (thread == NULL)
7321  continue;
7322  if (thread->th.th_current_task->td_icvs.nproc != 0)
7323  continue;
7324 
7325  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7326  }
7327  }
7328  KA_TRACE(
7329  20,
7330  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7331  __kmp_dflt_team_nth));
7332 
7333 #ifdef KMP_ADJUST_BLOCKTIME
7334  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7335  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7336  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7337  if (__kmp_nth > __kmp_avail_proc) {
7338  __kmp_zero_bt = TRUE;
7339  }
7340  }
7341 #endif /* KMP_ADJUST_BLOCKTIME */
7342 
7343  /* we have finished middle initialization */
7344  TCW_SYNC_4(__kmp_init_middle, TRUE);
7345 
7346  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7347 }
7348 
7349 void __kmp_middle_initialize(void) {
7350  if (__kmp_init_middle) {
7351  return;
7352  }
7353  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7354  if (__kmp_init_middle) {
7355  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7356  return;
7357  }
7358  __kmp_do_middle_initialize();
7359  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7360 }
7361 
7362 void __kmp_parallel_initialize(void) {
7363  int gtid = __kmp_entry_gtid(); // this might be a new root
7364 
7365  /* synchronize parallel initialization (for sibling) */
7366  if (TCR_4(__kmp_init_parallel))
7367  return;
7368  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7369  if (TCR_4(__kmp_init_parallel)) {
7370  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7371  return;
7372  }
7373 
7374  /* TODO reinitialization after we have already shut down */
7375  if (TCR_4(__kmp_global.g.g_done)) {
7376  KA_TRACE(
7377  10,
7378  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7379  __kmp_infinite_loop();
7380  }
7381 
7382  /* jc: The lock __kmp_initz_lock is already held, so calling
7383  __kmp_serial_initialize would cause a deadlock. So we call
7384  __kmp_do_serial_initialize directly. */
7385  if (!__kmp_init_middle) {
7386  __kmp_do_middle_initialize();
7387  }
7388  __kmp_assign_root_init_mask();
7389  __kmp_resume_if_hard_paused();
7390 
7391  /* begin initialization */
7392  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7393  KMP_ASSERT(KMP_UBER_GTID(gtid));
7394 
7395 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7396  // Save the FP control regs.
7397  // Worker threads will set theirs to these values at thread startup.
7398  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7399  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7400  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7401 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7402 
7403 #if KMP_OS_UNIX
7404 #if KMP_HANDLE_SIGNALS
7405  /* must be after __kmp_serial_initialize */
7406  __kmp_install_signals(TRUE);
7407 #endif
7408 #endif
7409 
7410  __kmp_suspend_initialize();
7411 
7412 #if defined(USE_LOAD_BALANCE)
7413  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7414  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7415  }
7416 #else
7417  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7418  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7419  }
7420 #endif
7421 
7422  if (__kmp_version) {
7423  __kmp_print_version_2();
7424  }
7425 
7426  /* we have finished parallel initialization */
7427  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7428 
7429  KMP_MB();
7430  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7431 
7432  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7433 }
7434 
7435 void __kmp_hidden_helper_initialize() {
7436  if (TCR_4(__kmp_init_hidden_helper))
7437  return;
7438 
7439  // __kmp_parallel_initialize is required before we initialize hidden helper
7440  if (!TCR_4(__kmp_init_parallel))
7441  __kmp_parallel_initialize();
7442 
7443  // Double check. Note that this double check should not be placed before
7444  // __kmp_parallel_initialize as it will cause a deadlock.
7445  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7446  if (TCR_4(__kmp_init_hidden_helper)) {
7447  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7448  return;
7449  }
7450 
7451  // Set the count of hidden helper tasks to be executed to zero
7452  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7453 
7454  // Set the global variable indicating that we're initializing hidden helper
7455  // team/threads
7456  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7457 
7458  // Platform independent initialization
7459  __kmp_do_initialize_hidden_helper_threads();
7460 
7461  // Wait here for the finish of initialization of hidden helper teams
7462  __kmp_hidden_helper_threads_initz_wait();
7463 
7464  // We have finished hidden helper initialization
7465  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7466 
7467  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7468 }
7469 
7470 /* ------------------------------------------------------------------------ */
7471 
7472 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7473  kmp_team_t *team) {
7474  kmp_disp_t *dispatch;
7475 
7476  KMP_MB();
7477 
7478  /* none of the threads have encountered any constructs, yet. */
7479  this_thr->th.th_local.this_construct = 0;
7480 #if KMP_CACHE_MANAGE
7481  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7482 #endif /* KMP_CACHE_MANAGE */
7483  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7484  KMP_DEBUG_ASSERT(dispatch);
7485  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7486  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7487  // this_thr->th.th_info.ds.ds_tid ] );
7488 
7489  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7490  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7491  if (__kmp_env_consistency_check)
7492  __kmp_push_parallel(gtid, team->t.t_ident);
7493 
7494  KMP_MB(); /* Flush all pending memory write invalidates. */
7495 }
7496 
7497 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7498  kmp_team_t *team) {
7499  if (__kmp_env_consistency_check)
7500  __kmp_pop_parallel(gtid, team->t.t_ident);
7501 
7502  __kmp_finish_implicit_task(this_thr);
7503 }
7504 
7505 int __kmp_invoke_task_func(int gtid) {
7506  int rc;
7507  int tid = __kmp_tid_from_gtid(gtid);
7508  kmp_info_t *this_thr = __kmp_threads[gtid];
7509  kmp_team_t *team = this_thr->th.th_team;
7510 
7511  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7512 #if USE_ITT_BUILD
7513  if (__itt_stack_caller_create_ptr) {
7514  // inform ittnotify about entering user's code
7515  if (team->t.t_stack_id != NULL) {
7516  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7517  } else {
7518  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7519  __kmp_itt_stack_callee_enter(
7520  (__itt_caller)team->t.t_parent->t.t_stack_id);
7521  }
7522  }
7523 #endif /* USE_ITT_BUILD */
7524 #if INCLUDE_SSC_MARKS
7525  SSC_MARK_INVOKING();
7526 #endif
7527 
7528 #if OMPT_SUPPORT
7529  void *dummy;
7530  void **exit_frame_p;
7531  ompt_data_t *my_task_data;
7532  ompt_data_t *my_parallel_data;
7533  int ompt_team_size;
7534 
7535  if (ompt_enabled.enabled) {
7536  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7537  .ompt_task_info.frame.exit_frame.ptr);
7538  } else {
7539  exit_frame_p = &dummy;
7540  }
7541 
7542  my_task_data =
7543  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7544  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7545  if (ompt_enabled.ompt_callback_implicit_task) {
7546  ompt_team_size = team->t.t_nproc;
7547  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7548  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7549  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7550  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7551  }
7552 #endif
7553 
7554 #if KMP_STATS_ENABLED
7555  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7556  if (previous_state == stats_state_e::TEAMS_REGION) {
7557  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7558  } else {
7559  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7560  }
7561  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7562 #endif
7563 
7564  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7565  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7566 #if OMPT_SUPPORT
7567  ,
7568  exit_frame_p
7569 #endif
7570  );
7571 #if OMPT_SUPPORT
7572  *exit_frame_p = NULL;
7573  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7574 #endif
7575 
7576 #if KMP_STATS_ENABLED
7577  if (previous_state == stats_state_e::TEAMS_REGION) {
7578  KMP_SET_THREAD_STATE(previous_state);
7579  }
7580  KMP_POP_PARTITIONED_TIMER();
7581 #endif
7582 
7583 #if USE_ITT_BUILD
7584  if (__itt_stack_caller_create_ptr) {
7585  // inform ittnotify about leaving user's code
7586  if (team->t.t_stack_id != NULL) {
7587  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7588  } else {
7589  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7590  __kmp_itt_stack_callee_leave(
7591  (__itt_caller)team->t.t_parent->t.t_stack_id);
7592  }
7593  }
7594 #endif /* USE_ITT_BUILD */
7595  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7596 
7597  return rc;
7598 }
7599 
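/* Illustrative only: this path is reached for a construct such as
     #pragma omp teams num_teams(2) thread_limit(8)
   Each team's primary thread runs __kmp_teams_master, becomes a new
   contention-group root whose cg_thread_limit comes from the thread_limit
   ICV, and then forks its team via the __kmp_fork_call below. */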
7600 void __kmp_teams_master(int gtid) {
7601  // This routine is called by all primary threads in teams construct
7602  kmp_info_t *thr = __kmp_threads[gtid];
7603  kmp_team_t *team = thr->th.th_team;
7604  ident_t *loc = team->t.t_ident;
7605  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7606  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7607  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7608  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7609  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7610 
7611  // This thread is a new CG root. Set up the proper variables.
7612  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7613  tmp->cg_root = thr; // Make thr the CG root
7614  // Init to thread limit stored when league primary threads were forked
7615  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7616  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7617  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7618  " cg_nthreads to 1\n",
7619  thr, tmp));
7620  tmp->up = thr->th.th_cg_roots;
7621  thr->th.th_cg_roots = tmp;
7622 
7623 // Launch the league of teams now, but do not let the workers execute
7624 // (they hang on the fork barrier until the next parallel region)
7625 #if INCLUDE_SSC_MARKS
7626  SSC_MARK_FORKING();
7627 #endif
7628  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7629  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7630  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7631 #if INCLUDE_SSC_MARKS
7632  SSC_MARK_JOINING();
7633 #endif
7634  // If the team size was reduced from the limit, set it to the new size
7635  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7636  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7637  // AC: last parameter "1" eliminates the join barrier, which would not work
7638  // because worker threads are in a fork barrier waiting for more parallel regions
7639  __kmp_join_call(loc, gtid
7640 #if OMPT_SUPPORT
7641  ,
7642  fork_context_intel
7643 #endif
7644  ,
7645  1);
7646 }
7647 
7648 int __kmp_invoke_teams_master(int gtid) {
7649  kmp_info_t *this_thr = __kmp_threads[gtid];
7650  kmp_team_t *team = this_thr->th.th_team;
7651 #if KMP_DEBUG
7652  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7653  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7654  (void *)__kmp_teams_master);
7655 #endif
7656  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7657 #if OMPT_SUPPORT
7658  int tid = __kmp_tid_from_gtid(gtid);
7659  ompt_data_t *task_data =
7660  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7661  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7662  if (ompt_enabled.ompt_callback_implicit_task) {
7663  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7664  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7665  ompt_task_initial);
7666  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7667  }
7668 #endif
7669  __kmp_teams_master(gtid);
7670 #if OMPT_SUPPORT
7671  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7672 #endif
7673  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7674  return 1;
7675 }
7676 
7677 /* this sets the requested number of threads for the next parallel region
7678  encountered by this team. since this should be enclosed in the forkjoin
7679  critical section it should avoid race conditions with asymmetrical nested
7680  parallelism */
7681 
7682 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7683  kmp_info_t *thr = __kmp_threads[gtid];
7684 
7685  if (num_threads > 0)
7686  thr->th.th_set_nproc = num_threads;
7687 }
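// Illustrative note (assumes the usual clang/LLVM lowering): a directive such
// as
//   #pragma omp parallel num_threads(4)
// is typically turned into a call to the exported entry
// __kmpc_push_num_threads(&loc, gtid, 4) right before the fork, which ends up
// in __kmp_push_num_threads() above; th_set_nproc therefore only carries the
// request for the very next parallel region.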
7688 
7689 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7690  int num_threads) {
7691  KMP_DEBUG_ASSERT(thr);
7692  // Remember the number of threads for inner parallel regions
7693  if (!TCR_4(__kmp_init_middle))
7694  __kmp_middle_initialize(); // get internal globals calculated
7695  __kmp_assign_root_init_mask();
7696  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7697  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7698 
7699  if (num_threads == 0) {
7700  if (__kmp_teams_thread_limit > 0) {
7701  num_threads = __kmp_teams_thread_limit;
7702  } else {
7703  num_threads = __kmp_avail_proc / num_teams;
7704  }
7705  // adjust num_threads w/o warning as it is not a user setting
7706  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7707  // no thread_limit clause specified - do not change thread-limit-var ICV
7708  if (num_threads > __kmp_dflt_team_nth) {
7709  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7710  }
7711  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7712  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7713  } // prevent team size from exceeding thread-limit-var
7714  if (num_teams * num_threads > __kmp_teams_max_nth) {
7715  num_threads = __kmp_teams_max_nth / num_teams;
7716  }
7717  if (num_threads == 0) {
7718  num_threads = 1;
7719  }
7720  } else {
7721  if (num_threads < 0) {
7722  __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7723  __kmp_msg_null);
7724  num_threads = 1;
7725  }
7726  // This thread will be the primary thread of the league's primary threads
7727  // Store new thread limit; old limit is saved in th_cg_roots list
7728  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7729  // num_threads = min(num_threads, nthreads-var)
7730  if (num_threads > __kmp_dflt_team_nth) {
7731  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7732  }
7733  if (num_teams * num_threads > __kmp_teams_max_nth) {
7734  int new_threads = __kmp_teams_max_nth / num_teams;
7735  if (new_threads == 0) {
7736  new_threads = 1;
7737  }
7738  if (new_threads != num_threads) {
7739  if (!__kmp_reserve_warn) { // user asked for too many threads
7740  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7741  __kmp_msg(kmp_ms_warning,
7742  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7743  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7744  }
7745  }
7746  num_threads = new_threads;
7747  }
7748  }
7749  thr->th.th_teams_size.nth = num_threads;
7750 }
7751 
7752 /* this sets the requested number of teams for the teams region and/or
7753  the number of threads for the next parallel region encountered */
7754 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7755  int num_threads) {
7756  kmp_info_t *thr = __kmp_threads[gtid];
7757  if (num_teams < 0) {
7758  // OpenMP specification requires requested values to be positive,
7759  // but people can send us any value, so we'd better check
7760  __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7761  __kmp_msg_null);
7762  num_teams = 1;
7763  }
7764  if (num_teams == 0) {
7765  if (__kmp_nteams > 0) {
7766  num_teams = __kmp_nteams;
7767  } else {
7768  num_teams = 1; // default number of teams is 1.
7769  }
7770  }
7771  if (num_teams > __kmp_teams_max_nth) { // were too many teams requested?
7772  if (!__kmp_reserve_warn) {
7773  __kmp_reserve_warn = 1;
7774  __kmp_msg(kmp_ms_warning,
7775  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7776  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7777  }
7778  num_teams = __kmp_teams_max_nth;
7779  }
7780  // Set number of teams (number of threads in the outer "parallel" of the
7781  // teams)
7782  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7783 
7784  __kmp_push_thread_limit(thr, num_teams, num_threads);
7785 }
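// Illustrative note (assumes the usual clang/LLVM lowering): a construct such
// as
//   #pragma omp teams num_teams(4) thread_limit(8)
// typically results in __kmpc_push_num_teams(&loc, gtid, 4, 8) being called
// before the league is forked; num_teams becomes the width of the outer
// "parallel" of the teams construct and thread_limit is handed to
// __kmp_push_thread_limit() to bound the inner parallel regions.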
7786 
7787 /* This sets the requested number of teams for the teams region and/or
7788  the number of threads for the next parallel region encountered */
7789 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7790  int num_teams_ub, int num_threads) {
7791  kmp_info_t *thr = __kmp_threads[gtid];
7792  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7793  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7794  KMP_DEBUG_ASSERT(num_threads >= 0);
7795 
7796  if (num_teams_lb > num_teams_ub) {
7797  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7798  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7799  }
7800 
7801  int num_teams = 1; // default number of teams is 1.
7802 
7803  if (num_teams_lb == 0 && num_teams_ub > 0)
7804  num_teams_lb = num_teams_ub;
7805 
7806  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7807  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7808  if (num_teams > __kmp_teams_max_nth) {
7809  if (!__kmp_reserve_warn) {
7810  __kmp_reserve_warn = 1;
7811  __kmp_msg(kmp_ms_warning,
7812  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7813  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7814  }
7815  num_teams = __kmp_teams_max_nth;
7816  }
7817  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7818  num_teams = num_teams_ub;
7819  } else { // num_teams_lb <= num_teams <= num_teams_ub
7820  if (num_threads <= 0) {
7821  if (num_teams_ub > __kmp_teams_max_nth) {
7822  num_teams = num_teams_lb;
7823  } else {
7824  num_teams = num_teams_ub;
7825  }
7826  } else {
7827  num_teams = (num_threads > __kmp_teams_max_nth)
7828  ? num_teams
7829  : __kmp_teams_max_nth / num_threads;
7830  if (num_teams < num_teams_lb) {
7831  num_teams = num_teams_lb;
7832  } else if (num_teams > num_teams_ub) {
7833  num_teams = num_teams_ub;
7834  }
7835  }
7836  }
7837  // Set number of teams (number of threads in the outer "parallel" of the
7838  // teams)
7839  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7840 
7841  __kmp_push_thread_limit(thr, num_teams, num_threads);
7842 }
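// Worked example with hypothetical values: for num_teams_lb = 2,
// num_teams_ub = 16, num_threads = 4 and __kmp_teams_max_nth = 32, the
// lb < ub branch above computes num_teams = 32 / 4 = 8, which already lies
// inside [2,16], so the league gets 8 teams limited to 4 threads each.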
7843 
7844 // Set the proc_bind var to use in the following parallel region.
7845 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7846  kmp_info_t *thr = __kmp_threads[gtid];
7847  thr->th.th_set_proc_bind = proc_bind;
7848 }
7849 
7850 /* Launch the worker threads into the microtask. */
7851 
7852 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7853  kmp_info_t *this_thr = __kmp_threads[gtid];
7854 
7855 #ifdef KMP_DEBUG
7856  int f;
7857 #endif /* KMP_DEBUG */
7858 
7859  KMP_DEBUG_ASSERT(team);
7860  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7861  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7862  KMP_MB(); /* Flush all pending memory write invalidates. */
7863 
7864  team->t.t_construct = 0; /* no single directives seen yet */
7865  team->t.t_ordered.dt.t_value =
7866  0; /* thread 0 enters the ordered section first */
7867 
7868  /* Reset the identifiers on the dispatch buffer */
7869  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7870  if (team->t.t_max_nproc > 1) {
7871  int i;
7872  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7873  team->t.t_disp_buffer[i].buffer_index = i;
7874  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7875  }
7876  } else {
7877  team->t.t_disp_buffer[0].buffer_index = 0;
7878  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7879  }
7880 
7881  KMP_MB(); /* Flush all pending memory write invalidates. */
7882  KMP_ASSERT(this_thr->th.th_team == team);
7883 
7884 #ifdef KMP_DEBUG
7885  for (f = 0; f < team->t.t_nproc; f++) {
7886  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7887  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7888  }
7889 #endif /* KMP_DEBUG */
7890 
7891  /* release the worker threads so they may begin working */
7892  __kmp_fork_barrier(gtid, 0);
7893 }
7894 
7895 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7896  kmp_info_t *this_thr = __kmp_threads[gtid];
7897 
7898  KMP_DEBUG_ASSERT(team);
7899  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7900  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7901  KMP_MB(); /* Flush all pending memory write invalidates. */
7902 
7903  /* Join barrier after fork */
7904 
7905 #ifdef KMP_DEBUG
7906  if (__kmp_threads[gtid] &&
7907  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7908  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7909  __kmp_threads[gtid]);
7910  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7911  "team->t.t_nproc=%d\n",
7912  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7913  team->t.t_nproc);
7914  __kmp_print_structure();
7915  }
7916  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7917  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7918 #endif /* KMP_DEBUG */
7919 
7920  __kmp_join_barrier(gtid); /* wait for everyone */
7921 #if OMPT_SUPPORT
7922  if (ompt_enabled.enabled &&
7923  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7924  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7925  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7926  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7927 #if OMPT_OPTIONAL
7928  void *codeptr = NULL;
7929  if (KMP_MASTER_TID(ds_tid) &&
7930  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7931  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7932  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7933 
7934  if (ompt_enabled.ompt_callback_sync_region_wait) {
7935  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7936  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7937  codeptr);
7938  }
7939  if (ompt_enabled.ompt_callback_sync_region) {
7940  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7941  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7942  codeptr);
7943  }
7944 #endif
7945  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7946  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7947  ompt_scope_end, NULL, task_data, 0, ds_tid,
7948  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7949  }
7950  }
7951 #endif
7952 
7953  KMP_MB(); /* Flush all pending memory write invalidates. */
7954  KMP_ASSERT(this_thr->th.th_team == team);
7955 }
7956 
7957 /* ------------------------------------------------------------------------ */
7958 
7959 #ifdef USE_LOAD_BALANCE
7960 
7961 // Return the number of worker threads actively spinning in the hot team, if
7962 // we are at the outermost level of parallelism. Otherwise, return 0.
7963 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7964  int i;
7965  int retval;
7966  kmp_team_t *hot_team;
7967 
7968  if (root->r.r_active) {
7969  return 0;
7970  }
7971  hot_team = root->r.r_hot_team;
7972  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7973  return hot_team->t.t_nproc - 1; // Don't count primary thread
7974  }
7975 
7976  // Skip the primary thread - it is accounted for elsewhere.
7977  retval = 0;
7978  for (i = 1; i < hot_team->t.t_nproc; i++) {
7979  if (hot_team->t.t_threads[i]->th.th_active) {
7980  retval++;
7981  }
7982  }
7983  return retval;
7984 }
7985 
7986 // Perform an automatic adjustment to the number of
7987 // threads used by the next parallel region.
7988 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7989  int retval;
7990  int pool_active;
7991  int hot_team_active;
7992  int team_curr_active;
7993  int system_active;
7994 
7995  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7996  set_nproc));
7997  KMP_DEBUG_ASSERT(root);
7998  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7999  ->th.th_current_task->td_icvs.dynamic == TRUE);
8000  KMP_DEBUG_ASSERT(set_nproc > 1);
8001 
8002  if (set_nproc == 1) {
8003  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8004  return 1;
8005  }
8006 
8007  // Threads that are active in the thread pool, active in the hot team for this
8008  // particular root (if we are at the outer par level), and the currently
8009  // executing thread (to become the primary thread) are available to add to the
8010  // new team, but are currently contributing to the system load, and must be
8011  // accounted for.
8012  pool_active = __kmp_thread_pool_active_nth;
8013  hot_team_active = __kmp_active_hot_team_nproc(root);
8014  team_curr_active = pool_active + hot_team_active + 1;
8015 
8016  // Check the system load.
8017  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8018  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8019  "hot team active = %d\n",
8020  system_active, pool_active, hot_team_active));
8021 
8022  if (system_active < 0) {
8023  // There was an error reading the necessary info from /proc, so use the
8024  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8025  // = dynamic_thread_limit, we shouldn't wind up getting back here.
8026  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8027  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8028 
8029  // Make this call behave like the thread limit algorithm.
8030  retval = __kmp_avail_proc - __kmp_nth +
8031  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8032  if (retval > set_nproc) {
8033  retval = set_nproc;
8034  }
8035  if (retval < KMP_MIN_NTH) {
8036  retval = KMP_MIN_NTH;
8037  }
8038 
8039  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8040  retval));
8041  return retval;
8042  }
8043 
8044  // There is a slight delay in the load balance algorithm in detecting new
8045  // running procs. The real system load at this instant should be at least as
8046  // large as the number of active OMP threads available to add to the team.
8047  if (system_active < team_curr_active) {
8048  system_active = team_curr_active;
8049  }
8050  retval = __kmp_avail_proc - system_active + team_curr_active;
8051  if (retval > set_nproc) {
8052  retval = set_nproc;
8053  }
8054  if (retval < KMP_MIN_NTH) {
8055  retval = KMP_MIN_NTH;
8056  }
8057 
8058  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8059  return retval;
8060 } // __kmp_load_balance_nproc()
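// Worked example with hypothetical values: __kmp_avail_proc = 16, 3 threads
// active in the pool and 4 workers spinning in the hot team give
// team_curr_active = 3 + 4 + 1 = 8. If __kmp_get_load_balance() reports
// system_active = 10, the estimate is retval = 16 - 10 + 8 = 14, which is
// then clamped to the [KMP_MIN_NTH, set_nproc] range.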
8061 
8062 #endif /* USE_LOAD_BALANCE */
8063 
8064 /* ------------------------------------------------------------------------ */
8065 
8066 /* NOTE: this is called with the __kmp_init_lock held */
8067 void __kmp_cleanup(void) {
8068  int f;
8069 
8070  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8071 
8072  if (TCR_4(__kmp_init_parallel)) {
8073 #if KMP_HANDLE_SIGNALS
8074  __kmp_remove_signals();
8075 #endif
8076  TCW_4(__kmp_init_parallel, FALSE);
8077  }
8078 
8079  if (TCR_4(__kmp_init_middle)) {
8080 #if KMP_AFFINITY_SUPPORTED
8081  __kmp_affinity_uninitialize();
8082 #endif /* KMP_AFFINITY_SUPPORTED */
8083  __kmp_cleanup_hierarchy();
8084  TCW_4(__kmp_init_middle, FALSE);
8085  }
8086 
8087  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8088 
8089  if (__kmp_init_serial) {
8090  __kmp_runtime_destroy();
8091  __kmp_init_serial = FALSE;
8092  }
8093 
8094  __kmp_cleanup_threadprivate_caches();
8095 
8096  for (f = 0; f < __kmp_threads_capacity; f++) {
8097  if (__kmp_root[f] != NULL) {
8098  __kmp_free(__kmp_root[f]);
8099  __kmp_root[f] = NULL;
8100  }
8101  }
8102  __kmp_free(__kmp_threads);
8103  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
8104  // there is no need to free __kmp_root separately.
8105  __kmp_threads = NULL;
8106  __kmp_root = NULL;
8107  __kmp_threads_capacity = 0;
8108 
8109  // Free old __kmp_threads arrays if they exist.
8110  kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8111  while (ptr) {
8112  kmp_old_threads_list_t *next = ptr->next;
8113  __kmp_free(ptr->threads);
8114  __kmp_free(ptr);
8115  ptr = next;
8116  }
8117 
8118 #if KMP_USE_DYNAMIC_LOCK
8119  __kmp_cleanup_indirect_user_locks();
8120 #else
8121  __kmp_cleanup_user_locks();
8122 #endif
8123 #if OMPD_SUPPORT
8124  if (ompd_state) {
8125  __kmp_free(ompd_env_block);
8126  ompd_env_block = NULL;
8127  ompd_env_block_size = 0;
8128  }
8129 #endif
8130 
8131 #if KMP_AFFINITY_SUPPORTED
8132  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8133  __kmp_cpuinfo_file = NULL;
8134 #endif /* KMP_AFFINITY_SUPPORTED */
8135 
8136 #if KMP_USE_ADAPTIVE_LOCKS
8137 #if KMP_DEBUG_ADAPTIVE_LOCKS
8138  __kmp_print_speculative_stats();
8139 #endif
8140 #endif
8141  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8142  __kmp_nested_nth.nth = NULL;
8143  __kmp_nested_nth.size = 0;
8144  __kmp_nested_nth.used = 0;
8145  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8146  __kmp_nested_proc_bind.bind_types = NULL;
8147  __kmp_nested_proc_bind.size = 0;
8148  __kmp_nested_proc_bind.used = 0;
8149  if (__kmp_affinity_format) {
8150  KMP_INTERNAL_FREE(__kmp_affinity_format);
8151  __kmp_affinity_format = NULL;
8152  }
8153 
8154  __kmp_i18n_catclose();
8155 
8156 #if KMP_USE_HIER_SCHED
8157  __kmp_hier_scheds.deallocate();
8158 #endif
8159 
8160 #if KMP_STATS_ENABLED
8161  __kmp_stats_fini();
8162 #endif
8163 
8164  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8165 }
8166 
8167 /* ------------------------------------------------------------------------ */
8168 
8169 int __kmp_ignore_mppbeg(void) {
8170  char *env;
8171 
8172  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8173  if (__kmp_str_match_false(env))
8174  return FALSE;
8175  }
8176  // By default __kmpc_begin() is no-op.
8177  return TRUE;
8178 }
8179 
8180 int __kmp_ignore_mppend(void) {
8181  char *env;
8182 
8183  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8184  if (__kmp_str_match_false(env))
8185  return FALSE;
8186  }
8187  // By default __kmpc_end() is no-op.
8188  return TRUE;
8189 }
8190 
8191 void __kmp_internal_begin(void) {
8192  int gtid;
8193  kmp_root_t *root;
8194 
8195  /* this is a very important step as it will register new sibling threads
8196  and assign these new uber threads a new gtid */
8197  gtid = __kmp_entry_gtid();
8198  root = __kmp_threads[gtid]->th.th_root;
8199  KMP_ASSERT(KMP_UBER_GTID(gtid));
8200 
8201  if (root->r.r_begin)
8202  return;
8203  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8204  if (root->r.r_begin) {
8205  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8206  return;
8207  }
8208 
8209  root->r.r_begin = TRUE;
8210 
8211  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8212 }
8213 
8214 /* ------------------------------------------------------------------------ */
8215 
8216 void __kmp_user_set_library(enum library_type arg) {
8217  int gtid;
8218  kmp_root_t *root;
8219  kmp_info_t *thread;
8220 
8221  /* first, make sure we are initialized so we can get our gtid */
8222 
8223  gtid = __kmp_entry_gtid();
8224  thread = __kmp_threads[gtid];
8225 
8226  root = thread->th.th_root;
8227 
8228  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8229  library_serial));
8230  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8231  thread */
8232  KMP_WARNING(SetLibraryIncorrectCall);
8233  return;
8234  }
8235 
8236  switch (arg) {
8237  case library_serial:
8238  thread->th.th_set_nproc = 0;
8239  set__nproc(thread, 1);
8240  break;
8241  case library_turnaround:
8242  thread->th.th_set_nproc = 0;
8243  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8244  : __kmp_dflt_team_nth_ub);
8245  break;
8246  case library_throughput:
8247  thread->th.th_set_nproc = 0;
8248  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8249  : __kmp_dflt_team_nth_ub);
8250  break;
8251  default:
8252  KMP_FATAL(UnknownLibraryType, arg);
8253  }
8254 
8255  __kmp_aux_set_library(arg);
8256 }
8257 
8258 void __kmp_aux_set_stacksize(size_t arg) {
8259  if (!__kmp_init_serial)
8260  __kmp_serial_initialize();
8261 
8262 #if KMP_OS_DARWIN
8263  if (arg & (0x1000 - 1)) {
8264  arg &= ~(0x1000 - 1);
8265  if (arg + 0x1000) /* check for overflow if we round up */
8266  arg += 0x1000;
8267  }
8268 #endif
8269  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8270 
8271  /* only change the default stacksize before the first parallel region */
8272  if (!TCR_4(__kmp_init_parallel)) {
8273  size_t value = arg; /* argument is in bytes */
8274 
8275  if (value < __kmp_sys_min_stksize)
8276  value = __kmp_sys_min_stksize;
8277  else if (value > KMP_MAX_STKSIZE)
8278  value = KMP_MAX_STKSIZE;
8279 
8280  __kmp_stksize = value;
8281 
8282  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8283  }
8284 
8285  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8286 }
8287 
8288 /* set the behaviour of the runtime library */
8289 /* TODO this can cause some odd behaviour with sibling parallelism... */
8290 void __kmp_aux_set_library(enum library_type arg) {
8291  __kmp_library = arg;
8292 
8293  switch (__kmp_library) {
8294  case library_serial: {
8295  KMP_INFORM(LibraryIsSerial);
8296  } break;
8297  case library_turnaround:
8298  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8299  __kmp_use_yield = 2; // only yield when oversubscribed
8300  break;
8301  case library_throughput:
8302  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8303  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8304  break;
8305  default:
8306  KMP_FATAL(UnknownLibraryType, arg);
8307  }
8308 }
8309 
8310 /* Getting team information common for all team API */
8311 // Returns NULL if not in teams construct
8312 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8313  kmp_info_t *thr = __kmp_entry_thread();
8314  teams_serialized = 0;
8315  if (thr->th.th_teams_microtask) {
8316  kmp_team_t *team = thr->th.th_team;
8317  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8318  int ii = team->t.t_level;
8319  teams_serialized = team->t.t_serialized;
8320  int level = tlevel + 1;
8321  KMP_DEBUG_ASSERT(ii >= tlevel);
8322  while (ii > level) {
8323  for (teams_serialized = team->t.t_serialized;
8324  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8325  }
8326  if (team->t.t_serialized && (!teams_serialized)) {
8327  team = team->t.t_parent;
8328  continue;
8329  }
8330  if (ii > level) {
8331  team = team->t.t_parent;
8332  ii--;
8333  }
8334  }
8335  return team;
8336  }
8337  return NULL;
8338 }
8339 
8340 int __kmp_aux_get_team_num() {
8341  int serialized;
8342  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8343  if (team) {
8344  if (serialized > 1) {
8345  return 0; // teams region is serialized ( 1 team of 1 thread ).
8346  } else {
8347  return team->t.t_master_tid;
8348  }
8349  }
8350  return 0;
8351 }
8352 
8353 int __kmp_aux_get_num_teams() {
8354  int serialized;
8355  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8356  if (team) {
8357  if (serialized > 1) {
8358  return 1;
8359  } else {
8360  return team->t.t_parent->t.t_nproc;
8361  }
8362  }
8363  return 1;
8364 }
8365 
8366 /* ------------------------------------------------------------------------ */
8367 
8368 /*
8369  * Affinity Format Parser
8370  *
8371  * Field is in form of: %[[[0].]size]type
8372  * % and type are required (%% means print a literal '%')
8373  * type is either single char or long name surrounded by {},
8374  * e.g., N or {num_threads}
8375  * 0 => leading zeros
8376  * . => right justified when size is specified
8377  * by default output is left justified
8378  * size is the *minimum* field length
8379  * All other characters are printed as is
8380  *
8381  * Available field types:
8382  * t {team_num}          - omp_get_team_num()
8383  * T {num_teams}         - omp_get_num_teams()
8384  * L {nesting_level}     - omp_get_level()
8385  * n {thread_num}        - omp_get_thread_num()
8386  * N {num_threads}       - omp_get_num_threads()
8387  * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
8388  * H {host}              - name of host machine
8389  * P {process_id}        - process id (integer)
8390  * i {native_thread_id}  - native thread identifier (integer)
8391  * A {thread_affinity}   - comma separated list of integers or integer
8392  *                         ranges (values of the affinity mask)
8393  * Implementation-specific types can be added; unknown types print "undefined"
8394  */
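// Illustrative expansion (hypothetical host/thread values): the format string
//   "OMP: host=%H pid=%P tid=%.8n aff=%{thread_affinity}"
// could produce output along the lines of
//   "OMP: host=node01 pid=1234 tid=       3 aff=0-3"
// i.e. %.8n is right-justified to a minimum width of 8, and the long name
// {thread_affinity} selects the same field as the short name A.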
8395 
8396 // Structure holding the short name, long name, and corresponding data type
8397 // for snprintf. A table of these will represent the entire valid keyword
8398 // field types.
8399 typedef struct kmp_affinity_format_field_t {
8400  char short_name; // from spec e.g., L -> thread level
8401  const char *long_name; // from spec thread_level -> thread level
8402  char field_format; // data type for snprintf (typically 'd' or 's'
8403  // for integer or string)
8404 } kmp_affinity_format_field_t;
8405 
8406 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8407 #if KMP_AFFINITY_SUPPORTED
8408  {'A', "thread_affinity", 's'},
8409 #endif
8410  {'t', "team_num", 'd'},
8411  {'T', "num_teams", 'd'},
8412  {'L', "nesting_level", 'd'},
8413  {'n', "thread_num", 'd'},
8414  {'N', "num_threads", 'd'},
8415  {'a', "ancestor_tnum", 'd'},
8416  {'H', "host", 's'},
8417  {'P', "process_id", 'd'},
8418  {'i', "native_thread_id", 'd'}};
8419 
8420 // Return the number of characters it takes to hold the field
8421 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8422  const char **ptr,
8423  kmp_str_buf_t *field_buffer) {
8424  int rc, format_index, field_value;
8425  const char *width_left, *width_right;
8426  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8427  static const int FORMAT_SIZE = 20;
8428  char format[FORMAT_SIZE] = {0};
8429  char absolute_short_name = 0;
8430 
8431  KMP_DEBUG_ASSERT(gtid >= 0);
8432  KMP_DEBUG_ASSERT(th);
8433  KMP_DEBUG_ASSERT(**ptr == '%');
8434  KMP_DEBUG_ASSERT(field_buffer);
8435 
8436  __kmp_str_buf_clear(field_buffer);
8437 
8438  // Skip the initial %
8439  (*ptr)++;
8440 
8441  // Check for %% first
8442  if (**ptr == '%') {
8443  __kmp_str_buf_cat(field_buffer, "%", 1);
8444  (*ptr)++; // skip over the second %
8445  return 1;
8446  }
8447 
8448  // Parse field modifiers if they are present
8449  pad_zeros = false;
8450  if (**ptr == '0') {
8451  pad_zeros = true;
8452  (*ptr)++; // skip over 0
8453  }
8454  right_justify = false;
8455  if (**ptr == '.') {
8456  right_justify = true;
8457  (*ptr)++; // skip over .
8458  }
8459  // Parse width of field: [width_left, width_right)
8460  width_left = width_right = NULL;
8461  if (**ptr >= '0' && **ptr <= '9') {
8462  width_left = *ptr;
8463  SKIP_DIGITS(*ptr);
8464  width_right = *ptr;
8465  }
8466 
8467  // Create the format for KMP_SNPRINTF based on flags parsed above
8468  format_index = 0;
8469  format[format_index++] = '%';
8470  if (!right_justify)
8471  format[format_index++] = '-';
8472  if (pad_zeros)
8473  format[format_index++] = '0';
8474  if (width_left && width_right) {
8475  int i = 0;
8476  // Only allow 8 digit number widths.
8477  // This also prevents overflowing the format variable
8478  while (i < 8 && width_left < width_right) {
8479  format[format_index++] = *width_left;
8480  width_left++;
8481  i++;
8482  }
8483  }
8484 
8485  // Parse a name (long or short)
8486  // Canonicalize the name into absolute_short_name
8487  found_valid_name = false;
8488  parse_long_name = (**ptr == '{');
8489  if (parse_long_name)
8490  (*ptr)++; // skip initial left brace
8491  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8492  sizeof(__kmp_affinity_format_table[0]);
8493  ++i) {
8494  char short_name = __kmp_affinity_format_table[i].short_name;
8495  const char *long_name = __kmp_affinity_format_table[i].long_name;
8496  char field_format = __kmp_affinity_format_table[i].field_format;
8497  if (parse_long_name) {
8498  size_t length = KMP_STRLEN(long_name);
8499  if (strncmp(*ptr, long_name, length) == 0) {
8500  found_valid_name = true;
8501  (*ptr) += length; // skip the long name
8502  }
8503  } else if (**ptr == short_name) {
8504  found_valid_name = true;
8505  (*ptr)++; // skip the short name
8506  }
8507  if (found_valid_name) {
8508  format[format_index++] = field_format;
8509  format[format_index++] = '\0';
8510  absolute_short_name = short_name;
8511  break;
8512  }
8513  }
8514  if (parse_long_name) {
8515  if (**ptr != '}') {
8516  absolute_short_name = 0;
8517  } else {
8518  (*ptr)++; // skip over the right brace
8519  }
8520  }
8521 
8522  // Attempt to fill the buffer with the requested
8523  // value using snprintf within __kmp_str_buf_print()
8524  switch (absolute_short_name) {
8525  case 't':
8526  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8527  break;
8528  case 'T':
8529  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8530  break;
8531  case 'L':
8532  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8533  break;
8534  case 'n':
8535  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8536  break;
8537  case 'H': {
8538  static const int BUFFER_SIZE = 256;
8539  char buf[BUFFER_SIZE];
8540  __kmp_expand_host_name(buf, BUFFER_SIZE);
8541  rc = __kmp_str_buf_print(field_buffer, format, buf);
8542  } break;
8543  case 'P':
8544  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8545  break;
8546  case 'i':
8547  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8548  break;
8549  case 'N':
8550  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8551  break;
8552  case 'a':
8553  field_value =
8554  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8555  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8556  break;
8557 #if KMP_AFFINITY_SUPPORTED
8558  case 'A': {
8559  kmp_str_buf_t buf;
8560  __kmp_str_buf_init(&buf);
8561  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8562  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8563  __kmp_str_buf_free(&buf);
8564  } break;
8565 #endif
8566  default:
8567  // According to the spec, if an implementation does not have info for a
8568  // field type, then "undefined" is printed
8569  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8570  // Skip the field
8571  if (parse_long_name) {
8572  SKIP_TOKEN(*ptr);
8573  if (**ptr == '}')
8574  (*ptr)++;
8575  } else {
8576  (*ptr)++;
8577  }
8578  }
8579 
8580  KMP_ASSERT(format_index <= FORMAT_SIZE);
8581  return rc;
8582 }
8583 
8584 /*
8585  * Return number of characters needed to hold the affinity string
8586  * (not including null byte character)
8587  * The resultant string is printed to buffer, which the caller can then
8588  * handle afterwards
8589  */
8590 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8591  kmp_str_buf_t *buffer) {
8592  const char *parse_ptr;
8593  size_t retval;
8594  const kmp_info_t *th;
8595  kmp_str_buf_t field;
8596 
8597  KMP_DEBUG_ASSERT(buffer);
8598  KMP_DEBUG_ASSERT(gtid >= 0);
8599 
8600  __kmp_str_buf_init(&field);
8601  __kmp_str_buf_clear(buffer);
8602 
8603  th = __kmp_threads[gtid];
8604  retval = 0;
8605 
8606  // If format is NULL or zero-length string, then we use
8607  // affinity-format-var ICV
8608  parse_ptr = format;
8609  if (parse_ptr == NULL || *parse_ptr == '\0') {
8610  parse_ptr = __kmp_affinity_format;
8611  }
8612  KMP_DEBUG_ASSERT(parse_ptr);
8613 
8614  while (*parse_ptr != '\0') {
8615  // Parse a field
8616  if (*parse_ptr == '%') {
8617  // Put field in the buffer
8618  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8619  __kmp_str_buf_catbuf(buffer, &field);
8620  retval += rc;
8621  } else {
8622  // Put literal character in buffer
8623  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8624  retval++;
8625  parse_ptr++;
8626  }
8627  }
8628  __kmp_str_buf_free(&field);
8629  return retval;
8630 }
8631 
8632 // Displays the affinity string to stdout
8633 void __kmp_aux_display_affinity(int gtid, const char *format) {
8634  kmp_str_buf_t buf;
8635  __kmp_str_buf_init(&buf);
8636  __kmp_aux_capture_affinity(gtid, format, &buf);
8637  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8638  __kmp_str_buf_free(&buf);
8639 }
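// Illustrative usage (standard OpenMP 5.0 affinity-display API; values are
// hypothetical): user code normally reaches this path via
//   omp_set_affinity_format("tid=%0.4n bound to %A");
//   #pragma omp parallel
//   omp_display_affinity(NULL); // NULL selects the affinity-format-var ICV
// or simply by setting OMP_DISPLAY_AFFINITY=TRUE and OMP_AFFINITY_FORMAT.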
8640 
8641 /* ------------------------------------------------------------------------ */
8642 
8643 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8644  int blocktime = arg; /* argument is in milliseconds */
8645 #if KMP_USE_MONITOR
8646  int bt_intervals;
8647 #endif
8648  kmp_int8 bt_set;
8649 
8650  __kmp_save_internal_controls(thread);
8651 
8652  /* Normalize and set blocktime for the teams */
8653  if (blocktime < KMP_MIN_BLOCKTIME)
8654  blocktime = KMP_MIN_BLOCKTIME;
8655  else if (blocktime > KMP_MAX_BLOCKTIME)
8656  blocktime = KMP_MAX_BLOCKTIME;
8657 
8658  set__blocktime_team(thread->th.th_team, tid, blocktime);
8659  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8660 
8661 #if KMP_USE_MONITOR
8662  /* Calculate and set blocktime intervals for the teams */
8663  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8664 
8665  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8666  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8667 #endif
8668 
8669  /* Set whether blocktime has been set to "TRUE" */
8670  bt_set = TRUE;
8671 
8672  set__bt_set_team(thread->th.th_team, tid, bt_set);
8673  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8674 #if KMP_USE_MONITOR
8675  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8676  "bt_intervals=%d, monitor_updates=%d\n",
8677  __kmp_gtid_from_tid(tid, thread->th.th_team),
8678  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8679  __kmp_monitor_wakeups));
8680 #else
8681  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8682  __kmp_gtid_from_tid(tid, thread->th.th_team),
8683  thread->th.th_team->t.t_id, tid, blocktime));
8684 #endif
8685 }
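// Illustrative usage (Intel extension API): this routine backs
// kmp_set_blocktime() and the KMP_BLOCKTIME environment variable, e.g.
//   kmp_set_blocktime(0); // workers go to sleep right after a region ends
// with the requested value clamped to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME]
// as shown above.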
8686 
8687 void __kmp_aux_set_defaults(char const *str, size_t len) {
8688  if (!__kmp_init_serial) {
8689  __kmp_serial_initialize();
8690  }
8691  __kmp_env_initialize(str);
8692 
8693  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8694  __kmp_env_print();
8695  }
8696 } // __kmp_aux_set_defaults
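// Illustrative usage (Intel extension API; the setting shown is only an
// example): this is the backing for kmp_set_defaults(), e.g.
//   kmp_set_defaults("KMP_AFFINITY=compact");
// which feeds the string through __kmp_env_initialize() as if it had been
// supplied in the environment.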
8697 
8698 /* ------------------------------------------------------------------------ */
8699 /* internal fast reduction routines */
8700 
8701 PACKED_REDUCTION_METHOD_T
8702 __kmp_determine_reduction_method(
8703  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8704  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8705  kmp_critical_name *lck) {
8706 
8707  // Default reduction method: critical construct ( lck != NULL, like in current
8708  // PAROPT )
8709  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8710  // can be selected by RTL
8711  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8712  // can be selected by RTL
8713  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8714  // among generated by PAROPT.
8715 
8716  PACKED_REDUCTION_METHOD_T retval;
8717 
8718  int team_size;
8719 
8720  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8721  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8722 
8723 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8724  (loc && \
8725  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8726 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8727 
8728  retval = critical_reduce_block;
8729 
8730  // another way of getting the team size (with 1 dynamic dereference) is slower
8731  team_size = __kmp_get_team_num_threads(global_tid);
8732  if (team_size == 1) {
8733 
8734  retval = empty_reduce_block;
8735 
8736  } else {
8737 
8738  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8739 
8740 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8741  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8742 
8743 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8744  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8745 
8746  int teamsize_cutoff = 4;
8747 
8748 #if KMP_MIC_SUPPORTED
8749  if (__kmp_mic_type != non_mic) {
8750  teamsize_cutoff = 8;
8751  }
8752 #endif
8753  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8754  if (tree_available) {
8755  if (team_size <= teamsize_cutoff) {
8756  if (atomic_available) {
8757  retval = atomic_reduce_block;
8758  }
8759  } else {
8760  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8761  }
8762  } else if (atomic_available) {
8763  retval = atomic_reduce_block;
8764  }
8765 #else
8766 #error "Unknown or unsupported OS"
8767 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8768  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8769 
8770 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8771 
8772 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8773 
8774  // basic tuning
8775 
8776  if (atomic_available) {
8777  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8778  retval = atomic_reduce_block;
8779  }
8780  } // otherwise: use critical section
8781 
8782 #elif KMP_OS_DARWIN
8783 
8784  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8785  if (atomic_available && (num_vars <= 3)) {
8786  retval = atomic_reduce_block;
8787  } else if (tree_available) {
8788  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8789  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8790  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8791  }
8792  } // otherwise: use critical section
8793 
8794 #else
8795 #error "Unknown or unsupported OS"
8796 #endif
8797 
8798 #else
8799 #error "Unknown or unsupported architecture"
8800 #endif
8801  }
8802 
8803  // KMP_FORCE_REDUCTION
8804 
8805  // If the team is serialized (team_size == 1), ignore the forced reduction
8806  // method and stay with the unsynchronized method (empty_reduce_block)
8807  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8808  team_size != 1) {
8809 
8810  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8811 
8812  int atomic_available, tree_available;
8813 
8814  switch ((forced_retval = __kmp_force_reduction_method)) {
8815  case critical_reduce_block:
8816  KMP_ASSERT(lck); // lck should be != 0
8817  break;
8818 
8819  case atomic_reduce_block:
8820  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8821  if (!atomic_available) {
8822  KMP_WARNING(RedMethodNotSupported, "atomic");
8823  forced_retval = critical_reduce_block;
8824  }
8825  break;
8826 
8827  case tree_reduce_block:
8828  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8829  if (!tree_available) {
8830  KMP_WARNING(RedMethodNotSupported, "tree");
8831  forced_retval = critical_reduce_block;
8832  } else {
8833 #if KMP_FAST_REDUCTION_BARRIER
8834  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8835 #endif
8836  }
8837  break;
8838 
8839  default:
8840  KMP_ASSERT(0); // "unsupported method specified"
8841  }
8842 
8843  retval = forced_retval;
8844  }
8845 
8846  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8847 
8848 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8849 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8850 
8851  return (retval);
8852 }
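// Worked example (hypothetical platform/team): on x86_64 Linux, when the
// compiler provides reduce_data/reduce_func and sets KMP_IDENT_ATOMIC_REDUCE,
// a team of 16 threads exceeds the teamsize cutoff (4, or 8 on MIC) and the
// tree method with the reduction barrier is chosen, while a team of 2 threads
// falls under the cutoff and picks the atomic method instead.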
8853 // this function is for testing set/get/determine reduce method
8854 kmp_int32 __kmp_get_reduce_method(void) {
8855  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8856 }
8857 
8858 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8859 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8860 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8861 
8862 // Hard pause shuts down the runtime completely. Resume happens naturally when
8863 // OpenMP is used subsequently.
8864 void __kmp_hard_pause() {
8865  __kmp_pause_status = kmp_hard_paused;
8866  __kmp_internal_end_thread(-1);
8867 }
8868 
8869 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8870 void __kmp_resume_if_soft_paused() {
8871  if (__kmp_pause_status == kmp_soft_paused) {
8872  __kmp_pause_status = kmp_not_paused;
8873 
8874  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8875  kmp_info_t *thread = __kmp_threads[gtid];
8876  if (thread) { // Wake it if sleeping
8877  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8878  thread);
8879  if (fl.is_sleeping())
8880  fl.resume(gtid);
8881  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8882  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8883  } else { // thread holds the lock and may sleep soon
8884  do { // until either the thread sleeps, or we can get the lock
8885  if (fl.is_sleeping()) {
8886  fl.resume(gtid);
8887  break;
8888  } else if (__kmp_try_suspend_mx(thread)) {
8889  __kmp_unlock_suspend_mx(thread);
8890  break;
8891  }
8892  } while (1);
8893  }
8894  }
8895  }
8896  }
8897 }
8898 
8899 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8900 // TODO: add warning messages
8901 int __kmp_pause_resource(kmp_pause_status_t level) {
8902  if (level == kmp_not_paused) { // requesting resume
8903  if (__kmp_pause_status == kmp_not_paused) {
8904  // error message about runtime not being paused, so can't resume
8905  return 1;
8906  } else {
8907  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8908  __kmp_pause_status == kmp_hard_paused);
8909  __kmp_pause_status = kmp_not_paused;
8910  return 0;
8911  }
8912  } else if (level == kmp_soft_paused) { // requesting soft pause
8913  if (__kmp_pause_status != kmp_not_paused) {
8914  // error message about already being paused
8915  return 1;
8916  } else {
8917  __kmp_soft_pause();
8918  return 0;
8919  }
8920  } else if (level == kmp_hard_paused) { // requesting hard pause
8921  if (__kmp_pause_status != kmp_not_paused) {
8922  // error message about already being paused
8923  return 1;
8924  } else {
8925  __kmp_hard_pause();
8926  return 0;
8927  }
8928  } else {
8929  // error message about invalid level
8930  return 1;
8931  }
8932 }
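// Illustrative usage (standard OpenMP 5.0 pause API): application code
// reaches this routine through calls such as
//   if (omp_pause_resource_all(omp_pause_soft) == 0) {
//     /* worker threads are asleep; run a non-OpenMP phase here */
//   }
// As coded above, asking to pause while already paused (or to resume while
// not paused) fails and returns 1.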
8933 
8934 void __kmp_omp_display_env(int verbose) {
8935  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8936  if (__kmp_init_serial == 0)
8937  __kmp_do_serial_initialize();
8938  __kmp_display_env_impl(!verbose, verbose);
8939  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8940 }
8941 
8942 // The team size is changing, so distributed barrier must be modified
8943 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
8944  int new_nthreads) {
8945  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
8946  bp_dist_bar);
8947  kmp_info_t **other_threads = team->t.t_threads;
8948 
8949  // We want all the workers to stop waiting on the barrier while we adjust the
8950  // size of the team.
8951  for (int f = 1; f < old_nthreads; ++f) {
8952  KMP_DEBUG_ASSERT(other_threads[f] != NULL);
8953  // Ignore threads that are already inactive or not present in the team
8954  if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
8955  // teams construct causes thread_limit to get passed in, and some of
8956  // those could be inactive; just ignore them
8957  continue;
8958  }
8959  // If thread is transitioning still to in_use state, wait for it
8960  if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
8961  while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
8962  KMP_CPU_PAUSE();
8963  }
8964  // The thread should be in_use now
8965  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
8966  // Transition to unused state
8967  team->t.t_threads[f]->th.th_used_in_team.store(2);
8968  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
8969  }
8970  // Release all the workers
8971  team->t.b->go_release();
8972 
8973  KMP_MFENCE();
8974 
8975  // Workers should see transition status 2 and move to 0; but may need to be
8976  // woken up first
8977  int count = old_nthreads - 1;
8978  while (count > 0) {
8979  count = old_nthreads - 1;
8980  for (int f = 1; f < old_nthreads; ++f) {
8981  if (other_threads[f]->th.th_used_in_team.load() != 0) {
8982  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
8983  kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
8984  void *, other_threads[f]->th.th_sleep_loc);
8985  __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
8986  }
8987  } else {
8988  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
8989  count--;
8990  }
8991  }
8992  }
8993  // Now update the barrier size
8994  team->t.b->update_num_threads(new_nthreads);
8995  team->t.b->go_reset();
8996 }
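// th_used_in_team acts as a small per-thread state machine for the
// distributed-barrier resize protocol implemented above and in
// __kmp_add_threads_to_team() below:
//   0 - not part of the team
//   1 - in the team and usable
//   2 - told by the resizing thread to leave the team (worker moves 2 -> 0)
//   3 - asked to rejoin the team (worker moves 3 -> 1)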
8997 
8998 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
8999  // Add the threads back to the team
9000  KMP_DEBUG_ASSERT(team);
9001  // Threads were paused and pointed at th_used_in_team temporarily during a
9002  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9003  // the thread that it should transition itself back into the team. Then, if
9004  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9005  // to wake it up.
9006  for (int f = 1; f < new_nthreads; ++f) {
9007  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9008  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9009  3);
9010  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9011  __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9012  (kmp_flag_32<false, false> *)NULL);
9013  }
9014  }
9015  // The threads should be transitioning to the team; when they are done, they
9016  // should have set th_used_in_team to 1. This loop makes the primary thread
9017  // wait until all threads are in the team and waiting in the barrier.
9018  int count = new_nthreads - 1;
9019  while (count > 0) {
9020  count = new_nthreads - 1;
9021  for (int f = 1; f < new_nthreads; ++f) {
9022  if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9023  count--;
9024  }
9025  }
9026  }
9027 }
9028 
9029 // Globals and functions for hidden helper task
9030 kmp_info_t **__kmp_hidden_helper_threads;
9031 kmp_info_t *__kmp_hidden_helper_main_thread;
9032 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9033 #if KMP_OS_LINUX
9034 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9035 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9036 #else
9037 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9038 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9039 #endif
9040 
9041 namespace {
9042 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9043 
9044 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9045  // This is an explicit synchronization of all hidden helper threads, in case
9046  // a regular thread pushes a hidden helper task to a hidden helper thread
9047  // that has not yet been awakened since the main thread released them after
9048  // creating the team.
9049  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9050  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9051  __kmp_hidden_helper_threads_num)
9052  ;
9053 
9054  // If main thread, then wait for signal
9055  if (__kmpc_master(nullptr, *gtid)) {
9056  // First, unset the initial state and release the initial thread
9057  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9058  __kmp_hidden_helper_initz_release();
9059  __kmp_hidden_helper_main_thread_wait();
9060  // Now wake up all worker threads
9061  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9062  __kmp_hidden_helper_worker_thread_signal();
9063  }
9064  }
9065 }
9066 } // namespace
9067 
9068 void __kmp_hidden_helper_threads_initz_routine() {
9069  // Create a new root for hidden helper team/threads
9070  const int gtid = __kmp_register_root(TRUE);
9071  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9072  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9073  __kmp_hidden_helper_main_thread->th.th_set_nproc =
9074  __kmp_hidden_helper_threads_num;
9075 
9076  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9077 
9078  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9079 
9080  // Set the initialization flag to FALSE
9081  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9082 
9083  __kmp_hidden_helper_threads_deinitz_release();
9084 }
9085 
9086 /* Nesting Mode:
9087  Set via KMP_NESTING_MODE, which takes an integer.
9088  Note: we skip duplicate topology levels, and skip levels with only
9089  one entity.
9090  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9091  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9092  in the topology, and initializes the number of threads at each of those
9093  levels to the number of entities at each level, respectively, below the
9094  entity at the parent level.
9095  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9096  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9097  the user to turn nesting on explicitly. This is an even more experimental
9098  option to this experimental feature, and may change or go away in the
9099  future.
9100 */
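/* Illustrative example (hypothetical machine): with KMP_NESTING_MODE=1 on a
   node with 2 sockets, 8 cores per socket and 2 hardware threads per core,
   __kmp_set_nesting_mode_threads() below would yield
   __kmp_nesting_nth_level = {2, 8, 2}, i.e. 2 threads at the outermost level,
   then 8, then 2 per level, with max-active-levels raised to match. */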
9101 
9102 // Allocate space to store nesting levels
9103 void __kmp_init_nesting_mode() {
9104  int levels = KMP_HW_LAST;
9105  __kmp_nesting_mode_nlevels = levels;
9106  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9107  for (int i = 0; i < levels; ++i)
9108  __kmp_nesting_nth_level[i] = 0;
9109  if (__kmp_nested_nth.size < levels) {
9110  __kmp_nested_nth.nth =
9111  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9112  __kmp_nested_nth.size = levels;
9113  }
9114 }
9115 
9116 // Set # threads for top levels of nesting; must be called after topology set
9117 void __kmp_set_nesting_mode_threads() {
9118  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9119 
9120  if (__kmp_nesting_mode == 1)
9121  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9122  else if (__kmp_nesting_mode > 1)
9123  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9124 
9125  if (__kmp_topology) { // use topology info
9126  int loc, hw_level;
9127  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9128  loc < __kmp_nesting_mode_nlevels;
9129  loc++, hw_level++) {
9130  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9131  if (__kmp_nesting_nth_level[loc] == 1)
9132  loc--;
9133  }
9134  // Make sure all cores are used
9135  if (__kmp_nesting_mode > 1 && loc > 1) {
9136  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9137  int num_cores = __kmp_topology->get_count(core_level);
9138  int upper_levels = 1;
9139  for (int level = 0; level < loc - 1; ++level)
9140  upper_levels *= __kmp_nesting_nth_level[level];
9141  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9142  __kmp_nesting_nth_level[loc - 1] =
9143  num_cores / __kmp_nesting_nth_level[loc - 2];
9144  }
9145  __kmp_nesting_mode_nlevels = loc;
9146  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9147  } else { // no topology info available; provide a reasonable guesstimation
9148  if (__kmp_avail_proc >= 4) {
9149  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9150  __kmp_nesting_nth_level[1] = 2;
9151  __kmp_nesting_mode_nlevels = 2;
9152  } else {
9153  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9154  __kmp_nesting_mode_nlevels = 1;
9155  }
9156  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9157  }
9158  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9159  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9160  }
9161  set__nproc(thread, __kmp_nesting_nth_level[0]);
9162  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9163  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9164  if (get__max_active_levels(thread) > 1) {
9165  // if max levels was set, set nesting mode levels to same
9166  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9167  }
9168  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9169  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9170 }