LLVM OpenMP* Runtime Library
Loading...
Searching...
No Matches
kmp_tasking.cpp
1/*
2 * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_i18n.h"
15#include "kmp_itt.h"
16#include "kmp_stats.h"
17#include "kmp_wait_release.h"
18#include "kmp_taskdeps.h"
19
20#if OMPT_SUPPORT
21#include "ompt-specific.h"
22#endif
23
#if ENABLE_LIBOMPTARGET
// Function pointer filled in at run time by __kmp_init_target_task(); it stays
// at its static-initialized value until that function has been called.
static void (*tgt_target_nowait_query)(void **);

// __kmp_init_target_task: look up the symbol "__tgt_target_nowait_query" with
// KMP_DLSYM and store the result in tgt_target_nowait_query.
void __kmp_init_target_task() {
  // The function pointer is written through a void** view of its storage, so
  // the lookup result is stored without a function-pointer cast.
  void **query_slot = (void **)(&tgt_target_nowait_query);
  *query_slot = KMP_DLSYM("__tgt_target_nowait_query");
}
#endif
31
32/* forward declaration */
33static void __kmp_enable_tasking(kmp_task_team_t *task_team,
34 kmp_info_t *this_thr);
35static void __kmp_alloc_task_deque(kmp_info_t *thread,
36 kmp_thread_data_t *thread_data);
37static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
38 kmp_task_team_t *task_team);
39static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
40#if OMPX_TASKGRAPH
41static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
42int __kmp_taskloop_task(int gtid, void *ptask);
43#endif
44
45// returns 1 if new task is allowed to execute, 0 otherwise
46// checks Task Scheduling constraint (if requested) and
47// mutexinoutset dependencies if any
48static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
49 const kmp_taskdata_t *tasknew,
50 const kmp_taskdata_t *taskcurr) {
51 if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
52 // Check if the candidate obeys the Task Scheduling Constraints (TSC)
53 // only descendant of all deferred tied tasks can be scheduled, checking
54 // the last one is enough, as it in turn is the descendant of all others
55 kmp_taskdata_t *current = taskcurr->td_last_tied;
56 KMP_DEBUG_ASSERT(current != NULL);
57 // check if the task is not suspended on barrier
58 if (current->td_flags.tasktype == TASK_EXPLICIT ||
59 current->td_taskwait_thread > 0) { // <= 0 on barrier
60 kmp_int32 level = current->td_level;
61 kmp_taskdata_t *parent = tasknew->td_parent;
62 while (parent != current && parent->td_level > level) {
63 // check generation up to the level of the current task
64 parent = parent->td_parent;
65 KMP_DEBUG_ASSERT(parent != NULL);
66 }
67 if (parent != current)
68 return false;
69 }
70 }
71 // Check mutexinoutset dependencies, acquire locks
72 kmp_depnode_t *node = tasknew->td_depnode;
73#if OMPX_TASKGRAPH
74 if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
75#else
76 if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
77#endif
78 for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
79 KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
80 if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
81 continue;
82 // could not get the lock, release previous locks
83 for (int j = i - 1; j >= 0; --j)
84 __kmp_release_lock(node->dn.mtx_locks[j], gtid);
85 return false;
86 }
87 // negative num_locks means all locks acquired successfully
88 node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
89 }
90 return true;
91}
92
93// __kmp_realloc_task_deque:
94// Re-allocates a task deque for a particular thread, copies the content from
95// the old deque and adjusts the necessary data structures relating to the
96// deque. This operation must be done with the deque_lock being held
97static void __kmp_realloc_task_deque(kmp_info_t *thread,
98 kmp_thread_data_t *thread_data) {
99 kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
100 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
101 kmp_int32 new_size = 2 * size;
102
103 KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
104 "%d] for thread_data %p\n",
105 __kmp_gtid_from_thread(thread), size, new_size, thread_data));
106
107 kmp_taskdata_t **new_deque =
108 (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
109
110 int i, j;
111 for (i = thread_data->td.td_deque_head, j = 0; j < size;
112 i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
113 new_deque[j] = thread_data->td.td_deque[i];
114
115 __kmp_free(thread_data->td.td_deque);
116
117 thread_data->td.td_deque_head = 0;
118 thread_data->td.td_deque_tail = size;
119 thread_data->td.td_deque = new_deque;
120 thread_data->td.td_deque_size = new_size;
121}
122
123static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
124 kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
125 kmp_thread_data_t *thread_data = &l->td;
126 __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
127 thread_data->td.td_deque_last_stolen = -1;
128 KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
129 "for thread_data %p\n",
130 __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
131 thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
132 INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
133 thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
134 return l;
135}
136
137// The function finds the deque of priority tasks with given priority, or
138// allocates a new deque and put it into sorted (high -> low) list of deques.
139// Deques of non-default priority tasks are shared between all threads in team,
140// as opposed to per-thread deques of tasks with default priority.
141// The function is called under the lock task_team->tt.tt_task_pri_lock.
142static kmp_thread_data_t *
143__kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
144 kmp_thread_data_t *thread_data;
145 kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
146 if (lst->priority == pri) {
147 // Found queue of tasks with given priority.
148 thread_data = &lst->td;
149 } else if (lst->priority < pri) {
150 // All current priority queues contain tasks with lower priority.
151 // Allocate new one for given priority tasks.
152 kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
153 thread_data = &list->td;
154 list->priority = pri;
155 list->next = lst;
156 task_team->tt.tt_task_pri_list = list;
157 } else { // task_team->tt.tt_task_pri_list->priority > pri
158 kmp_task_pri_t *next_queue = lst->next;
159 while (next_queue && next_queue->priority > pri) {
160 lst = next_queue;
161 next_queue = lst->next;
162 }
163 // lst->priority > pri && (next == NULL || pri >= next->priority)
164 if (next_queue == NULL) {
165 // No queue with pri priority, need to allocate new one.
166 kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
167 thread_data = &list->td;
168 list->priority = pri;
169 list->next = NULL;
170 lst->next = list;
171 } else if (next_queue->priority == pri) {
172 // Found queue of tasks with given priority.
173 thread_data = &next_queue->td;
174 } else { // lst->priority > pri > next->priority
175 // insert newly allocated between existed queues
176 kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
177 thread_data = &list->td;
178 list->priority = pri;
179 list->next = next_queue;
180 lst->next = list;
181 }
182 }
183 return thread_data;
184}
185
186// __kmp_push_priority_task: Add a task to the team's priority task deque
187static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
188 kmp_taskdata_t *taskdata,
189 kmp_task_team_t *task_team,
190 kmp_int32 pri) {
191 kmp_thread_data_t *thread_data = NULL;
192 KA_TRACE(20,
193 ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
194 gtid, taskdata, pri));
195
196 // Find task queue specific to priority value
197 kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
198 if (UNLIKELY(lst == NULL)) {
199 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
200 if (task_team->tt.tt_task_pri_list == NULL) {
201 // List of queues is still empty, allocate one.
202 kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
203 thread_data = &list->td;
204 list->priority = pri;
205 list->next = NULL;
206 task_team->tt.tt_task_pri_list = list;
207 } else {
208 // Other thread initialized a queue. Check if it fits and get thread_data.
209 thread_data = __kmp_get_priority_deque_data(task_team, pri);
210 }
211 __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
212 } else {
213 if (lst->priority == pri) {
214 // Found queue of tasks with given priority.
215 thread_data = &lst->td;
216 } else {
217 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
218 thread_data = __kmp_get_priority_deque_data(task_team, pri);
219 __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
220 }
221 }
222 KMP_DEBUG_ASSERT(thread_data);
223
224 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
225 // Check if deque is full
226 if (TCR_4(thread_data->td.td_deque_ntasks) >=
227 TASK_DEQUE_SIZE(thread_data->td)) {
228 if (__kmp_enable_task_throttling &&
229 __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
230 thread->th.th_current_task)) {
231 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
232 KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
233 "TASK_NOT_PUSHED for task %p\n",
234 gtid, taskdata));
235 return TASK_NOT_PUSHED;
236 } else {
237 // expand deque to push the task which is not allowed to execute
238 __kmp_realloc_task_deque(thread, thread_data);
239 }
240 }
241 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
242 TASK_DEQUE_SIZE(thread_data->td));
243 // Push taskdata.
244 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
245 // Wrap index.
246 thread_data->td.td_deque_tail =
247 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
248 TCW_4(thread_data->td.td_deque_ntasks,
249 TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
250 KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
251 KMP_FSYNC_RELEASING(taskdata); // releasing child
252 KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
253 "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
254 gtid, taskdata, thread_data->td.td_deque_ntasks,
255 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
256 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
257 task_team->tt.tt_num_task_pri++; // atomic inc
258 return TASK_SUCCESSFULLY_PUSHED;
259}
260
261// __kmp_push_task: Add a task to the thread's deque
262static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
263 kmp_info_t *thread = __kmp_threads[gtid];
264 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
265
266 // If we encounter a hidden helper task, and the current thread is not a
267 // hidden helper thread, we have to give the task to any hidden helper thread
268 // starting from its shadow one.
269 if (UNLIKELY(taskdata->td_flags.hidden_helper &&
270 !KMP_HIDDEN_HELPER_THREAD(gtid))) {
271 kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
272 __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
273 // Signal the hidden helper threads.
274 __kmp_hidden_helper_worker_thread_signal();
275 return TASK_SUCCESSFULLY_PUSHED;
276 }
277
278 kmp_task_team_t *task_team = thread->th.th_task_team;
279 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
280 kmp_thread_data_t *thread_data;
281
282 KA_TRACE(20,
283 ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
284
285 if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
286 // untied task needs to increment counter so that the task structure is not
287 // freed prematurely
288 kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
289 KMP_DEBUG_USE_VAR(counter);
290 KA_TRACE(
291 20,
292 ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
293 gtid, counter, taskdata));
294 }
295
296 // The first check avoids building task_team thread data if serialized
297 if (UNLIKELY(taskdata->td_flags.task_serial)) {
298 KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
299 "TASK_NOT_PUSHED for task %p\n",
300 gtid, taskdata));
301 return TASK_NOT_PUSHED;
302 }
303
304 // Now that serialized tasks have returned, we can assume that we are not in
305 // immediate exec mode
306 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
307 if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
308 __kmp_enable_tasking(task_team, thread);
309 }
310 KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
311 KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
312
313 if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
314 __kmp_max_task_priority > 0) {
315 int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
316 return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
317 }
318
319 // Find tasking deque specific to encountering thread
320 thread_data = &task_team->tt.tt_threads_data[tid];
321
322 // No lock needed since only owner can allocate. If the task is hidden_helper,
323 // we don't need it either because we have initialized the dequeue for hidden
324 // helper thread data.
325 if (UNLIKELY(thread_data->td.td_deque == NULL)) {
326 __kmp_alloc_task_deque(thread, thread_data);
327 }
328
329 int locked = 0;
330 // Check if deque is full
331 if (TCR_4(thread_data->td.td_deque_ntasks) >=
332 TASK_DEQUE_SIZE(thread_data->td)) {
333 if (__kmp_enable_task_throttling &&
334 __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
335 thread->th.th_current_task)) {
336 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
337 "TASK_NOT_PUSHED for task %p\n",
338 gtid, taskdata));
339 return TASK_NOT_PUSHED;
340 } else {
341 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
342 locked = 1;
343 if (TCR_4(thread_data->td.td_deque_ntasks) >=
344 TASK_DEQUE_SIZE(thread_data->td)) {
345 // expand deque to push the task which is not allowed to execute
346 __kmp_realloc_task_deque(thread, thread_data);
347 }
348 }
349 }
350 // Lock the deque for the task push operation
351 if (!locked) {
352 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
353 // Need to recheck as we can get a proxy task from thread outside of OpenMP
354 if (TCR_4(thread_data->td.td_deque_ntasks) >=
355 TASK_DEQUE_SIZE(thread_data->td)) {
356 if (__kmp_enable_task_throttling &&
357 __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
358 thread->th.th_current_task)) {
359 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
360 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
361 "returning TASK_NOT_PUSHED for task %p\n",
362 gtid, taskdata));
363 return TASK_NOT_PUSHED;
364 } else {
365 // expand deque to push the task which is not allowed to execute
366 __kmp_realloc_task_deque(thread, thread_data);
367 }
368 }
369 }
370 // Must have room since no thread can add tasks but calling thread
371 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
372 TASK_DEQUE_SIZE(thread_data->td));
373
374 thread_data->td.td_deque[thread_data->td.td_deque_tail] =
375 taskdata; // Push taskdata
376 // Wrap index.
377 thread_data->td.td_deque_tail =
378 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
379 TCW_4(thread_data->td.td_deque_ntasks,
380 TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
381 KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
382 KMP_FSYNC_RELEASING(taskdata); // releasing child
383 KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
384 "task=%p ntasks=%d head=%u tail=%u\n",
385 gtid, taskdata, thread_data->td.td_deque_ntasks,
386 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
387
388 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
389
390 return TASK_SUCCESSFULLY_PUSHED;
391}
392
393// __kmp_pop_current_task_from_thread: set up current task from called thread
394// when team ends
395//
396// this_thr: thread structure to set current_task in.
397void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
398 KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
399 "this_thread=%p, curtask=%p, "
400 "curtask_parent=%p\n",
401 0, this_thr, this_thr->th.th_current_task,
402 this_thr->th.th_current_task->td_parent));
403
404 this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
405
406 KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
407 "this_thread=%p, curtask=%p, "
408 "curtask_parent=%p\n",
409 0, this_thr, this_thr->th.th_current_task,
410 this_thr->th.th_current_task->td_parent));
411}
412
413// __kmp_push_current_task_to_thread: set up current task in called thread for a
414// new team
415//
416// this_thr: thread structure to set up
417// team: team for implicit task data
418// tid: thread within team to set up
419void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
420 int tid) {
421 // current task of the thread is a parent of the new just created implicit
422 // tasks of new team
423 KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
424 "curtask=%p "
425 "parent_task=%p\n",
426 tid, this_thr, this_thr->th.th_current_task,
427 team->t.t_implicit_task_taskdata[tid].td_parent));
428
429 KMP_DEBUG_ASSERT(this_thr != NULL);
430
431 if (tid == 0) {
432 if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
433 team->t.t_implicit_task_taskdata[0].td_parent =
434 this_thr->th.th_current_task;
435 this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
436 }
437 } else {
438 team->t.t_implicit_task_taskdata[tid].td_parent =
439 team->t.t_implicit_task_taskdata[0].td_parent;
440 this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
441 }
442
443 KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
444 "curtask=%p "
445 "parent_task=%p\n",
446 tid, this_thr, this_thr->th.th_current_task,
447 team->t.t_implicit_task_taskdata[tid].td_parent));
448}
449
450// __kmp_task_start: bookkeeping for a task starting execution
451//
452// GTID: global thread id of calling thread
453// task: task starting execution
454// current_task: task suspending
455static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
456 kmp_taskdata_t *current_task) {
457 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
458 kmp_info_t *thread = __kmp_threads[gtid];
459
460 KA_TRACE(10,
461 ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
462 gtid, taskdata, current_task));
463
464 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
465
466 // mark currently executing task as suspended
467 // TODO: GEH - make sure root team implicit task is initialized properly.
468 // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
469 current_task->td_flags.executing = 0;
470
471 // mark starting task as executing and as current task
472 thread->th.th_current_task = taskdata;
473
474 KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
475 taskdata->td_flags.tiedness == TASK_UNTIED);
476 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
477 taskdata->td_flags.tiedness == TASK_UNTIED);
478 taskdata->td_flags.started = 1;
479 taskdata->td_flags.executing = 1;
480 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
481 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
482
483 // GEH TODO: shouldn't we pass some sort of location identifier here?
484 // APT: yes, we will pass location here.
485 // need to store current thread state (in a thread or taskdata structure)
486 // before setting work_state, otherwise wrong state is set after end of task
487
488 KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
489
490 return;
491}
492
493#if OMPT_SUPPORT
494//------------------------------------------------------------------------------
495
496// __ompt_task_start:
497// Build and trigger task-begin event
498static inline void __ompt_task_start(kmp_task_t *task,
499 kmp_taskdata_t *current_task,
500 kmp_int32 gtid) {
501 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
502 ompt_task_status_t status = ompt_task_switch;
503 if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
504 status = ompt_task_yield;
505 __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
506 }
507 /* let OMPT know that we're about to run this task */
508 if (ompt_enabled.ompt_callback_task_schedule) {
509 ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
510 &(current_task->ompt_task_info.task_data), status,
511 &(taskdata->ompt_task_info.task_data));
512 }
513 taskdata->ompt_task_info.scheduling_parent = current_task;
514}
515
516// __ompt_task_finish:
517// Build and trigger final task-schedule event
518static inline void __ompt_task_finish(kmp_task_t *task,
519 kmp_taskdata_t *resumed_task,
520 ompt_task_status_t status) {
521 if (ompt_enabled.ompt_callback_task_schedule) {
522 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
523 if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
524 taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
525 status = ompt_task_cancel;
526 }
527
528 /* let OMPT know that we're returning to the callee task */
529 ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
530 &(taskdata->ompt_task_info.task_data), status,
531 (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
532 }
533}
534#endif
535
536template <bool ompt>
537static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
538 kmp_task_t *task,
539 void *frame_address,
540 void *return_address) {
541 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
542 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
543
544 KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
545 "current_task=%p\n",
546 gtid, loc_ref, taskdata, current_task));
547
548 if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
549 // untied task needs to increment counter so that the task structure is not
550 // freed prematurely
551 kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
552 KMP_DEBUG_USE_VAR(counter);
553 KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
554 "incremented for task %p\n",
555 gtid, counter, taskdata));
556 }
557
558 taskdata->td_flags.task_serial =
559 1; // Execute this task immediately, not deferred.
560 __kmp_task_start(gtid, task, current_task);
561
562#if OMPT_SUPPORT
563 if (ompt) {
564 if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
565 current_task->ompt_task_info.frame.enter_frame.ptr =
566 taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
567 current_task->ompt_task_info.frame.enter_frame_flags =
568 taskdata->ompt_task_info.frame.exit_frame_flags =
569 OMPT_FRAME_FLAGS_APP;
570 }
571 if (ompt_enabled.ompt_callback_task_create) {
572 ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
573 ompt_callbacks.ompt_callback(ompt_callback_task_create)(
574 &(parent_info->task_data), &(parent_info->frame),
575 &(taskdata->ompt_task_info.task_data),
576 TASK_TYPE_DETAILS_FORMAT(taskdata), 0, return_address);
577 }
578 __ompt_task_start(task, current_task, gtid);
579 }
580#endif // OMPT_SUPPORT
581
582 KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
583 loc_ref, taskdata));
584}
585
#if OMPT_SUPPORT
// __kmpc_omp_task_begin_if0_ompt: out-of-line (OMPT_NOINLINE) instantiation
// of the task-begin template with OMPT handling switched on.
//
// All arguments are forwarded unchanged to
// __kmpc_omp_task_begin_if0_template<true>.
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template</*ompt=*/true>(
      loc_ref, gtid, task, frame_address, return_address);
}
#endif // OMPT_SUPPORT
596
597// __kmpc_omp_task_begin_if0: report that a given serialized task has started
598// execution
599//
600// loc_ref: source location information; points to beginning of task block.
601// gtid: global thread number.
602// task: task thunk for the started task.
603#ifdef __s390x__
604// This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x.
605// In order for it to work correctly, the caller also needs to be compiled with
606// backchain. If a caller is compiled without backchain,
607// OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not
608// crash.
609__attribute__((target("backchain")))
610#endif
611void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
612 kmp_task_t *task) {
613#if OMPT_SUPPORT
614 if (UNLIKELY(ompt_enabled.enabled)) {
615 OMPT_STORE_RETURN_ADDRESS(gtid);
616 __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
617 OMPT_GET_FRAME_ADDRESS(1),
618 OMPT_LOAD_RETURN_ADDRESS(gtid));
619 return;
620 }
621#endif
622 __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
623}
624
#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
//
// loc_ref: source location information; only used in the trace messages
// gtid:    global thread id of the calling thread
// task:    task thunk of the task that starts
//
// Compiled only when TASK_UNUSED is defined. Simply runs __kmp_task_start
// with the thread's current task as the task being suspended.
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
}
#endif // TASK_UNUSED
643
644// __kmp_free_task: free the current task space and the space for shareds
645//
646// gtid: Global thread ID of calling thread
647// taskdata: task to free
648// thread: thread data structure of caller
649static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
650 kmp_info_t *thread) {
651 KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
652 taskdata));
653
654 // Check to make sure all flags and counters have the correct values
655 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
656 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
657 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
658 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
659 KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
660 taskdata->td_flags.task_serial == 1);
661 KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
662 kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
663 // Clear data to not be re-used later by mistake.
664 task->data1.destructors = NULL;
665 task->data2.priority = 0;
666
667 taskdata->td_flags.freed = 1;
668#if OMPX_TASKGRAPH
669 // do not free tasks in taskgraph
670 if (!taskdata->is_taskgraph) {
671#endif
672// deallocate the taskdata and shared variable blocks associated with this task
673#if USE_FAST_MEMORY
674 __kmp_fast_free(thread, taskdata);
675#else /* ! USE_FAST_MEMORY */
676 __kmp_thread_free(thread, taskdata);
677#endif
678#if OMPX_TASKGRAPH
679 } else {
680 taskdata->td_flags.complete = 0;
681 taskdata->td_flags.started = 0;
682 taskdata->td_flags.freed = 0;
683 taskdata->td_flags.executing = 0;
684 taskdata->td_flags.task_serial =
685 (taskdata->td_parent->td_flags.final ||
686 taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);
687
688 // taskdata->td_allow_completion_event.pending_events_count = 1;
689 KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
690 KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
691 // start at one because counts current task and children
692 KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
693 }
694#endif
695
696 KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
697}
698
699// __kmp_free_task_and_ancestors: free the current task and ancestors without
700// children
701//
702// gtid: Global thread ID of calling thread
703// taskdata: task to free
704// thread: thread data structure of caller
705static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
706 kmp_taskdata_t *taskdata,
707 kmp_info_t *thread) {
708 // Proxy tasks must always be allowed to free their parents
709 // because they can be run in background even in serial mode.
710 kmp_int32 team_serial =
711 (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
712 !taskdata->td_flags.proxy;
713 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
714
715 kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
716 KMP_DEBUG_ASSERT(children >= 0);
717
718 // Now, go up the ancestor tree to see if any ancestors can now be freed.
719 while (children == 0) {
720 kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
721
722 KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
723 "and freeing itself\n",
724 gtid, taskdata));
725
726 // --- Deallocate my ancestor task ---
727 __kmp_free_task(gtid, taskdata, thread);
728
729 taskdata = parent_taskdata;
730
731 if (team_serial)
732 return;
733 // Stop checking ancestors at implicit task instead of walking up ancestor
734 // tree to avoid premature deallocation of ancestors.
735 if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
736 if (taskdata->td_dephash) { // do we need to cleanup dephash?
737 int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
738 kmp_tasking_flags_t flags_old = taskdata->td_flags;
739 if (children == 0 && flags_old.complete == 1) {
740 kmp_tasking_flags_t flags_new = flags_old;
741 flags_new.complete = 0;
742 if (KMP_COMPARE_AND_STORE_ACQ32(
743 RCAST(kmp_int32 *, &taskdata->td_flags),
744 *RCAST(kmp_int32 *, &flags_old),
745 *RCAST(kmp_int32 *, &flags_new))) {
746 KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
747 "dephash of implicit task %p\n",
748 gtid, taskdata));
749 // cleanup dephash of finished implicit task
750 __kmp_dephash_free_entries(thread, taskdata->td_dephash);
751 }
752 }
753 }
754 return;
755 }
756 // Predecrement simulated by "- 1" calculation
757 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
758 KMP_DEBUG_ASSERT(children >= 0);
759 }
760
761 KA_TRACE(
762 20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
763 "not freeing it yet\n",
764 gtid, taskdata, children));
765}
766
767// Only need to keep track of child task counts if any of the following:
768// 1. team parallel and tasking not serialized;
769// 2. it is a proxy or detachable or hidden helper task
770// 3. the children counter of its parent task is greater than 0.
771// The reason for the 3rd one is for serialized team that found detached task,
772// hidden helper task, T. In this case, the execution of T is still deferred,
773// and it is also possible that a regular task depends on T. In this case, if we
774// don't track the children, task synchronization will be broken.
775static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
776 kmp_tasking_flags_t flags = taskdata->td_flags;
777 bool ret = !(flags.team_serial || flags.tasking_ser);
778 ret = ret || flags.proxy == TASK_PROXY ||
779 flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
780 ret = ret ||
781 KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
782#if OMPX_TASKGRAPH
783 if (taskdata->td_taskgroup && taskdata->is_taskgraph)
784 ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
785#endif
786 return ret;
787}
788
789// __kmp_task_finish: bookkeeping to do when a task finishes execution
790//
791// gtid: global thread ID for calling thread
792// task: task to be finished
793// resumed_task: task to be resumed. (may be NULL if task is serialized)
794//
795// template<ompt>: effectively ompt_enabled.enabled!=0
796// the version with ompt=false is inlined, allowing to optimize away all ompt
797// code in this case
798template <bool ompt>
799static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
800 kmp_taskdata_t *resumed_task) {
801 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
802 kmp_info_t *thread = __kmp_threads[gtid];
803 kmp_task_team_t *task_team =
804 thread->th.th_task_team; // might be NULL for serial teams...
805#if OMPX_TASKGRAPH
806 // to avoid seg fault when we need to access taskdata->td_flags after free when using vanilla taskloop
807 bool is_taskgraph;
808#endif
809#if KMP_DEBUG
810 kmp_int32 children = 0;
811#endif
812 KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
813 "task %p\n",
814 gtid, taskdata, resumed_task));
815
816 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
817
818#if OMPX_TASKGRAPH
819 is_taskgraph = taskdata->is_taskgraph;
820#endif
821
822 if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
823 // untied task needs to check the counter so that the task structure is not
824 // freed prematurely
825 kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
826 KA_TRACE(
827 20,
828 ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
829 gtid, counter, taskdata));
830 if (counter > 0) {
831 // untied task is not done, to be continued possibly by other thread, do
832 // not free it now
833 if (resumed_task == NULL) {
834 KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
835 resumed_task = taskdata->td_parent; // In a serialized task, the resumed
836 // task is the parent
837 }
838 thread->th.th_current_task = resumed_task; // restore current_task
839 resumed_task->td_flags.executing = 1; // resume previous task
840 KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
841 "resuming task %p\n",
842 gtid, taskdata, resumed_task));
843 return;
844 }
845 }
846
847 // bookkeeping for resuming task:
848 // GEH - note tasking_ser => task_serial
849 KMP_DEBUG_ASSERT(
850 (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
851 taskdata->td_flags.task_serial);
852 if (taskdata->td_flags.task_serial) {
853 if (resumed_task == NULL) {
854 resumed_task = taskdata->td_parent; // In a serialized task, the resumed
855 // task is the parent
856 }
857 } else {
858 KMP_DEBUG_ASSERT(resumed_task !=
859 NULL); // verify that resumed task is passed as argument
860 }
861
862 /* If the tasks' destructor thunk flag has been set, we need to invoke the
863 destructor thunk that has been generated by the compiler. The code is
864 placed here, since at this point other tasks might have been released
865 hence overlapping the destructor invocations with some other work in the
866 released tasks. The OpenMP spec is not specific on when the destructors
867 are invoked, so we should be free to choose. */
868 if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
869 kmp_routine_entry_t destr_thunk = task->data1.destructors;
870 KMP_ASSERT(destr_thunk);
871 destr_thunk(gtid, task);
872 }
873
874 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
875 KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
876 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
877
878 bool completed = true;
879 if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
880 if (taskdata->td_allow_completion_event.type ==
881 KMP_EVENT_ALLOW_COMPLETION) {
882 // event hasn't been fulfilled yet. Try to detach task.
883 __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
884 if (taskdata->td_allow_completion_event.type ==
885 KMP_EVENT_ALLOW_COMPLETION) {
886 // task finished execution
887 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
888 taskdata->td_flags.executing = 0; // suspend the finishing task
889
890#if OMPT_SUPPORT
891 // For a detached task, which is not completed, we switch back
892 // the omp_fulfill_event signals completion
893 // locking is necessary to avoid a race with ompt_task_late_fulfill
894 if (ompt)
895 __ompt_task_finish(task, resumed_task, ompt_task_detach);
896#endif
897
898 // no access to taskdata after this point!
899 // __kmp_fulfill_event might free taskdata at any time from now
900
901 taskdata->td_flags.proxy = TASK_PROXY; // proxify!
902 completed = false;
903 }
904 __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
905 }
906 }
907
908 // Tasks with valid target async handles must be re-enqueued.
909 if (taskdata->td_target_data.async_handle != NULL) {
910 // Note: no need to translate gtid to its shadow. If the current thread is a
911 // hidden helper one, then the gtid is already correct. Otherwise, hidden
912 // helper threads are disabled, and gtid refers to a OpenMP thread.
913#if OMPT_SUPPORT
914 if (ompt) {
915 __ompt_task_finish(task, resumed_task, ompt_task_switch);
916 }
917#endif
918 __kmpc_give_task(task, __kmp_tid_from_gtid(gtid));
919 if (KMP_HIDDEN_HELPER_THREAD(gtid))
920 __kmp_hidden_helper_worker_thread_signal();
921 completed = false;
922 }
923
924 if (completed) {
925 taskdata->td_flags.complete = 1; // mark the task as completed
926#if OMPX_TASKGRAPH
927 taskdata->td_flags.onced = 1; // mark the task as ran once already
928#endif
929
930#if OMPT_SUPPORT
931 // This is not a detached task, we are done here
932 if (ompt)
933 __ompt_task_finish(task, resumed_task, ompt_task_complete);
934#endif
935 // TODO: What would be the balance between the conditions in the function
936 // and an atomic operation?
937 if (__kmp_track_children_task(taskdata)) {
938 __kmp_release_deps(gtid, taskdata);
939 // Predecrement simulated by "- 1" calculation
940#if KMP_DEBUG
941 children = -1 +
942#endif
943 KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
944 KMP_DEBUG_ASSERT(children >= 0);
945#if OMPX_TASKGRAPH
946 if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
947#else
948 if (taskdata->td_taskgroup)
949#endif
950 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
951 } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
952 task_team->tt.tt_hidden_helper_task_encountered)) {
953 // if we found proxy or hidden helper tasks there could exist a dependency
954 // chain with the proxy task as origin
955 __kmp_release_deps(gtid, taskdata);
956 }
957 // td_flags.executing must be marked as 0 after __kmp_release_deps has been
958 // called. Othertwise, if a task is executed immediately from the
959 // release_deps code, the flag will be reset to 1 again by this same
960 // function
961 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
962 taskdata->td_flags.executing = 0; // suspend the finishing task
963
964 // Decrement the counter of hidden helper tasks to be executed.
965 if (taskdata->td_flags.hidden_helper) {
966 // Hidden helper tasks can only be executed by hidden helper threads.
967 KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
968 KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
969 }
970 }
971
972 KA_TRACE(
973 20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
974 gtid, taskdata, children));
975
976 // Free this task and then ancestor tasks if they have no children.
977 // Restore th_current_task first as suggested by John:
978 // johnmc: if an asynchronous inquiry peers into the runtime system
979 // it doesn't see the freed task as the current task.
980 thread->th.th_current_task = resumed_task;
981 if (completed)
982 __kmp_free_task_and_ancestors(gtid, taskdata, thread);
983
984 // TODO: GEH - make sure root team implicit task is initialized properly.
985 // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
986 resumed_task->td_flags.executing = 1; // resume previous task
987
988#if OMPX_TASKGRAPH
989 if (is_taskgraph && __kmp_track_children_task(taskdata) &&
990 taskdata->td_taskgroup) {
991 // TDG: we only release taskgroup barrier here because
992 // free_task_and_ancestors will call
993 // __kmp_free_task, which resets all task parameters such as
994 // taskdata->started, etc. If we release the barrier earlier, these
995 // parameters could be read before being reset. This is not an issue for
996 // non-TDG implementation because we never reuse a task(data) structure
997 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
998 }
999#endif
1000
1001 KA_TRACE(
1002 10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
1003 gtid, taskdata, resumed_task));
1004
1005 return;
1006}
1007
1008template <bool ompt>
1009static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
1010 kmp_int32 gtid,
1011 kmp_task_t *task) {
1012 KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
1013 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1014 KMP_DEBUG_ASSERT(gtid >= 0);
1015 // this routine will provide task to resume
1016 __kmp_task_finish<ompt>(gtid, task, NULL);
1017
1018 KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
1019 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1020
1021#if OMPT_SUPPORT
1022 if (ompt) {
1023 ompt_frame_t *ompt_frame;
1024 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1025 ompt_frame->enter_frame = ompt_data_none;
1026 ompt_frame->enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
1027 }
1028#endif
1029
1030 return;
1031}
1032
#if OMPT_SUPPORT
// __kmpc_omp_task_complete_if0_ompt: non-inlined wrapper that runs the if0
// completion template with OMPT handling compiled in.
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  constexpr bool with_ompt = true;
  __kmpc_omp_task_complete_if0_template<with_ompt>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT
1040
1041// __kmpc_omp_task_complete_if0: report that a task has completed execution
1042//
1043// loc_ref: source location information; points to end of task block.
1044// gtid: global thread number.
1045// task: task thunk for the completed task.
1046void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1047 kmp_task_t *task) {
1048#if OMPT_SUPPORT
1049 if (UNLIKELY(ompt_enabled.enabled)) {
1050 __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1051 return;
1052 }
1053#endif
1054 __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1055}
1056
#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
//
// loc_ref: source location information (only used for tracing here).
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  // Not sure how to find task to resume, so none is passed.
  __kmp_task_finish<false>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
}
#endif // TASK_UNUSED
1073
1074// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1075// task for a given thread
1076//
1077// loc_ref: reference to source location of parallel region
1078// this_thr: thread data structure corresponding to implicit task
1079// team: team for this_thr
1080// tid: thread id of given thread within team
1081// set_curr_task: TRUE if need to push current task to thread
1082// NOTE: Routine does not set up the implicit task ICVS. This is assumed to
1083// have already been done elsewhere.
1084// TODO: Get better loc_ref. Value passed in may be NULL
1085void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1086 kmp_team_t *team, int tid, int set_curr_task) {
1087 kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1088
1089 KF_TRACE(
1090 10,
1091 ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1092 tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1093
1094 task->td_task_id = KMP_GEN_TASK_ID();
1095 task->td_team = team;
1096 // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
1097 // in debugger)
1098 task->td_ident = loc_ref;
1099 task->td_taskwait_ident = NULL;
1100 task->td_taskwait_counter = 0;
1101 task->td_taskwait_thread = 0;
1102
1103 task->td_flags.tiedness = TASK_TIED;
1104 task->td_flags.tasktype = TASK_IMPLICIT;
1105 task->td_flags.proxy = TASK_FULL;
1106
1107 // All implicit tasks are executed immediately, not deferred
1108 task->td_flags.task_serial = 1;
1109 task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1110 task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1111
1112 task->td_flags.started = 1;
1113 task->td_flags.executing = 1;
1114 task->td_flags.complete = 0;
1115 task->td_flags.freed = 0;
1116#if OMPX_TASKGRAPH
1117 task->td_flags.onced = 0;
1118#endif
1119
1120 task->td_depnode = NULL;
1121 task->td_last_tied = task;
1122 task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1123
1124 if (set_curr_task) { // only do this init first time thread is created
1125 KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1126 // Not used: don't need to deallocate implicit task
1127 KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1128 task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1129 task->td_dephash = NULL;
1130 __kmp_push_current_task_to_thread(this_thr, team, tid);
1131 } else {
1132 KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1133 KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1134 }
1135
1136#if OMPT_SUPPORT
1137 if (UNLIKELY(ompt_enabled.enabled))
1138 __ompt_task_init(task, tid);
1139#endif
1140
1141 KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1142 team, task));
1143}
1144
1145// __kmp_finish_implicit_task: Release resources associated to implicit tasks
1146// at the end of parallel regions. Some resources are kept for reuse in the next
1147// parallel region.
1148//
1149// thread: thread data structure corresponding to implicit task
1150void __kmp_finish_implicit_task(kmp_info_t *thread) {
1151 kmp_taskdata_t *task = thread->th.th_current_task;
1152 if (task->td_dephash) {
1153 int children;
1154 task->td_flags.complete = 1;
1155#if OMPX_TASKGRAPH
1156 task->td_flags.onced = 1;
1157#endif
1158 children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1159 kmp_tasking_flags_t flags_old = task->td_flags;
1160 if (children == 0 && flags_old.complete == 1) {
1161 kmp_tasking_flags_t flags_new = flags_old;
1162 flags_new.complete = 0;
1163 if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1164 *RCAST(kmp_int32 *, &flags_old),
1165 *RCAST(kmp_int32 *, &flags_new))) {
1166 KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1167 "dephash of implicit task %p\n",
1168 thread->th.th_info.ds.ds_gtid, task));
1169 __kmp_dephash_free_entries(thread, task->td_dephash);
1170 }
1171 }
1172 }
1173}
1174
1175// __kmp_free_implicit_task: Release resources associated to implicit tasks
1176// when these are destroyed regions
1177//
1178// thread: thread data structure corresponding to implicit task
1179void __kmp_free_implicit_task(kmp_info_t *thread) {
1180 kmp_taskdata_t *task = thread->th.th_current_task;
1181 if (task && task->td_dephash) {
1182 __kmp_dephash_free(thread, task->td_dephash);
1183 task->td_dephash = NULL;
1184 }
1185}
1186
// Round size up to a multiple of val, where val is expected to be a power of
// two. Used to insert padding between structures co-allocated using a single
// malloc() call. If rounding up would overflow size_t, the value rounded DOWN
// to a multiple of val is returned instead.
static size_t __kmp_round_up_to_val(size_t size, size_t val) {
  const size_t remainder = size & (val - 1);
  if (remainder == 0)
    return size; // already a multiple of val

  const size_t aligned_down = size - remainder; // same as size & ~(val - 1)
  const size_t aligned_up = aligned_down + val; // unsigned sum wraps on overflow
  // A wrapped sum is smaller than its operand: keep the rounded-down value.
  return aligned_up < aligned_down ? aligned_down : aligned_up;
} // __kmp_round_up_to_val
1198
1199// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1200//
1201// loc_ref: source location information
1202// gtid: global thread number.
1203// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1204// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1205// sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1206// private vars accessed in task.
1207// sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1208// in task.
1209// task_entry: Pointer to task code entry point generated by compiler.
1210// returns: a pointer to the allocated kmp_task_t structure (task).
1211kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1212 kmp_tasking_flags_t *flags,
1213 size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1214 kmp_routine_entry_t task_entry) {
1215 kmp_task_t *task;
1216 kmp_taskdata_t *taskdata;
1217 kmp_info_t *thread = __kmp_threads[gtid];
1218 kmp_team_t *team = thread->th.th_team;
1219 kmp_taskdata_t *parent_task = thread->th.th_current_task;
1220 size_t shareds_offset;
1221
1222 if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1223 __kmp_middle_initialize();
1224
1225 if (flags->hidden_helper) {
1226 if (__kmp_enable_hidden_helper) {
1227 if (!TCR_4(__kmp_init_hidden_helper))
1228 __kmp_hidden_helper_initialize();
1229 } else {
1230 // If the hidden helper task is not enabled, reset the flag to FALSE.
1231 flags->hidden_helper = FALSE;
1232 }
1233 }
1234
1235 KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1236 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1237 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1238 sizeof_shareds, task_entry));
1239
1240 KMP_DEBUG_ASSERT(parent_task);
1241 if (parent_task->td_flags.final) {
1242 if (flags->merged_if0) {
1243 }
1244 flags->final = 1;
1245 }
1246
1247 if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1248 // Untied task encountered causes the TSC algorithm to check entire deque of
1249 // the victim thread. If no untied task encountered, then checking the head
1250 // of the deque should be enough.
1251 KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1252 }
1253
1254 // Detachable tasks are not proxy tasks yet but could be in the future. Doing
1255 // the tasking setup
1256 // when that happens is too late.
1257 if (UNLIKELY(flags->proxy == TASK_PROXY ||
1258 flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
1259 if (flags->proxy == TASK_PROXY) {
1260 flags->tiedness = TASK_UNTIED;
1261 flags->merged_if0 = 1;
1262 }
1263 /* are we running in a sequential parallel or tskm_immediate_exec... we need
1264 tasking support enabled */
1265 if ((thread->th.th_task_team) == NULL) {
1266 /* This should only happen if the team is serialized
1267 setup a task team and propagate it to the thread */
1268 KMP_DEBUG_ASSERT(team->t.t_serialized);
1269 KA_TRACE(30,
1270 ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1271 gtid));
1272 __kmp_task_team_setup(thread, team);
1273 thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1274 }
1275 kmp_task_team_t *task_team = thread->th.th_task_team;
1276
1277 /* tasking must be enabled now as the task might not be pushed */
1278 if (!KMP_TASKING_ENABLED(task_team)) {
1279 KA_TRACE(
1280 30,
1281 ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1282 __kmp_enable_tasking(task_team, thread);
1283 kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1284 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1285 // No lock needed since only owner can allocate
1286 if (thread_data->td.td_deque == NULL) {
1287 __kmp_alloc_task_deque(thread, thread_data);
1288 }
1289 }
1290
1291 if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
1292 task_team->tt.tt_found_proxy_tasks == FALSE)
1293 TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1294 if (flags->hidden_helper &&
1295 task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1296 TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1297 }
1298
1299 // Calculate shared structure offset including padding after kmp_task_t struct
1300 // to align pointers in shared struct
1301 shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1302 shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(kmp_uint64));
1303
1304 // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1305 KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1306 shareds_offset));
1307 KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1308 sizeof_shareds));
1309
1310 // Avoid double allocation here by combining shareds with taskdata
1311#if USE_FAST_MEMORY
1312 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1313 sizeof_shareds);
1314#else /* ! USE_FAST_MEMORY */
1315 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1316 sizeof_shareds);
1317#endif /* USE_FAST_MEMORY */
1318
1319 task = KMP_TASKDATA_TO_TASK(taskdata);
1320
1321// Make sure task & taskdata are aligned appropriately
1322#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD
1323 KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1324 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1325#else
1326 KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1327 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1328#endif
1329 if (sizeof_shareds > 0) {
1330 // Avoid double allocation here by combining shareds with taskdata
1331 task->shareds = &((char *)taskdata)[shareds_offset];
1332 // Make sure shareds struct is aligned to pointer size
1333 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1334 0);
1335 } else {
1336 task->shareds = NULL;
1337 }
1338 task->routine = task_entry;
1339 task->part_id = 0; // AC: Always start with 0 part id
1340
1341 taskdata->td_task_id = KMP_GEN_TASK_ID();
1342 taskdata->td_team = thread->th.th_team;
1343 taskdata->td_alloc_thread = thread;
1344 taskdata->td_parent = parent_task;
1345 taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1346 KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1347 taskdata->td_ident = loc_ref;
1348 taskdata->td_taskwait_ident = NULL;
1349 taskdata->td_taskwait_counter = 0;
1350 taskdata->td_taskwait_thread = 0;
1351 KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1352 // avoid copying icvs for proxy tasks
1353 if (flags->proxy == TASK_FULL)
1354 copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1355
1356 taskdata->td_flags = *flags;
1357 taskdata->td_task_team = thread->th.th_task_team;
1358 taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1359 taskdata->td_flags.tasktype = TASK_EXPLICIT;
1360 // If it is hidden helper task, we need to set the team and task team
1361 // correspondingly.
1362 if (flags->hidden_helper) {
1363 kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1364 taskdata->td_team = shadow_thread->th.th_team;
1365 taskdata->td_task_team = shadow_thread->th.th_task_team;
1366 }
1367
1368 // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1369 taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1370
1371 // GEH - TODO: fix this to copy parent task's value of team_serial flag
1372 taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1373
1374 // GEH - Note we serialize the task if the team is serialized to make sure
1375 // implicit parallel region tasks are not left until program termination to
1376 // execute. Also, it helps locality to execute immediately.
1377
1378 taskdata->td_flags.task_serial =
1379 (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1380 taskdata->td_flags.tasking_ser || flags->merged_if0);
1381
1382 taskdata->td_flags.started = 0;
1383 taskdata->td_flags.executing = 0;
1384 taskdata->td_flags.complete = 0;
1385 taskdata->td_flags.freed = 0;
1386#if OMPX_TASKGRAPH
1387 taskdata->td_flags.onced = 0;
1388 taskdata->is_taskgraph = 0;
1389 taskdata->tdg = nullptr;
1390#endif
1391 KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1392 // start at one because counts current task and children
1393 KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1394 taskdata->td_taskgroup =
1395 parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1396 taskdata->td_dephash = NULL;
1397 taskdata->td_depnode = NULL;
1398 taskdata->td_target_data.async_handle = NULL;
1399 if (flags->tiedness == TASK_UNTIED)
1400 taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1401 else
1402 taskdata->td_last_tied = taskdata;
1403 taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1404#if OMPT_SUPPORT
1405 if (UNLIKELY(ompt_enabled.enabled))
1406 __ompt_task_init(taskdata, gtid);
1407#endif
1408 // TODO: What would be the balance between the conditions in the function and
1409 // an atomic operation?
1410 if (__kmp_track_children_task(taskdata)) {
1411 KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1412 if (parent_task->td_taskgroup)
1413 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1414 // Only need to keep track of allocated child tasks for explicit tasks since
1415 // implicit not deallocated
1416 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1417 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1418 }
1419 if (flags->hidden_helper) {
1420 taskdata->td_flags.task_serial = FALSE;
1421 // Increment the number of hidden helper tasks to be executed
1422 KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1423 }
1424 }
1425
1426#if OMPX_TASKGRAPH
1427 kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
1428 if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
1429 (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
1430 taskdata->is_taskgraph = 1;
1431 taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
1432 taskdata->td_task_id = KMP_GEN_TASK_ID();
1433 taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
1434 }
1435#endif
1436 KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1437 gtid, taskdata, taskdata->td_parent));
1438
1439 return task;
1440}
1441
1442kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1443 kmp_int32 flags, size_t sizeof_kmp_task_t,
1444 size_t sizeof_shareds,
1445 kmp_routine_entry_t task_entry) {
1446 kmp_task_t *retval;
1447 kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1448 __kmp_assert_valid_gtid(gtid);
1449 input_flags->native = FALSE;
1450 // __kmp_task_alloc() sets up all other runtime flags
1451 KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1452 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1453 gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1454 input_flags->proxy ? "proxy" : "",
1455 input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1456 sizeof_shareds, task_entry));
1457
1458 retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1459 sizeof_shareds, task_entry);
1460
1461 KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1462
1463 return retval;
1464}
1465
1466kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1467 kmp_int32 flags,
1468 size_t sizeof_kmp_task_t,
1469 size_t sizeof_shareds,
1470 kmp_routine_entry_t task_entry,
1471 kmp_int64 device_id) {
1472 auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1473 // target task is untied defined in the specification
1474 input_flags.tiedness = TASK_UNTIED;
1475 input_flags.target = 1;
1476
1477 if (__kmp_enable_hidden_helper)
1478 input_flags.hidden_helper = TRUE;
1479
1480 return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1481 sizeof_shareds, task_entry);
1482}
1483
1497kmp_int32
1499 kmp_task_t *new_task, kmp_int32 naffins,
1500 kmp_task_affinity_info_t *affin_list) {
1501 return 0;
1502}
1503
1504// __kmp_invoke_task: invoke the specified task
1505//
1506// gtid: global thread ID of caller
1507// task: the task to invoke
1508// current_task: the task to resume after task invocation
1509#ifdef __s390x__
1510__attribute__((target("backchain")))
1511#endif
1512static void
1513__kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1514 kmp_taskdata_t *current_task) {
1515 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1516 kmp_info_t *thread;
1517 int discard = 0 /* false */;
1518 KA_TRACE(
1519 30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1520 gtid, taskdata, current_task));
1521 KMP_DEBUG_ASSERT(task);
1522 if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1523 taskdata->td_flags.complete == 1)) {
1524 // This is a proxy task that was already completed but it needs to run
1525 // its bottom-half finish
1526 KA_TRACE(
1527 30,
1528 ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1529 gtid, taskdata));
1530
1531 __kmp_bottom_half_finish_proxy(gtid, task);
1532
1533 KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1534 "proxy task %p, resuming task %p\n",
1535 gtid, taskdata, current_task));
1536
1537 return;
1538 }
1539
1540#if OMPT_SUPPORT
1541 // For untied tasks, the first task executed only calls __kmpc_omp_task and
1542 // does not execute code.
1543 ompt_thread_info_t oldInfo;
1544 if (UNLIKELY(ompt_enabled.enabled)) {
1545 // Store the threads states and restore them after the task
1546 thread = __kmp_threads[gtid];
1547 oldInfo = thread->th.ompt_thread_info;
1548 thread->th.ompt_thread_info.wait_id = 0;
1549 thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1550 ? ompt_state_work_serial
1551 : ompt_state_work_parallel;
1552 taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1553 }
1554#endif
1555
1556 // Proxy tasks are not handled by the runtime
1557 if (taskdata->td_flags.proxy != TASK_PROXY) {
1558 __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1559 }
1560
1561 // TODO: cancel tasks if the parallel region has also been cancelled
1562 // TODO: check if this sequence can be hoisted above __kmp_task_start
1563 // if cancellation has been enabled for this run ...
1564 if (UNLIKELY(__kmp_omp_cancellation)) {
1565 thread = __kmp_threads[gtid];
1566 kmp_team_t *this_team = thread->th.th_team;
1567 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1568 if ((taskgroup && taskgroup->cancel_request) ||
1569 (this_team->t.t_cancel_request == cancel_parallel)) {
1570#if OMPT_SUPPORT && OMPT_OPTIONAL
1571 ompt_data_t *task_data;
1572 if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1573 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1574 ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1575 task_data,
1576 ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1577 : ompt_cancel_parallel) |
1578 ompt_cancel_discarded_task,
1579 NULL);
1580 }
1581#endif
1582 KMP_COUNT_BLOCK(TASK_cancelled);
1583 // this task belongs to a task group and we need to cancel it
1584 discard = 1 /* true */;
1585 }
1586 }
1587
1588 // Invoke the task routine and pass in relevant data.
1589 // Thunks generated by gcc take a different argument list.
1590 if (!discard) {
1591 if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1592 taskdata->td_last_tied = current_task->td_last_tied;
1593 KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1594 }
1595#if KMP_STATS_ENABLED
1596 KMP_COUNT_BLOCK(TASK_executed);
1597 switch (KMP_GET_THREAD_STATE()) {
1598 case FORK_JOIN_BARRIER:
1599 KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1600 break;
1601 case PLAIN_BARRIER:
1602 KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1603 break;
1604 case TASKYIELD:
1605 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1606 break;
1607 case TASKWAIT:
1608 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1609 break;
1610 case TASKGROUP:
1611 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1612 break;
1613 default:
1614 KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1615 break;
1616 }
1617#endif // KMP_STATS_ENABLED
1618
1619// OMPT task begin
1620#if OMPT_SUPPORT
1621 if (UNLIKELY(ompt_enabled.enabled))
1622 __ompt_task_start(task, current_task, gtid);
1623#endif
1624#if OMPT_SUPPORT && OMPT_OPTIONAL
1625 if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
1626 taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) {
1627 ompt_data_t instance = ompt_data_none;
1628 instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
1629 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1630 ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
1631 &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
1632 ompt_dispatch_taskloop_chunk, instance);
1633 taskdata->ompt_task_info.dispatch_chunk = {0, 0};
1634 }
1635#endif // OMPT_SUPPORT && OMPT_OPTIONAL
1636
1637#if OMPD_SUPPORT
1638 if (ompd_state & OMPD_ENABLE_BP)
1639 ompd_bp_task_begin();
1640#endif
1641
1642#if USE_ITT_BUILD && USE_ITT_NOTIFY
1643 kmp_uint64 cur_time;
1644 kmp_int32 kmp_itt_count_task =
1645 __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1646 current_task->td_flags.tasktype == TASK_IMPLICIT;
1647 if (kmp_itt_count_task) {
1648 thread = __kmp_threads[gtid];
1649 // Time outer level explicit task on barrier for adjusting imbalance time
1650 if (thread->th.th_bar_arrive_time)
1651 cur_time = __itt_get_timestamp();
1652 else
1653 kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1654 }
1655 KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1656#endif
1657
1658#if ENABLE_LIBOMPTARGET
1659 if (taskdata->td_target_data.async_handle != NULL) {
1660 // If we have a valid target async handle, that means that we have already
1661 // executed the task routine once. We must query for the handle completion
1662 // instead of re-executing the routine.
1663 KMP_ASSERT(tgt_target_nowait_query);
1664 tgt_target_nowait_query(&taskdata->td_target_data.async_handle);
1665 } else
1666#endif
1667 if (task->routine != NULL) {
1668#ifdef KMP_GOMP_COMPAT
1669 if (taskdata->td_flags.native) {
1670 ((void (*)(void *))(*(task->routine)))(task->shareds);
1671 } else
1672#endif /* KMP_GOMP_COMPAT */
1673 {
1674 (*(task->routine))(gtid, task);
1675 }
1676 }
1677 KMP_POP_PARTITIONED_TIMER();
1678
1679#if USE_ITT_BUILD && USE_ITT_NOTIFY
1680 if (kmp_itt_count_task) {
1681 // Barrier imbalance - adjust arrive time with the task duration
1682 thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1683 }
1684 KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1685 KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1686#endif
1687 }
1688
1689#if OMPD_SUPPORT
1690 if (ompd_state & OMPD_ENABLE_BP)
1691 ompd_bp_task_end();
1692#endif
1693
1694 // Proxy tasks are not handled by the runtime
1695 if (taskdata->td_flags.proxy != TASK_PROXY) {
1696#if OMPT_SUPPORT
1697 if (UNLIKELY(ompt_enabled.enabled)) {
1698 thread->th.ompt_thread_info = oldInfo;
1699 if (taskdata->td_flags.tiedness == TASK_TIED) {
1700 taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1701 }
1702 __kmp_task_finish<true>(gtid, task, current_task);
1703 } else
1704#endif
1705 __kmp_task_finish<false>(gtid, task, current_task);
1706 }
1707#if OMPT_SUPPORT
1708 else if (UNLIKELY(ompt_enabled.enabled && taskdata->td_flags.target)) {
1709 __ompt_task_finish(task, current_task, ompt_task_switch);
1710 }
1711#endif
1712
1713 KA_TRACE(
1714 30,
1715 ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1716 gtid, taskdata, current_task));
1717 return;
1718}
1719
1720// __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1721//
1722// loc_ref: location of original task pragma (ignored)
1723// gtid: Global Thread ID of encountering thread
1724// new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1725// Returns:
1726// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1727// be resumed later.
1728// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1729// resumed later.
1730kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1731 kmp_task_t *new_task) {
1732 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1733
1734 KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1735 loc_ref, new_taskdata));
1736
1737#if OMPT_SUPPORT
1738 kmp_taskdata_t *parent;
1739 if (UNLIKELY(ompt_enabled.enabled)) {
1740 parent = new_taskdata->td_parent;
1741 if (ompt_enabled.ompt_callback_task_create) {
1742 ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1743 &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1744 &(new_taskdata->ompt_task_info.task_data),
1745 TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1746 OMPT_GET_RETURN_ADDRESS(0));
1747 }
1748 }
1749#endif
1750
1751 /* Should we execute the new task or queue it? For now, let's just always try
1752 to queue it. If the queue fills up, then we'll execute it. */
1753
1754 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1755 { // Execute this task immediately
1756 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1757 new_taskdata->td_flags.task_serial = 1;
1758 __kmp_invoke_task(gtid, new_task, current_task);
1759 }
1760
1761 KA_TRACE(
1762 10,
1763 ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1764 "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1765 gtid, loc_ref, new_taskdata));
1766
1767#if OMPT_SUPPORT
1768 if (UNLIKELY(ompt_enabled.enabled)) {
1769 parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1770 parent->ompt_task_info.frame.enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
1771 }
1772#endif
1773 return TASK_CURRENT_NOT_QUEUED;
1774}
1775
1776// __kmp_omp_task: Schedule a non-thread-switchable task for execution
1777//
1778// gtid: Global Thread ID of encountering thread
1779// new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1780// serialize_immediate: if TRUE then if the task is executed immediately its
1781// execution will be serialized
1782// Returns:
1783// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1784// be resumed later.
1785// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1786// resumed later.
1787kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1788 bool serialize_immediate) {
1789 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1790
1791#if OMPX_TASKGRAPH
1792 if (new_taskdata->is_taskgraph &&
1793 __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
1794 kmp_tdg_info_t *tdg = new_taskdata->tdg;
1795 // extend the record_map if needed
1796 if (new_taskdata->td_tdg_task_id >= new_taskdata->tdg->map_size) {
1797 __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
1798 // map_size could have been updated by another thread if recursive
1799 // taskloop
1800 if (new_taskdata->td_tdg_task_id >= tdg->map_size) {
1801 kmp_uint old_size = tdg->map_size;
1802 kmp_uint new_size = old_size * 2;
1803 kmp_node_info_t *old_record = tdg->record_map;
1804 kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
1805 new_size * sizeof(kmp_node_info_t));
1806
1807 KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
1808 tdg->record_map = new_record;
1809
1810 __kmp_free(old_record);
1811
1812 for (kmp_int i = old_size; i < new_size; i++) {
1813 kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
1814 __kmp_successors_size * sizeof(kmp_int32));
1815 new_record[i].task = nullptr;
1816 new_record[i].successors = successorsList;
1817 new_record[i].nsuccessors = 0;
1818 new_record[i].npredecessors = 0;
1819 new_record[i].successors_size = __kmp_successors_size;
1820 KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
1821 }
1822 // update the size at the end, so that we avoid other
1823 // threads use old_record while map_size is already updated
1824 tdg->map_size = new_size;
1825 }
1826 __kmp_release_bootstrap_lock(&tdg->graph_lock);
1827 }
1828 // record a task
1829 if (tdg->record_map[new_taskdata->td_tdg_task_id].task == nullptr) {
1830 tdg->record_map[new_taskdata->td_tdg_task_id].task = new_task;
1831 tdg->record_map[new_taskdata->td_tdg_task_id].parent_task =
1832 new_taskdata->td_parent;
1833 KMP_ATOMIC_INC(&tdg->num_tasks);
1834 }
1835 }
1836#endif
1837
1838 /* Should we execute the new task or queue it? For now, let's just always try
1839 to queue it. If the queue fills up, then we'll execute it. */
1840 if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1841 __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1842 { // Execute this task immediately
1843 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1844 if (serialize_immediate)
1845 new_taskdata->td_flags.task_serial = 1;
1846 __kmp_invoke_task(gtid, new_task, current_task);
1847 } else if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1848 __kmp_wpolicy_passive) {
1849 kmp_info_t *this_thr = __kmp_threads[gtid];
1850 kmp_team_t *team = this_thr->th.th_team;
1851 kmp_int32 nthreads = this_thr->th.th_team_nproc;
1852 for (int i = 0; i < nthreads; ++i) {
1853 kmp_info_t *thread = team->t.t_threads[i];
1854 if (thread == this_thr)
1855 continue;
1856 if (thread->th.th_sleep_loc != NULL) {
1857 __kmp_null_resume_wrapper(thread);
1858 break; // awake one thread at a time
1859 }
1860 }
1861 }
1862 return TASK_CURRENT_NOT_QUEUED;
1863}
1864
1865// __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1866// non-thread-switchable task from the parent thread only!
1867//
1868// loc_ref: location of original task pragma (ignored)
1869// gtid: Global Thread ID of encountering thread
1870// new_task: non-thread-switchable task thunk allocated by
1871// __kmp_omp_task_alloc()
1872// Returns:
1873// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1874// be resumed later.
1875// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1876// resumed later.
1877kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1878 kmp_task_t *new_task) {
1879 kmp_int32 res;
1880 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1881
1882#if KMP_DEBUG || OMPT_SUPPORT
1883 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1884#endif
1885 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1886 new_taskdata));
1887 __kmp_assert_valid_gtid(gtid);
1888
1889#if OMPT_SUPPORT
1890 kmp_taskdata_t *parent = NULL;
1891 if (UNLIKELY(ompt_enabled.enabled)) {
1892 if (!new_taskdata->td_flags.started) {
1893 OMPT_STORE_RETURN_ADDRESS(gtid);
1894 parent = new_taskdata->td_parent;
1895 if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1896 parent->ompt_task_info.frame.enter_frame.ptr =
1897 OMPT_GET_FRAME_ADDRESS(0);
1898 }
1899 if (ompt_enabled.ompt_callback_task_create) {
1900 ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1901 &(parent->ompt_task_info.task_data),
1902 &(parent->ompt_task_info.frame),
1903 &(new_taskdata->ompt_task_info.task_data),
1904 TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1905 OMPT_LOAD_RETURN_ADDRESS(gtid));
1906 }
1907 } else {
1908 // We are scheduling the continuation of an UNTIED task.
1909 // Scheduling back to the parent task.
1910 __ompt_task_finish(new_task,
1911 new_taskdata->ompt_task_info.scheduling_parent,
1912 ompt_task_switch);
1913 new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1914 }
1915 }
1916#endif
1917
1918 res = __kmp_omp_task(gtid, new_task, true);
1919
1920 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1921 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1922 gtid, loc_ref, new_taskdata));
1923#if OMPT_SUPPORT
1924 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1925 parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1926 }
1927#endif
1928 return res;
1929}
1930
1931// __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1932// a taskloop task with the correct OMPT return address
1933//
1934// loc_ref: location of original task pragma (ignored)
1935// gtid: Global Thread ID of encountering thread
1936// new_task: non-thread-switchable task thunk allocated by
1937// __kmp_omp_task_alloc()
1938// codeptr_ra: return address for OMPT callback
1939// Returns:
1940// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1941// be resumed later.
1942// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1943// resumed later.
1944kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1945 kmp_task_t *new_task, void *codeptr_ra) {
1946 kmp_int32 res;
1947 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1948
1949#if KMP_DEBUG || OMPT_SUPPORT
1950 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1951#endif
1952 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1953 new_taskdata));
1954
1955#if OMPT_SUPPORT
1956 kmp_taskdata_t *parent = NULL;
1957 if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1958 parent = new_taskdata->td_parent;
1959 if (!parent->ompt_task_info.frame.enter_frame.ptr)
1960 parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1961 if (ompt_enabled.ompt_callback_task_create) {
1962 ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1963 &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1964 &(new_taskdata->ompt_task_info.task_data),
1965 TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, codeptr_ra);
1966 }
1967 }
1968#endif
1969
1970 res = __kmp_omp_task(gtid, new_task, true);
1971
1972 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1973 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1974 gtid, loc_ref, new_taskdata));
1975#if OMPT_SUPPORT
1976 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1977 parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1978 }
1979#endif
1980 return res;
1981}
1982
1983template <bool ompt>
1984static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
1985 void *frame_address,
1986 void *return_address) {
1987 kmp_taskdata_t *taskdata = nullptr;
1988 kmp_info_t *thread;
1989 int thread_finished = FALSE;
1990 KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1991
1992 KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
1993 KMP_DEBUG_ASSERT(gtid >= 0);
1994
1995 if (__kmp_tasking_mode != tskm_immediate_exec) {
1996 thread = __kmp_threads[gtid];
1997 taskdata = thread->th.th_current_task;
1998
1999#if OMPT_SUPPORT && OMPT_OPTIONAL
2000 ompt_data_t *my_task_data;
2001 ompt_data_t *my_parallel_data;
2002
2003 if (ompt) {
2004 my_task_data = &(taskdata->ompt_task_info.task_data);
2005 my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2006
2007 taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2008
2009 if (ompt_enabled.ompt_callback_sync_region) {
2010 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2011 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2012 my_task_data, return_address);
2013 }
2014
2015 if (ompt_enabled.ompt_callback_sync_region_wait) {
2016 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2017 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2018 my_task_data, return_address);
2019 }
2020 }
2021#endif // OMPT_SUPPORT && OMPT_OPTIONAL
2022
2023// Debugger: The taskwait is active. Store location and thread encountered the
2024// taskwait.
2025#if USE_ITT_BUILD
2026// Note: These values are used by ITT events as well.
2027#endif /* USE_ITT_BUILD */
2028 taskdata->td_taskwait_counter += 1;
2029 taskdata->td_taskwait_ident = loc_ref;
2030 taskdata->td_taskwait_thread = gtid + 1;
2031
2032#if USE_ITT_BUILD
2033 void *itt_sync_obj = NULL;
2034#if USE_ITT_NOTIFY
2035 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2036#endif /* USE_ITT_NOTIFY */
2037#endif /* USE_ITT_BUILD */
2038
2039 bool must_wait =
2040 !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2041
2042 must_wait = must_wait || (thread->th.th_task_team != NULL &&
2043 thread->th.th_task_team->tt.tt_found_proxy_tasks);
2044 // If hidden helper thread is encountered, we must enable wait here.
2045 must_wait =
2046 must_wait ||
2047 (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2048 thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2049
2050 if (must_wait) {
2051 kmp_flag_32<false, false> flag(
2052 RCAST(std::atomic<kmp_uint32> *,
2053 &(taskdata->td_incomplete_child_tasks)),
2054 0U);
2055 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
2056 flag.execute_tasks(thread, gtid, FALSE,
2057 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2058 __kmp_task_stealing_constraint);
2059 }
2060 }
2061#if USE_ITT_BUILD
2062 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2063 KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2064#endif /* USE_ITT_BUILD */
2065
2066 // Debugger: The taskwait is completed. Location remains, but thread is
2067 // negated.
2068 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2069
2070#if OMPT_SUPPORT && OMPT_OPTIONAL
2071 if (ompt) {
2072 if (ompt_enabled.ompt_callback_sync_region_wait) {
2073 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2074 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2075 my_task_data, return_address);
2076 }
2077 if (ompt_enabled.ompt_callback_sync_region) {
2078 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2079 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2080 my_task_data, return_address);
2081 }
2082 taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2083 }
2084#endif // OMPT_SUPPORT && OMPT_OPTIONAL
2085 }
2086
2087 KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2088 "returning TASK_CURRENT_NOT_QUEUED\n",
2089 gtid, taskdata));
2090
2091 return TASK_CURRENT_NOT_QUEUED;
2092}
2093
#if OMPT_SUPPORT && OMPT_OPTIONAL
// OMPT flavour of taskwait: forwards to the <true> instantiation of the
// template with the frame and return address captured by the caller.
// Declared OMPT_NOINLINE.
OMPT_NOINLINE
static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                          void *frame_address,
                                          void *return_address) {
  const kmp_int32 status = __kmpc_omp_taskwait_template<true>(
      loc_ref, gtid, frame_address, return_address);
  return status;
}
#endif // OMPT_SUPPORT && OMPT_OPTIONAL
2103
2104// __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2105// complete
2106kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2107#if OMPT_SUPPORT && OMPT_OPTIONAL
2108 if (UNLIKELY(ompt_enabled.enabled)) {
2109 OMPT_STORE_RETURN_ADDRESS(gtid);
2110 return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2111 OMPT_LOAD_RETURN_ADDRESS(gtid));
2112 }
2113#endif
2114 return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2115}
2116
2117// __kmpc_omp_taskyield: switch to a different task
2118kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2119 kmp_taskdata_t *taskdata = NULL;
2120 kmp_info_t *thread;
2121 int thread_finished = FALSE;
2122
2123 KMP_COUNT_BLOCK(OMP_TASKYIELD);
2124 KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2125
2126 KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2127 gtid, loc_ref, end_part));
2128 __kmp_assert_valid_gtid(gtid);
2129
2130 if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2131 thread = __kmp_threads[gtid];
2132 taskdata = thread->th.th_current_task;
2133// Should we model this as a task wait or not?
2134// Debugger: The taskwait is active. Store location and thread encountered the
2135// taskwait.
2136#if USE_ITT_BUILD
2137// Note: These values are used by ITT events as well.
2138#endif /* USE_ITT_BUILD */
2139 taskdata->td_taskwait_counter += 1;
2140 taskdata->td_taskwait_ident = loc_ref;
2141 taskdata->td_taskwait_thread = gtid + 1;
2142
2143#if USE_ITT_BUILD
2144 void *itt_sync_obj = NULL;
2145#if USE_ITT_NOTIFY
2146 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2147#endif /* USE_ITT_NOTIFY */
2148#endif /* USE_ITT_BUILD */
2149 if (!taskdata->td_flags.team_serial) {
2150 kmp_task_team_t *task_team = thread->th.th_task_team;
2151 if (task_team != NULL) {
2152 if (KMP_TASKING_ENABLED(task_team)) {
2153#if OMPT_SUPPORT
2154 if (UNLIKELY(ompt_enabled.enabled))
2155 thread->th.ompt_thread_info.ompt_task_yielded = 1;
2156#endif
2157 __kmp_execute_tasks_32(
2158 thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2159 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2160 __kmp_task_stealing_constraint);
2161#if OMPT_SUPPORT
2162 if (UNLIKELY(ompt_enabled.enabled))
2163 thread->th.ompt_thread_info.ompt_task_yielded = 0;
2164#endif
2165 }
2166 }
2167 }
2168#if USE_ITT_BUILD
2169 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2170#endif /* USE_ITT_BUILD */
2171
2172 // Debugger: The taskwait is completed. Location remains, but thread is
2173 // negated.
2174 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2175 }
2176
2177 KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2178 "returning TASK_CURRENT_NOT_QUEUED\n",
2179 gtid, taskdata));
2180
2181 return TASK_CURRENT_NOT_QUEUED;
2182}
2183
2184// Task Reduction implementation
2185//
2186// Note: initial implementation didn't take into account the possibility
2187// to specify omp_orig for initializer of the UDR (user defined reduction).
2188// Corrected implementation takes into account the omp_orig object.
2189// Compiler is free to use old implementation if omp_orig is not specified.
2190
2199typedef struct kmp_taskred_flags {
2201 unsigned lazy_priv : 1;
2202 unsigned reserved31 : 31;
2204
2208typedef struct kmp_task_red_input {
2211 // three compiler-generated routines (init, fini are optional):
2217
2221typedef struct kmp_taskred_data {
2227 // three compiler-generated routines (init, fini are optional):
2233
2239typedef struct kmp_taskred_input {
2243 // three compiler-generated routines (init, fini are optional):
2253template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2254template <>
2255void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2256 kmp_task_red_input_t &src) {
2257 item.reduce_orig = NULL;
2258}
2259template <>
2260void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2261 kmp_taskred_input_t &src) {
2262 if (src.reduce_orig != NULL) {
2263 item.reduce_orig = src.reduce_orig;
2264 } else {
2265 item.reduce_orig = src.reduce_shar;
2266 } // non-NULL reduce_orig means new interface used
2267}
2268
2269template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2270template <>
2271void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2272 size_t offset) {
2273 ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2274}
2275template <>
2276void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2277 size_t offset) {
2278 ((void (*)(void *, void *))item.reduce_init)(
2279 (char *)(item.reduce_priv) + offset, item.reduce_orig);
2280}
2281
2282template <typename T>
2283void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2284 __kmp_assert_valid_gtid(gtid);
2285 kmp_info_t *thread = __kmp_threads[gtid];
2286 kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2287 kmp_uint32 nth = thread->th.th_team_nproc;
2288 kmp_taskred_data_t *arr;
2289
2290 // check input data just in case
2291 KMP_ASSERT(tg != NULL);
2292 KMP_ASSERT(data != NULL);
2293 KMP_ASSERT(num > 0);
2294 if (nth == 1 && !__kmp_enable_hidden_helper) {
2295 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2296 gtid, tg));
2297 return (void *)tg;
2298 }
2299 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2300 gtid, tg, num));
2301 arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2302 thread, num * sizeof(kmp_taskred_data_t));
2303 for (int i = 0; i < num; ++i) {
2304 size_t size = data[i].reduce_size - 1;
2305 // round the size up to cache line per thread-specific item
2306 size += CACHE_LINE - size % CACHE_LINE;
2307 KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2308 arr[i].reduce_shar = data[i].reduce_shar;
2309 arr[i].reduce_size = size;
2310 arr[i].flags = data[i].flags;
2311 arr[i].reduce_comb = data[i].reduce_comb;
2312 arr[i].reduce_init = data[i].reduce_init;
2313 arr[i].reduce_fini = data[i].reduce_fini;
2314 __kmp_assign_orig<T>(arr[i], data[i]);
2315 if (!arr[i].flags.lazy_priv) {
2316 // allocate cache-line aligned block and fill it with zeros
2317 arr[i].reduce_priv = __kmp_allocate(nth * size);
2318 arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2319 if (arr[i].reduce_init != NULL) {
2320 // initialize all thread-specific items
2321 for (size_t j = 0; j < nth; ++j) {
2322 __kmp_call_init<T>(arr[i], j * size);
2323 }
2324 }
2325 } else {
2326 // only allocate space for pointers now,
2327 // objects will be lazily allocated/initialized if/when requested
2328 // note that __kmp_allocate zeroes the allocated memory
2329 arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2330 }
2331 }
2332 tg->reduce_data = (void *)arr;
2333 tg->reduce_num_data = num;
2334 return (void *)tg;
2335}
2336
2351void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2352#if OMPX_TASKGRAPH
2353 kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2354 if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2355 kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2356 this_tdg->rec_taskred_data =
2357 __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2358 this_tdg->rec_num_taskred = num;
2359 KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2360 sizeof(kmp_task_red_input_t) * num);
2361 }
2362#endif
2363 return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2364}
2365
2378void *__kmpc_taskred_init(int gtid, int num, void *data) {
2379#if OMPX_TASKGRAPH
2380 kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2381 if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2382 kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2383 this_tdg->rec_taskred_data =
2384 __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2385 this_tdg->rec_num_taskred = num;
2386 KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2387 sizeof(kmp_task_red_input_t) * num);
2388 }
2389#endif
2390 return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2391}
2392
2393// Copy task reduction data (except for shared pointers).
2394template <typename T>
2395void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2396 kmp_taskgroup_t *tg, void *reduce_data) {
2397 kmp_taskred_data_t *arr;
2398 KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2399 " from data %p\n",
2400 thr, tg, reduce_data));
2401 arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2402 thr, num * sizeof(kmp_taskred_data_t));
2403 // threads will share private copies, thunk routines, sizes, flags, etc.:
2404 KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2405 for (int i = 0; i < num; ++i) {
2406 arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2407 }
2408 tg->reduce_data = (void *)arr;
2409 tg->reduce_num_data = num;
2410}
2411
2421void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2422 __kmp_assert_valid_gtid(gtid);
2423 kmp_info_t *thread = __kmp_threads[gtid];
2424 kmp_int32 nth = thread->th.th_team_nproc;
2425 if (nth == 1)
2426 return data; // nothing to do
2427
2428 kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2429 if (tg == NULL)
2430 tg = thread->th.th_current_task->td_taskgroup;
2431 KMP_ASSERT(tg != NULL);
2432 kmp_taskred_data_t *arr;
2433 kmp_int32 num;
2434 kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2435
2436#if OMPX_TASKGRAPH
2437 if ((thread->th.th_current_task->is_taskgraph) &&
2438 (!__kmp_tdg_is_recording(
2439 __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
2440 tg = thread->th.th_current_task->td_taskgroup;
2441 KMP_ASSERT(tg != NULL);
2442 KMP_ASSERT(tg->reduce_data != NULL);
2443 arr = (kmp_taskred_data_t *)(tg->reduce_data);
2444 num = tg->reduce_num_data;
2445 }
2446#endif
2447
2448 KMP_ASSERT(data != NULL);
2449 while (tg != NULL) {
2450 arr = (kmp_taskred_data_t *)(tg->reduce_data);
2451 num = tg->reduce_num_data;
2452 for (int i = 0; i < num; ++i) {
2453 if (!arr[i].flags.lazy_priv) {
2454 if (data == arr[i].reduce_shar ||
2455 (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2456 return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2457 } else {
2458 // check shared location first
2459 void **p_priv = (void **)(arr[i].reduce_priv);
2460 if (data == arr[i].reduce_shar)
2461 goto found;
2462 // check if we get some thread specific location as parameter
2463 for (int j = 0; j < nth; ++j)
2464 if (data == p_priv[j])
2465 goto found;
2466 continue; // not found, continue search
2467 found:
2468 if (p_priv[tid] == NULL) {
2469 // allocate thread specific object lazily
2470 p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2471 if (arr[i].reduce_init != NULL) {
2472 if (arr[i].reduce_orig != NULL) { // new interface
2473 ((void (*)(void *, void *))arr[i].reduce_init)(
2474 p_priv[tid], arr[i].reduce_orig);
2475 } else { // old interface (single parameter)
2476 ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2477 }
2478 }
2479 }
2480 return p_priv[tid];
2481 }
2482 }
2483 KMP_ASSERT(tg->parent);
2484 tg = tg->parent;
2485 }
2486 KMP_ASSERT2(0, "Unknown task reduction item");
2487 return NULL; // ERROR, this line never executed
2488}
2489
2490// Finalize task reduction.
2491// Called from __kmpc_end_taskgroup()
2492static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2493 kmp_int32 nth = th->th.th_team_nproc;
2494 KMP_DEBUG_ASSERT(
2495 nth > 1 ||
2496 __kmp_enable_hidden_helper); // should not be called if nth == 1 unless we
2497 // are using hidden helper threads
2498 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2499 kmp_int32 num = tg->reduce_num_data;
2500 for (int i = 0; i < num; ++i) {
2501 void *sh_data = arr[i].reduce_shar;
2502 void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2503 void (*f_comb)(void *, void *) =
2504 (void (*)(void *, void *))(arr[i].reduce_comb);
2505 if (!arr[i].flags.lazy_priv) {
2506 void *pr_data = arr[i].reduce_priv;
2507 size_t size = arr[i].reduce_size;
2508 for (int j = 0; j < nth; ++j) {
2509 void *priv_data = (char *)pr_data + j * size;
2510 f_comb(sh_data, priv_data); // combine results
2511 if (f_fini)
2512 f_fini(priv_data); // finalize if needed
2513 }
2514 } else {
2515 void **pr_data = (void **)(arr[i].reduce_priv);
2516 for (int j = 0; j < nth; ++j) {
2517 if (pr_data[j] != NULL) {
2518 f_comb(sh_data, pr_data[j]); // combine results
2519 if (f_fini)
2520 f_fini(pr_data[j]); // finalize if needed
2521 __kmp_free(pr_data[j]);
2522 }
2523 }
2524 }
2525 __kmp_free(arr[i].reduce_priv);
2526 }
2527 __kmp_thread_free(th, arr);
2528 tg->reduce_data = NULL;
2529 tg->reduce_num_data = 0;
2530}
2531
2532// Cleanup task reduction data for parallel or worksharing,
2533// do not touch task private data other threads still working with.
2534// Called from __kmpc_end_taskgroup()
2535static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2536 __kmp_thread_free(th, tg->reduce_data);
2537 tg->reduce_data = NULL;
2538 tg->reduce_num_data = 0;
2539}
2540
2541template <typename T>
2542void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2543 int num, T *data) {
2544 __kmp_assert_valid_gtid(gtid);
2545 kmp_info_t *thr = __kmp_threads[gtid];
2546 kmp_int32 nth = thr->th.th_team_nproc;
2547 __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2548 if (nth == 1) {
2549 KA_TRACE(10,
2550 ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2551 gtid, thr->th.th_current_task->td_taskgroup));
2552 return (void *)thr->th.th_current_task->td_taskgroup;
2553 }
2554 kmp_team_t *team = thr->th.th_team;
2555 void *reduce_data;
2556 kmp_taskgroup_t *tg;
2557 reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2558 if (reduce_data == NULL &&
2559 __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2560 (void *)1)) {
2561 // single thread enters this block to initialize common reduction data
2562 KMP_DEBUG_ASSERT(reduce_data == NULL);
2563 // first initialize own data, then make a copy other threads can use
2564 tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2565 reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2566 KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2567 // fini counters should be 0 at this point
2568 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2569 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2570 KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2571 } else {
2572 while (
2573 (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2574 (void *)1) { // wait for task reduction initialization
2575 KMP_CPU_PAUSE();
2576 }
2577 KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2578 tg = thr->th.th_current_task->td_taskgroup;
2579 __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2580 }
2581 return tg;
2582}
2583
2600void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2601 int num, void *data) {
2602 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2603 (kmp_task_red_input_t *)data);
2604}
2605
2620void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2621 void *data) {
2622 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2623 (kmp_taskred_input_t *)data);
2624}
2625
2634void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2635 __kmpc_end_taskgroup(loc, gtid);
2636}
2637
2638// __kmpc_taskgroup: Start a new taskgroup
2639void __kmpc_taskgroup(ident_t *loc, int gtid) {
2640 __kmp_assert_valid_gtid(gtid);
2641 kmp_info_t *thread = __kmp_threads[gtid];
2642 kmp_taskdata_t *taskdata = thread->th.th_current_task;
2643 kmp_taskgroup_t *tg_new =
2644 (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2645 KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2646 KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2647 KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2648 tg_new->parent = taskdata->td_taskgroup;
2649 tg_new->reduce_data = NULL;
2650 tg_new->reduce_num_data = 0;
2651 tg_new->gomp_data = NULL;
2652 taskdata->td_taskgroup = tg_new;
2653
2654#if OMPT_SUPPORT && OMPT_OPTIONAL
2655 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2656 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2657 if (!codeptr)
2658 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2659 kmp_team_t *team = thread->th.th_team;
2660 ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2661 // FIXME: I think this is wrong for lwt!
2662 ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2663
2664 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2665 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2666 &(my_task_data), codeptr);
2667 }
2668#endif
2669}
2670
2671// __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2672// and its descendants are complete
2673void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2674 __kmp_assert_valid_gtid(gtid);
2675 kmp_info_t *thread = __kmp_threads[gtid];
2676 kmp_taskdata_t *taskdata = thread->th.th_current_task;
2677 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2678 int thread_finished = FALSE;
2679
2680#if OMPT_SUPPORT && OMPT_OPTIONAL
2681 kmp_team_t *team;
2682 ompt_data_t my_task_data;
2683 ompt_data_t my_parallel_data;
2684 void *codeptr = nullptr;
2685 if (UNLIKELY(ompt_enabled.enabled)) {
2686 team = thread->th.th_team;
2687 my_task_data = taskdata->ompt_task_info.task_data;
2688 // FIXME: I think this is wrong for lwt!
2689 my_parallel_data = team->t.ompt_team_info.parallel_data;
2690 codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2691 if (!codeptr)
2692 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2693 }
2694#endif
2695
2696 KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2697 KMP_DEBUG_ASSERT(taskgroup != NULL);
2698 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2699
2700 if (__kmp_tasking_mode != tskm_immediate_exec) {
2701 // mark task as waiting not on a barrier
2702 taskdata->td_taskwait_counter += 1;
2703 taskdata->td_taskwait_ident = loc;
2704 taskdata->td_taskwait_thread = gtid + 1;
2705#if USE_ITT_BUILD
2706 // For ITT the taskgroup wait is similar to taskwait until we need to
2707 // distinguish them
2708 void *itt_sync_obj = NULL;
2709#if USE_ITT_NOTIFY
2710 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2711#endif /* USE_ITT_NOTIFY */
2712#endif /* USE_ITT_BUILD */
2713
2714#if OMPT_SUPPORT && OMPT_OPTIONAL
2715 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2716 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2717 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2718 &(my_task_data), codeptr);
2719 }
2720#endif
2721
2722 if (!taskdata->td_flags.team_serial ||
2723 (thread->th.th_task_team != NULL &&
2724 (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
2725 thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2726 kmp_flag_32<false, false> flag(
2727 RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2728 while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2729 flag.execute_tasks(thread, gtid, FALSE,
2730 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2731 __kmp_task_stealing_constraint);
2732 }
2733 }
2734 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2735
2736#if OMPT_SUPPORT && OMPT_OPTIONAL
2737 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2738 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2739 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2740 &(my_task_data), codeptr);
2741 }
2742#endif
2743
2744#if USE_ITT_BUILD
2745 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2746 KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2747#endif /* USE_ITT_BUILD */
2748 }
2749 KMP_DEBUG_ASSERT(taskgroup->count == 0);
2750
2751 if (taskgroup->reduce_data != NULL &&
2752 !taskgroup->gomp_data) { // need to reduce?
2753 int cnt;
2754 void *reduce_data;
2755 kmp_team_t *t = thread->th.th_team;
2756 kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2757 // check if <priv> data of the first reduction variable shared for the team
2758 void *priv0 = arr[0].reduce_priv;
2759 if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2760 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2761 // finishing task reduction on parallel
2762 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2763 if (cnt == thread->th.th_team_nproc - 1) {
2764 // we are the last thread passing __kmpc_reduction_modifier_fini()
2765 // finalize task reduction:
2766 __kmp_task_reduction_fini(thread, taskgroup);
2767 // cleanup fields in the team structure:
2768 // TODO: is relaxed store enough here (whole barrier should follow)?
2769 __kmp_thread_free(thread, reduce_data);
2770 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2771 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2772 } else {
2773 // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2774 // so do not finalize reduction, just clean own copy of the data
2775 __kmp_task_reduction_clean(thread, taskgroup);
2776 }
2777 } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2778 NULL &&
2779 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2780 // finishing task reduction on worksharing
2781 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2782 if (cnt == thread->th.th_team_nproc - 1) {
2783 // we are the last thread passing __kmpc_reduction_modifier_fini()
2784 __kmp_task_reduction_fini(thread, taskgroup);
2785 // cleanup fields in team structure:
2786 // TODO: is relaxed store enough here (whole barrier should follow)?
2787 __kmp_thread_free(thread, reduce_data);
2788 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2789 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2790 } else {
2791 // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2792 // so do not finalize reduction, just clean own copy of the data
2793 __kmp_task_reduction_clean(thread, taskgroup);
2794 }
2795 } else {
2796 // finishing task reduction on taskgroup
2797 __kmp_task_reduction_fini(thread, taskgroup);
2798 }
2799 }
2800 // Restore parent taskgroup for the current task
2801 taskdata->td_taskgroup = taskgroup->parent;
2802 __kmp_thread_free(thread, taskgroup);
2803
2804 KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2805 gtid, taskdata));
2806
2807#if OMPT_SUPPORT && OMPT_OPTIONAL
2808 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2809 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2810 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2811 &(my_task_data), codeptr);
2812 }
2813#endif
2814}
2815
2816static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
2817 kmp_task_team_t *task_team,
2818 kmp_int32 is_constrained) {
2819 kmp_task_t *task = NULL;
2820 kmp_taskdata_t *taskdata;
2821 kmp_taskdata_t *current;
2822 kmp_thread_data_t *thread_data;
2823 int ntasks = task_team->tt.tt_num_task_pri;
2824 if (ntasks == 0) {
2825 KA_TRACE(
2826 20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
2827 return NULL;
2828 }
2829 do {
2830 // decrement num_tasks to "reserve" one task to get for execution
2831 if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
2832 ntasks - 1))
2833 break;
2834 ntasks = task_team->tt.tt_num_task_pri;
2835 } while (ntasks > 0);
2836 if (ntasks == 0) {
2837 KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
2838 __kmp_get_gtid()));
2839 return NULL;
2840 }
2841 // We got a "ticket" to get a "reserved" priority task
2842 int deque_ntasks;
2843 kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
2844 do {
2845 KMP_ASSERT(list != NULL);
2846 thread_data = &list->td;
2847 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2848 deque_ntasks = thread_data->td.td_deque_ntasks;
2849 if (deque_ntasks == 0) {
2850 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2851 KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
2852 __kmp_get_gtid(), thread_data));
2853 list = list->next;
2854 }
2855 } while (deque_ntasks == 0);
2856 KMP_DEBUG_ASSERT(deque_ntasks);
2857 int target = thread_data->td.td_deque_head;
2858 current = __kmp_threads[gtid]->th.th_current_task;
2859 taskdata = thread_data->td.td_deque[target];
2860 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2861 // Bump head pointer and Wrap.
2862 thread_data->td.td_deque_head =
2863 (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2864 } else {
2865 if (!task_team->tt.tt_untied_task_encountered) {
2866 // The TSC does not allow to steal victim task
2867 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2868 KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
2869 "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
2870 gtid, thread_data, task_team, deque_ntasks, target,
2871 thread_data->td.td_deque_tail));
2872 task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2873 return NULL;
2874 }
2875 int i;
2876 // walk through the deque trying to steal any task
2877 taskdata = NULL;
2878 for (i = 1; i < deque_ntasks; ++i) {
2879 target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2880 taskdata = thread_data->td.td_deque[target];
2881 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2882 break; // found task to execute
2883 } else {
2884 taskdata = NULL;
2885 }
2886 }
2887 if (taskdata == NULL) {
2888 // No appropriate candidate found to execute
2889 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2890 KA_TRACE(
2891 10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
2892 "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
2893 gtid, thread_data, task_team, deque_ntasks,
2894 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2895 task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2896 return NULL;
2897 }
2898 int prev = target;
2899 for (i = i + 1; i < deque_ntasks; ++i) {
2900 // shift remaining tasks in the deque left by 1
2901 target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2902 thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
2903 prev = target;
2904 }
2905 KMP_DEBUG_ASSERT(
2906 thread_data->td.td_deque_tail ==
2907 (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
2908 thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped))
2909 }
2910 thread_data->td.td_deque_ntasks = deque_ntasks - 1;
2911 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2912 task = KMP_TASKDATA_TO_TASK(taskdata);
2913 return task;
2914}
2915
2916// __kmp_remove_my_task: remove a task from my own deque
2917static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2918 kmp_task_team_t *task_team,
2919 kmp_int32 is_constrained) {
2920 kmp_task_t *task;
2921 kmp_taskdata_t *taskdata;
2922 kmp_thread_data_t *thread_data;
2923 kmp_uint32 tail;
2924
2925 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2926 KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2927 NULL); // Caller should check this condition
2928
2929 thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2930
2931 KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2932 gtid, thread_data->td.td_deque_ntasks,
2933 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2934
2935 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2936 KA_TRACE(10,
2937 ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2938 "ntasks=%d head=%u tail=%u\n",
2939 gtid, thread_data->td.td_deque_ntasks,
2940 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2941 return NULL;
2942 }
2943
2944 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2945
2946 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2947 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2948 KA_TRACE(10,
2949 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2950 "ntasks=%d head=%u tail=%u\n",
2951 gtid, thread_data->td.td_deque_ntasks,
2952 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2953 return NULL;
2954 }
2955
2956 tail = (thread_data->td.td_deque_tail - 1) &
2957 TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2958 taskdata = thread_data->td.td_deque[tail];
2959
2960 if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2961 thread->th.th_current_task)) {
2962 // The TSC does not allow to steal victim task
2963 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2964 KA_TRACE(10,
2965 ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2966 "ntasks=%d head=%u tail=%u\n",
2967 gtid, thread_data->td.td_deque_ntasks,
2968 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2969 return NULL;
2970 }
2971
2972 thread_data->td.td_deque_tail = tail;
2973 TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2974
2975 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2976
2977 KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
2978 "ntasks=%d head=%u tail=%u\n",
2979 gtid, taskdata, thread_data->td.td_deque_ntasks,
2980 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2981
2982 task = KMP_TASKDATA_TO_TASK(taskdata);
2983 return task;
2984}
2985
2986// __kmp_steal_task: remove a task from another thread's deque
2987// Assume that calling thread has already checked existence of
2988// task_team thread_data before calling this routine.
2989static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid,
2990 kmp_task_team_t *task_team,
2991 std::atomic<kmp_int32> *unfinished_threads,
2992 int *thread_finished,
2993 kmp_int32 is_constrained) {
2994 kmp_task_t *task;
2995 kmp_taskdata_t *taskdata;
2996 kmp_taskdata_t *current;
2997 kmp_thread_data_t *victim_td, *threads_data;
2998 kmp_int32 target;
2999 kmp_info_t *victim_thr;
3000
3001 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3002
3003 threads_data = task_team->tt.tt_threads_data;
3004 KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3005 KMP_DEBUG_ASSERT(victim_tid >= 0);
3006 KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_max_threads);
3007
3008 victim_td = &threads_data[victim_tid];
3009 victim_thr = victim_td->td.td_thr;
3010 (void)victim_thr; // Use in TRACE messages which aren't always enabled.
3011
3012 KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3013 "task_team=%p ntasks=%d head=%u tail=%u\n",
3014 gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3015 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3016 victim_td->td.td_deque_tail));
3017
3018 if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
3019 KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3020 "task_team=%p ntasks=%d head=%u tail=%u\n",
3021 gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3022 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3023 victim_td->td.td_deque_tail));
3024 return NULL;
3025 }
3026
3027 __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
3028
3029 int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3030 // Check again after we acquire the lock
3031 if (ntasks == 0) {
3032 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3033 KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3034 "task_team=%p ntasks=%d head=%u tail=%u\n",
3035 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3036 victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3037 return NULL;
3038 }
3039
3040 KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3041 current = __kmp_threads[gtid]->th.th_current_task;
3042 taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3043 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3044 // Bump head pointer and Wrap.
3045 victim_td->td.td_deque_head =
3046 (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
3047 } else {
3048 if (!task_team->tt.tt_untied_task_encountered) {
3049 // The TSC does not allow to steal victim task
3050 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3051 KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3052 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3053 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3054 victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3055 return NULL;
3056 }
3057 int i;
3058 // walk through victim's deque trying to steal any task
3059 target = victim_td->td.td_deque_head;
3060 taskdata = NULL;
3061 for (i = 1; i < ntasks; ++i) {
3062 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3063 taskdata = victim_td->td.td_deque[target];
3064 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3065 break; // found victim task
3066 } else {
3067 taskdata = NULL;
3068 }
3069 }
3070 if (taskdata == NULL) {
3071 // No appropriate candidate to steal found
3072 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3073 KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3074 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3075 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3076 victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3077 return NULL;
3078 }
3079 int prev = target;
3080 for (i = i + 1; i < ntasks; ++i) {
3081 // shift remaining tasks in the deque left by 1
3082 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3083 victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3084 prev = target;
3085 }
3086 KMP_DEBUG_ASSERT(
3087 victim_td->td.td_deque_tail ==
3088 (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
3089 victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped))
3090 }
3091 if (*thread_finished) {
3092 // We need to un-mark this victim as a finished victim. This must be done
3093 // before releasing the lock, or else other threads (starting with the
3094 // primary thread victim) might be prematurely released from the barrier!!!
3095#if KMP_DEBUG
3096 kmp_int32 count =
3097#endif
3098 KMP_ATOMIC_INC(unfinished_threads);
3099 KA_TRACE(
3100 20,
3101 ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3102 gtid, count + 1, task_team));
3103 *thread_finished = FALSE;
3104 }
3105 TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
3106
3107 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3108
3109 KMP_COUNT_BLOCK(TASK_stolen);
3110 KA_TRACE(10,
3111 ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3112 "task_team=%p ntasks=%d head=%u tail=%u\n",
3113 gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3114 ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3115
3116 task = KMP_TASKDATA_TO_TASK(taskdata);
3117 return task;
3118}
3119
3120// __kmp_execute_tasks_template: Choose and execute tasks until either the
3121// condition is statisfied (return true) or there are none left (return false).
3122//
3123// final_spin is TRUE if this is the spin at the release barrier.
3124// thread_finished indicates whether the thread is finished executing all
3125// the tasks it has on its deque, and is at the release barrier.
3126// spinner is the location on which to spin.
3127// spinner == NULL means only execute a single task and return.
3128// checker is the value to check to terminate the spin.
3129template <class C>
3130static inline int __kmp_execute_tasks_template(
3131 kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
3132 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3133 kmp_int32 is_constrained) {
3134 kmp_task_team_t *task_team = thread->th.th_task_team;
3135 kmp_thread_data_t *threads_data;
3136 kmp_task_t *task;
3137 kmp_info_t *other_thread;
3138 kmp_taskdata_t *current_task = thread->th.th_current_task;
3139 std::atomic<kmp_int32> *unfinished_threads;
3140 kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
3141 tid = thread->th.th_info.ds.ds_tid;
3142
3143 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3144 KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3145
3146 if (task_team == NULL || current_task == NULL)
3147 return FALSE;
3148
3149 KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3150 "*thread_finished=%d\n",
3151 gtid, final_spin, *thread_finished));
3152
3153 thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3154 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3155
3156 KMP_DEBUG_ASSERT(threads_data != NULL);
3157
3158 nthreads = task_team->tt.tt_nproc;
3159 unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3160 KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
3161
3162 while (1) { // Outer loop keeps trying to find tasks in case of single thread
3163 // getting tasks from target constructs
3164 while (1) { // Inner loop to find a task and execute it
3165 task = NULL;
3166 if (task_team->tt.tt_num_task_pri) { // get priority task first
3167 task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3168 }
3169 if (task == NULL && use_own_tasks) { // check own queue next
3170 task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3171 }
3172 if ((task == NULL) && (nthreads > 1)) { // Steal a task finally
3173 int asleep = 1;
3174 use_own_tasks = 0;
3175 // Try to steal from the last place I stole from successfully.
3176 if (victim_tid == -2) { // haven't stolen anything yet
3177 victim_tid = threads_data[tid].td.td_deque_last_stolen;
3178 if (victim_tid !=
3179 -1) // if we have a last stolen from victim, get the thread
3180 other_thread = threads_data[victim_tid].td.td_thr;
3181 }
3182 if (victim_tid != -1) { // found last victim
3183 asleep = 0;
3184 } else if (!new_victim) { // no recent steals and we haven't already
3185 // used a new victim; select a random thread
3186 do { // Find a different thread to steal work from.
3187 // Pick a random thread. Initial plan was to cycle through all the
3188 // threads, and only return if we tried to steal from every thread,
3189 // and failed. Arch says that's not such a great idea.
3190 victim_tid = __kmp_get_random(thread) % (nthreads - 1);
3191 if (victim_tid >= tid) {
3192 ++victim_tid; // Adjusts random distribution to exclude self
3193 }
3194 // Found a potential victim
3195 other_thread = threads_data[victim_tid].td.td_thr;
3196 // There is a slight chance that __kmp_enable_tasking() did not wake
3197 // up all threads waiting at the barrier. If victim is sleeping,
3198 // then wake it up. Since we were going to pay the cache miss
3199 // penalty for referencing another thread's kmp_info_t struct
3200 // anyway,
3201 // the check shouldn't cost too much performance at this point. In
3202 // extra barrier mode, tasks do not sleep at the separate tasking
3203 // barrier, so this isn't a problem.
3204 asleep = 0;
3205 if ((__kmp_tasking_mode == tskm_task_teams) &&
3206 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
3207 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3208 NULL)) {
3209 asleep = 1;
3210 __kmp_null_resume_wrapper(other_thread);
3211 // A sleeping thread should not have any tasks on it's queue.
3212 // There is a slight possibility that it resumes, steals a task
3213 // from another thread, which spawns more tasks, all in the time
3214 // that it takes this thread to check => don't write an assertion
3215 // that the victim's queue is empty. Try stealing from a
3216 // different thread.
3217 }
3218 } while (asleep);
3219 }
3220
3221 if (!asleep) {
3222 // We have a victim to try to steal from
3223 task =
3224 __kmp_steal_task(victim_tid, gtid, task_team, unfinished_threads,
3225 thread_finished, is_constrained);
3226 }
3227 if (task != NULL) { // set last stolen to victim
3228 if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3229 threads_data[tid].td.td_deque_last_stolen = victim_tid;
3230 // The pre-refactored code did not try more than 1 successful new
3231 // vicitm, unless the last one generated more local tasks;
3232 // new_victim keeps track of this
3233 new_victim = 1;
3234 }
3235 } else { // No tasks found; unset last_stolen
3236 KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3237 victim_tid = -2; // no successful victim found
3238 }
3239 }
3240
3241 if (task == NULL)
3242 break; // break out of tasking loop
3243
3244// Found a task; execute it
3245#if USE_ITT_BUILD && USE_ITT_NOTIFY
3246 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3247 if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3248 // get the object reliably
3249 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3250 }
3251 __kmp_itt_task_starting(itt_sync_obj);
3252 }
3253#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3254 __kmp_invoke_task(gtid, task, current_task);
3255#if USE_ITT_BUILD
3256 if (itt_sync_obj != NULL)
3257 __kmp_itt_task_finished(itt_sync_obj);
3258#endif /* USE_ITT_BUILD */
3259 // If this thread is only partway through the barrier and the condition is
3260 // met, then return now, so that the barrier gather/release pattern can
3261 // proceed. If this thread is in the last spin loop in the barrier,
3262 // waiting to be released, we know that the termination condition will not
3263 // be satisfied, so don't waste any cycles checking it.
3264 if (flag == NULL || (!final_spin && flag->done_check())) {
3265 KA_TRACE(
3266 15,
3267 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3268 gtid));
3269 return TRUE;
3270 }
3271 if (thread->th.th_task_team == NULL) {
3272 break;
3273 }
3274 KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3275 // If execution of a stolen task results in more tasks being placed on our
3276 // run queue, reset use_own_tasks
3277 if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3278 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3279 "other tasks, restart\n",
3280 gtid));
3281 use_own_tasks = 1;
3282 new_victim = 0;
3283 }
3284 }
3285
3286 // The task source has been exhausted. If in final spin loop of barrier,
3287 // check if termination condition is satisfied. The work queue may be empty
3288 // but there might be proxy tasks still executing.
3289 if (final_spin &&
3290 KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3291 // First, decrement the #unfinished threads, if that has not already been
3292 // done. This decrement might be to the spin location, and result in the
3293 // termination condition being satisfied.
3294 if (!*thread_finished) {
3295#if KMP_DEBUG
3296 kmp_int32 count = -1 +
3297#endif
3298 KMP_ATOMIC_DEC(unfinished_threads);
3299 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3300 "unfinished_threads to %d task_team=%p\n",
3301 gtid, count, task_team));
3302 *thread_finished = TRUE;
3303 }
3304
3305 // It is now unsafe to reference thread->th.th_team !!!
3306 // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3307 // thread to pass through the barrier, where it might reset each thread's
3308 // th.th_team field for the next parallel region. If we can steal more
3309 // work, we know that this has not happened yet.
3310 if (flag != NULL && flag->done_check()) {
3311 KA_TRACE(
3312 15,
3313 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3314 gtid));
3315 return TRUE;
3316 }
3317 }
3318
3319 // If this thread's task team is NULL, primary thread has recognized that
3320 // there are no more tasks; bail out
3321 if (thread->th.th_task_team == NULL) {
3322 KA_TRACE(15,
3323 ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3324 return FALSE;
3325 }
3326
3327 // Check the flag again to see if it has already done in case to be trapped
3328 // into infinite loop when a if0 task depends on a hidden helper task
3329 // outside any parallel region. Detached tasks are not impacted in this case
3330 // because the only thread executing this function has to execute the proxy
3331 // task so it is in another code path that has the same check.
3332 if (flag == NULL || (!final_spin && flag->done_check())) {
3333 KA_TRACE(15,
3334 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3335 gtid));
3336 return TRUE;
3337 }
3338
3339 // We could be getting tasks from target constructs; if this is the only
3340 // thread, keep trying to execute tasks from own queue
3341 if (nthreads == 1 &&
3342 KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3343 use_own_tasks = 1;
3344 else {
3345 KA_TRACE(15,
3346 ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3347 return FALSE;
3348 }
3349 }
3350}
3351
3352template <bool C, bool S>
3353int __kmp_execute_tasks_32(
3354 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3355 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3356 kmp_int32 is_constrained) {
3357 return __kmp_execute_tasks_template(
3358 thread, gtid, flag, final_spin,
3359 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3360}
3361
3362template <bool C, bool S>
3363int __kmp_execute_tasks_64(
3364 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3365 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3366 kmp_int32 is_constrained) {
3367 return __kmp_execute_tasks_template(
3368 thread, gtid, flag, final_spin,
3369 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3370}
3371
3372template <bool C, bool S>
3373int __kmp_atomic_execute_tasks_64(
3374 kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
3375 int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3376 kmp_int32 is_constrained) {
3377 return __kmp_execute_tasks_template(
3378 thread, gtid, flag, final_spin,
3379 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3380}
3381
3382int __kmp_execute_tasks_oncore(
3383 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3384 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3385 kmp_int32 is_constrained) {
3386 return __kmp_execute_tasks_template(
3387 thread, gtid, flag, final_spin,
3388 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3389}
3390
3391template int
3392__kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3393 kmp_flag_32<false, false> *, int,
3394 int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3395
3396template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3397 kmp_flag_64<false, true> *,
3398 int,
3399 int *USE_ITT_BUILD_ARG(void *),
3400 kmp_int32);
3401
3402template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3403 kmp_flag_64<true, false> *,
3404 int,
3405 int *USE_ITT_BUILD_ARG(void *),
3406 kmp_int32);
3407
3408template int __kmp_atomic_execute_tasks_64<false, true>(
3409 kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
3410 int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3411
3412template int __kmp_atomic_execute_tasks_64<true, false>(
3413 kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
3414 int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3415
3416// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3417// next barrier so they can assist in executing enqueued tasks.
3418// First thread in allocates the task team atomically.
3419static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3420 kmp_info_t *this_thr) {
3421 kmp_thread_data_t *threads_data;
3422 int nthreads, i, is_init_thread;
3423
3424 KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3425 __kmp_gtid_from_thread(this_thr)));
3426
3427 KMP_DEBUG_ASSERT(task_team != NULL);
3428 KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3429
3430 nthreads = task_team->tt.tt_nproc;
3431 KMP_DEBUG_ASSERT(nthreads > 0);
3432 KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3433
3434 // Allocate or increase the size of threads_data if necessary
3435 is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3436
3437 if (!is_init_thread) {
3438 // Some other thread already set up the array.
3439 KA_TRACE(
3440 20,
3441 ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3442 __kmp_gtid_from_thread(this_thr)));
3443 return;
3444 }
3445 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3446 KMP_DEBUG_ASSERT(threads_data != NULL);
3447
3448 if (__kmp_tasking_mode == tskm_task_teams &&
3449 (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3450 // Release any threads sleeping at the barrier, so that they can steal
3451 // tasks and execute them. In extra barrier mode, tasks do not sleep
3452 // at the separate tasking barrier, so this isn't a problem.
3453 for (i = 0; i < nthreads; i++) {
3454 void *sleep_loc;
3455 kmp_info_t *thread = threads_data[i].td.td_thr;
3456
3457 if (i == this_thr->th.th_info.ds.ds_tid) {
3458 continue;
3459 }
3460 // Since we haven't locked the thread's suspend mutex lock at this
3461 // point, there is a small window where a thread might be putting
3462 // itself to sleep, but hasn't set the th_sleep_loc field yet.
3463 // To work around this, __kmp_execute_tasks_template() periodically checks
3464 // see if other threads are sleeping (using the same random mechanism that
3465 // is used for task stealing) and awakens them if they are.
3466 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3467 NULL) {
3468 KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3469 __kmp_gtid_from_thread(this_thr),
3470 __kmp_gtid_from_thread(thread)));
3471 __kmp_null_resume_wrapper(thread);
3472 } else {
3473 KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3474 __kmp_gtid_from_thread(this_thr),
3475 __kmp_gtid_from_thread(thread)));
3476 }
3477 }
3478 }
3479
3480 KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3481 __kmp_gtid_from_thread(this_thr)));
3482}
3483
/* // TODO: Check the comment consistency
 * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
 * like a shadow of the kmp_team_t data struct, with a different lifetime.
 * After a child thread checks into a barrier and calls __kmp_release() from
 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
 * longer assume that the kmp_team_t structure is intact (at any moment, the
 * primary thread may exit the barrier code and free the team data structure,
 * and return the threads to the thread pool).
3492 *
 * This does not work with the tasking code, as the thread is still
 * expected to participate in the execution of any tasks that may have been
 * spawned by a member of the team, and the thread still needs access to
 * each thread in the team, so that it can steal work from it.
3497 *
 * Enter the existence of the kmp_task_team_t struct. It employs a reference
 * counting mechanism, and is allocated by the primary thread before calling
 * __kmp_<barrier_kind>_release, and then is released by the last thread to
 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
 * of the kmp_task_team_t structs for consecutive barriers can overlap
 * (and will, unless the primary thread is the last thread to exit the barrier
 * release phase, which is not typical). The existence of such a struct is
 * useful outside the context of tasking.
3506 *
 * We currently use the existence of the threads array as an indicator that
 * tasks were spawned since the last barrier. If the structure is to be
 * useful outside the context of tasking, then this will have to change, but
 * not setting the field minimizes the performance impact of tasking on
 * barriers, when no explicit tasks were spawned (pushed, actually).
 */
3513
3514static kmp_task_team_t *__kmp_free_task_teams =
3515 NULL; // Free list for task_team data structures
3516// Lock for task team data structures
3517kmp_bootstrap_lock_t __kmp_task_team_lock =
3518 KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3519
3520// __kmp_alloc_task_deque:
3521// Allocates a task deque for a particular thread, and initialize the necessary
3522// data structures relating to the deque. This only happens once per thread
3523// per task team since task teams are recycled. No lock is needed during
3524// allocation since each thread allocates its own deque.
3525static void __kmp_alloc_task_deque(kmp_info_t *thread,
3526 kmp_thread_data_t *thread_data) {
3527 __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3528 KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3529
3530 // Initialize last stolen task field to "none"
3531 thread_data->td.td_deque_last_stolen = -1;
3532
3533 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3534 KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3535 KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3536
3537 KE_TRACE(
3538 10,
3539 ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3540 __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3541 // Allocate space for task deque, and zero the deque
3542 // Cannot use __kmp_thread_calloc() because threads not around for
3543 // kmp_reap_task_team( ).
3544 thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3545 INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3546 thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3547}
3548
3549// __kmp_free_task_deque:
3550// Deallocates a task deque for a particular thread. Happens at library
3551// deallocation so don't need to reset all thread data fields.
3552static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3553 if (thread_data->td.td_deque != NULL) {
3554 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3555 TCW_4(thread_data->td.td_deque_ntasks, 0);
3556 __kmp_free(thread_data->td.td_deque);
3557 thread_data->td.td_deque = NULL;
3558 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3559 }
3560}
3561
3562// __kmp_realloc_task_threads_data:
3563// Allocates a threads_data array for a task team, either by allocating an
3564// initial array or enlarging an existing array. Only the first thread to get
3565// the lock allocs or enlarges the array and re-initializes the array elements.
3566// That thread returns "TRUE", the rest return "FALSE".
3567// Assumes that the new array size is given by task_team -> tt.tt_nproc.
3568// The current size is given by task_team -> tt.tt_max_threads.
3569static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3570 kmp_task_team_t *task_team) {
3571 kmp_thread_data_t **threads_data_p;
3572 kmp_int32 nthreads, maxthreads;
3573 int is_init_thread = FALSE;
3574
3575 if (TCR_4(task_team->tt.tt_found_tasks)) {
3576 // Already reallocated and initialized.
3577 return FALSE;
3578 }
3579
3580 threads_data_p = &task_team->tt.tt_threads_data;
3581 nthreads = task_team->tt.tt_nproc;
3582 maxthreads = task_team->tt.tt_max_threads;
3583
3584 // All threads must lock when they encounter the first task of the implicit
3585 // task region to make sure threads_data fields are (re)initialized before
3586 // used.
3587 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3588
3589 if (!TCR_4(task_team->tt.tt_found_tasks)) {
3590 // first thread to enable tasking
3591 kmp_team_t *team = thread->th.th_team;
3592 int i;
3593
3594 is_init_thread = TRUE;
3595 if (maxthreads < nthreads) {
3596
3597 if (*threads_data_p != NULL) {
3598 kmp_thread_data_t *old_data = *threads_data_p;
3599 kmp_thread_data_t *new_data = NULL;
3600
3601 KE_TRACE(
3602 10,
3603 ("__kmp_realloc_task_threads_data: T#%d reallocating "
3604 "threads data for task_team %p, new_size = %d, old_size = %d\n",
3605 __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3606 // Reallocate threads_data to have more elements than current array
3607 // Cannot use __kmp_thread_realloc() because threads not around for
3608 // kmp_reap_task_team( ). Note all new array entries are initialized
3609 // to zero by __kmp_allocate().
3610 new_data = (kmp_thread_data_t *)__kmp_allocate(
3611 nthreads * sizeof(kmp_thread_data_t));
3612 // copy old data to new data
3613 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3614 (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3615
3616 // Install the new data and free the old data
3617 (*threads_data_p) = new_data;
3618 __kmp_free(old_data);
3619 } else {
3620 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3621 "threads data for task_team %p, size = %d\n",
3622 __kmp_gtid_from_thread(thread), task_team, nthreads));
3623 // Make the initial allocate for threads_data array, and zero entries
3624 // Cannot use __kmp_thread_calloc() because threads not around for
3625 // kmp_reap_task_team( ).
3626 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3627 nthreads * sizeof(kmp_thread_data_t));
3628 }
3629 task_team->tt.tt_max_threads = nthreads;
3630 } else {
3631 // If array has (more than) enough elements, go ahead and use it
3632 KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3633 }
3634
3635 // initialize threads_data pointers back to thread_info structures
3636 for (i = 0; i < nthreads; i++) {
3637 kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3638 thread_data->td.td_thr = team->t.t_threads[i];
3639
3640 if (thread_data->td.td_deque_last_stolen >= nthreads) {
3641 // The last stolen field survives across teams / barrier, and the number
3642 // of threads may have changed. It's possible (likely?) that a new
3643 // parallel region will exhibit the same behavior as previous region.
3644 thread_data->td.td_deque_last_stolen = -1;
3645 }
3646 }
3647
3648 KMP_MB();
3649 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3650 }
3651
3652 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3653 return is_init_thread;
3654}
3655
3656// __kmp_free_task_threads_data:
3657// Deallocates a threads_data array for a task team, including any attached
3658// tasking deques. Only occurs at library shutdown.
3659static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3660 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3661 if (task_team->tt.tt_threads_data != NULL) {
3662 int i;
3663 for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3664 __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3665 }
3666 __kmp_free(task_team->tt.tt_threads_data);
3667 task_team->tt.tt_threads_data = NULL;
3668 }
3669 __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3670}
3671
3672// __kmp_free_task_pri_list:
3673// Deallocates tasking deques used for priority tasks.
3674// Only occurs at library shutdown.
3675static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
3676 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3677 if (task_team->tt.tt_task_pri_list != NULL) {
3678 kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3679 while (list != NULL) {
3680 kmp_task_pri_t *next = list->next;
3681 __kmp_free_task_deque(&list->td);
3682 __kmp_free(list);
3683 list = next;
3684 }
3685 task_team->tt.tt_task_pri_list = NULL;
3686 }
3687 __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3688}
3689
3690static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
3691 kmp_team_t *team) {
3692 int team_nth = team->t.t_nproc;
3693 // Only need to init if task team is isn't active or team size changed
3694 if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) {
3695 TCW_4(task_team->tt.tt_found_tasks, FALSE);
3696 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3697 TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3698 TCW_4(task_team->tt.tt_nproc, team_nth);
3699 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
3700 TCW_4(task_team->tt.tt_active, TRUE);
3701 }
3702}
3703
3704// __kmp_allocate_task_team:
3705// Allocates a task team associated with a specific team, taking it from
3706// the global task team free list if possible. Also initializes data
3707// structures.
3708static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3709 kmp_team_t *team) {
3710 kmp_task_team_t *task_team = NULL;
3711
3712 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3713 (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3714
3715 if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3716 // Take a task team from the task team pool
3717 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3718 if (__kmp_free_task_teams != NULL) {
3719 task_team = __kmp_free_task_teams;
3720 TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3721 task_team->tt.tt_next = NULL;
3722 }
3723 __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3724 }
3725
3726 if (task_team == NULL) {
3727 KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3728 "task team for team %p\n",
3729 __kmp_gtid_from_thread(thread), team));
3730 // Allocate a new task team if one is not available. Cannot use
3731 // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3732 task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3733 __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3734 __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3735#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3736 // suppress race conditions detection on synchronization flags in debug mode
3737 // this helps to analyze library internals eliminating false positives
3738 __itt_suppress_mark_range(
3739 __itt_suppress_range, __itt_suppress_threading_errors,
3740 &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3741 __itt_suppress_mark_range(__itt_suppress_range,
3742 __itt_suppress_threading_errors,
3743 CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3744 sizeof(task_team->tt.tt_active));
3745#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3746 // Note: __kmp_allocate zeroes returned memory, othewise we would need:
3747 // task_team->tt.tt_threads_data = NULL;
3748 // task_team->tt.tt_max_threads = 0;
3749 // task_team->tt.tt_next = NULL;
3750 }
3751
3752 __kmp_task_team_init(task_team, team);
3753
3754 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3755 "unfinished_threads init'd to %d\n",
3756 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3757 KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3758 return task_team;
3759}
3760
3761// __kmp_free_task_team:
3762// Frees the task team associated with a specific thread, and adds it
3763// to the global task team free list.
3764void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3765 KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3766 thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3767
3768 // Put task team back on free list
3769 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3770
3771 KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3772 task_team->tt.tt_next = __kmp_free_task_teams;
3773 TCW_PTR(__kmp_free_task_teams, task_team);
3774
3775 __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3776}
3777
3778// __kmp_reap_task_teams:
3779// Free all the task teams on the task team free list.
3780// Should only be done during library shutdown.
3781// Cannot do anything that needs a thread structure or gtid since they are
3782// already gone.
3783void __kmp_reap_task_teams(void) {
3784 kmp_task_team_t *task_team;
3785
3786 if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3787 // Free all task_teams on the free list
3788 __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3789 while ((task_team = __kmp_free_task_teams) != NULL) {
3790 __kmp_free_task_teams = task_team->tt.tt_next;
3791 task_team->tt.tt_next = NULL;
3792
3793 // Free threads_data if necessary
3794 if (task_team->tt.tt_threads_data != NULL) {
3795 __kmp_free_task_threads_data(task_team);
3796 }
3797 if (task_team->tt.tt_task_pri_list != NULL) {
3798 __kmp_free_task_pri_list(task_team);
3799 }
3800 __kmp_free(task_team);
3801 }
3802 __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3803 }
3804}
3805
3806// View the array of two task team pointers as a pair of pointers:
3807// 1) a single task_team pointer
3808// 2) next pointer for stack
3809// Serial teams can create a stack of task teams for nested serial teams.
3810void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
3811 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3812 kmp_task_team_list_t *current =
3813 (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
3814 kmp_task_team_list_t *node =
3815 (kmp_task_team_list_t *)__kmp_allocate(sizeof(kmp_task_team_list_t));
3816 node->task_team = current->task_team;
3817 node->next = current->next;
3818 thread->th.th_task_team = current->task_team = NULL;
3819 current->next = node;
3820}
3821
3822// Serial team pops a task team off the stack
3823void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
3824 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3825 kmp_task_team_list_t *current =
3826 (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
3827 if (current->task_team) {
3828 __kmp_free_task_team(thread, current->task_team);
3829 }
3830 kmp_task_team_list_t *next = current->next;
3831 if (next) {
3832 current->task_team = next->task_team;
3833 current->next = next->next;
3834 KMP_DEBUG_ASSERT(next != current);
3835 __kmp_free(next);
3836 thread->th.th_task_team = current->task_team;
3837 }
3838}
3839
3840// __kmp_wait_to_unref_task_teams:
3841// Some threads could still be in the fork barrier release code, possibly
3842// trying to steal tasks. Wait for each thread to unreference its task team.
3843void __kmp_wait_to_unref_task_teams(void) {
3844 kmp_info_t *thread;
3845 kmp_uint32 spins;
3846 kmp_uint64 time;
3847 int done;
3848
3849 KMP_INIT_YIELD(spins);
3850 KMP_INIT_BACKOFF(time);
3851
3852 for (;;) {
3853 done = TRUE;
3854
3855 // TODO: GEH - this may be is wrong because some sync would be necessary
3856 // in case threads are added to the pool during the traversal. Need to
3857 // verify that lock for thread pool is held when calling this routine.
3858 for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3859 thread = thread->th.th_next_pool) {
3860#if KMP_OS_WINDOWS
3861 DWORD exit_val;
3862#endif
3863 if (TCR_PTR(thread->th.th_task_team) == NULL) {
3864 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3865 __kmp_gtid_from_thread(thread)));
3866 continue;
3867 }
3868#if KMP_OS_WINDOWS
3869 // TODO: GEH - add this check for Linux* OS / OS X* as well?
3870 if (!__kmp_is_thread_alive(thread, &exit_val)) {
3871 thread->th.th_task_team = NULL;
3872 continue;
3873 }
3874#endif
3875
3876 done = FALSE; // Because th_task_team pointer is not NULL for this thread
3877
3878 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3879 "unreference task_team\n",
3880 __kmp_gtid_from_thread(thread)));
3881
3882 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3883 void *sleep_loc;
3884 // If the thread is sleeping, awaken it.
3885 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3886 NULL) {
3887 KA_TRACE(
3888 10,
3889 ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3890 __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3891 __kmp_null_resume_wrapper(thread);
3892 }
3893 }
3894 }
3895 if (done) {
3896 break;
3897 }
3898
3899 // If oversubscribed or have waited a bit, yield.
3900 KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3901 }
3902}
3903
3904// __kmp_task_team_setup: Create a task_team for the current team, but use
3905// an already created, unused one if it already exists.
3906void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
3907 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3908
3909 // For the serial and root teams, setup the first task team pointer to point
3910 // to task team. The other pointer is a stack of task teams from previous
3911 // serial levels.
3912 if (team == this_thr->th.th_serial_team ||
3913 team == this_thr->th.th_root->r.r_root_team) {
3914 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3915 if (team->t.t_task_team[0] == NULL) {
3916 team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
3917 KA_TRACE(
3918 20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3919 " for serial/root team %p\n",
3920 __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
3921
3922 } else
3923 __kmp_task_team_init(team->t.t_task_team[0], team);
3924 return;
3925 }
3926
3927 // If this task_team hasn't been created yet, allocate it. It will be used in
3928 // the region after the next.
3929 // If it exists, it is the current task team and shouldn't be touched yet as
3930 // it may still be in use.
3931 if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
3932 team->t.t_task_team[this_thr->th.th_task_state] =
3933 __kmp_allocate_task_team(this_thr, team);
3934 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3935 " for team %d at parity=%d\n",
3936 __kmp_gtid_from_thread(this_thr),
3937 team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
3938 this_thr->th.th_task_state));
3939 }
3940
3941 // After threads exit the release, they will call sync, and then point to this
3942 // other task_team; make sure it is allocated and properly initialized. As
3943 // threads spin in the barrier release phase, they will continue to use the
3944 // previous task_team struct(above), until they receive the signal to stop
3945 // checking for tasks (they can't safely reference the kmp_team_t struct,
3946 // which could be reallocated by the primary thread).
3947 int other_team = 1 - this_thr->th.th_task_state;
3948 KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
3949 if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3950 team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team);
3951 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
3952 "task_team %p for team %d at parity=%d\n",
3953 __kmp_gtid_from_thread(this_thr),
3954 team->t.t_task_team[other_team], team->t.t_id, other_team));
3955 } else { // Leave the old task team struct in place for the upcoming region;
3956 // adjust as needed
3957 kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3958 __kmp_task_team_init(task_team, team);
3959 // if team size has changed, the first thread to enable tasking will
3960 // realloc threads_data if necessary
3961 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
3962 "%p for team %d at parity=%d\n",
3963 __kmp_gtid_from_thread(this_thr),
3964 team->t.t_task_team[other_team], team->t.t_id, other_team));
3965 }
3966
3967 // For regular thread, task enabling should be called when the task is going
3968 // to be pushed to a dequeue. However, for the hidden helper thread, we need
3969 // it ahead of time so that some operations can be performed without race
3970 // condition.
3971 if (this_thr == __kmp_hidden_helper_main_thread) {
3972 for (int i = 0; i < 2; ++i) {
3973 kmp_task_team_t *task_team = team->t.t_task_team[i];
3974 if (KMP_TASKING_ENABLED(task_team)) {
3975 continue;
3976 }
3977 __kmp_enable_tasking(task_team, this_thr);
3978 for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
3979 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
3980 if (thread_data->td.td_deque == NULL) {
3981 __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
3982 }
3983 }
3984 }
3985 }
3986}
3987
3988// __kmp_task_team_sync: Propagation of task team data from team to threads
3989// which happens just after the release phase of a team barrier. This may be
3990// called by any thread. This is not called for serial or root teams.
3991void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
3992 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3993 KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
3994 KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
3995
3996 // Toggle the th_task_state field, to switch which task_team this thread
3997 // refers to
3998 this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
3999
4000 // It is now safe to propagate the task team pointer from the team struct to
4001 // the current thread.
4002 TCW_PTR(this_thr->th.th_task_team,
4003 team->t.t_task_team[this_thr->th.th_task_state]);
4004 KA_TRACE(20,
4005 ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
4006 "%p from Team #%d (parity=%d)\n",
4007 __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
4008 team->t.t_id, this_thr->th.th_task_state));
4009}
4010
4011// __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
4012// barrier gather phase. Only called by the primary thread.
4013//
4014// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
4015// by passing in 0 optionally as the last argument. When wait is zero, primary
4016// thread does not wait for unfinished_threads to reach 0.
4017void __kmp_task_team_wait(
4018 kmp_info_t *this_thr,
4019 kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
4020 kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
4021
4022 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4023 KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4024
4025 if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4026 if (wait) {
4027 KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4028 "(for unfinished_threads to reach 0) on task_team = %p\n",
4029 __kmp_gtid_from_thread(this_thr), task_team));
4030 // Worker threads may have dropped through to release phase, but could
4031 // still be executing tasks. Wait here for tasks to complete. To avoid
4032 // memory contention, only primary thread checks termination condition.
4033 kmp_flag_32<false, false> flag(
4034 RCAST(std::atomic<kmp_uint32> *,
4035 &task_team->tt.tt_unfinished_threads),
4036 0U);
4037 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4038 }
4039 // Deactivate the old task team, so that the worker threads will stop
4040 // referencing it while spinning.
4041 KA_TRACE(
4042 20,
4043 ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4044 "setting active to false, setting local and team's pointer to NULL\n",
4045 __kmp_gtid_from_thread(this_thr), task_team));
4046 TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4047 TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4048 KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
4049 TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4050 KMP_MB();
4051
4052 TCW_PTR(this_thr->th.th_task_team, NULL);
4053 }
4054}
4055
4056// __kmp_tasking_barrier:
4057// This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4058// Internal function to execute all tasks prior to a regular barrier or a join
4059// barrier. It is a full barrier itself, which unfortunately turns regular
4060// barriers into double barriers and join barriers into 1 1/2 barriers.
4061void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
4062 std::atomic<kmp_uint32> *spin = RCAST(
4063 std::atomic<kmp_uint32> *,
4064 &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4065 int flag = FALSE;
4066 KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
4067
4068#if USE_ITT_BUILD
4069 KMP_FSYNC_SPIN_INIT(spin, NULL);
4070#endif /* USE_ITT_BUILD */
4071 kmp_flag_32<false, false> spin_flag(spin, 0U);
4072 while (!spin_flag.execute_tasks(thread, gtid, TRUE,
4073 &flag USE_ITT_BUILD_ARG(NULL), 0)) {
4074#if USE_ITT_BUILD
4075 // TODO: What about itt_sync_obj??
4076 KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4077#endif /* USE_ITT_BUILD */
4078
4079 if (TCR_4(__kmp_global.g.g_done)) {
4080 if (__kmp_global.g.g_abort)
4081 __kmp_abort_thread();
4082 break;
4083 }
4084 KMP_YIELD(TRUE);
4085 }
4086#if USE_ITT_BUILD
4087 KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4088#endif /* USE_ITT_BUILD */
4089}
4090
4091// __kmp_give_task puts a task into a given thread queue if:
4092// - the queue for that thread was created
4093// - there's space in that queue
4094// Because of this, __kmp_push_task needs to check if there's space after
4095// getting the lock
4096static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
4097 kmp_int32 pass) {
4098 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4099 kmp_task_team_t *task_team = taskdata->td_task_team;
4100
4101 KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4102 taskdata, tid));
4103
4104 // If task_team is NULL something went really bad...
4105 KMP_DEBUG_ASSERT(task_team != NULL);
4106
4107 bool result = false;
4108 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4109
4110 if (thread_data->td.td_deque == NULL) {
4111 // There's no queue in this thread, go find another one
4112 // We're guaranteed that at least one thread has a queue
4113 KA_TRACE(30,
4114 ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4115 tid, taskdata));
4116 return result;
4117 }
4118
4119 if (TCR_4(thread_data->td.td_deque_ntasks) >=
4120 TASK_DEQUE_SIZE(thread_data->td)) {
4121 KA_TRACE(
4122 30,
4123 ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4124 taskdata, tid));
4125
4126 // if this deque is bigger than the pass ratio give a chance to another
4127 // thread
4128 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4129 return result;
4130
4131 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4132 if (TCR_4(thread_data->td.td_deque_ntasks) >=
4133 TASK_DEQUE_SIZE(thread_data->td)) {
4134 // expand deque to push the task which is not allowed to execute
4135 __kmp_realloc_task_deque(thread, thread_data);
4136 }
4137
4138 } else {
4139
4140 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4141
4142 if (TCR_4(thread_data->td.td_deque_ntasks) >=
4143 TASK_DEQUE_SIZE(thread_data->td)) {
4144 KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
4145 "thread %d.\n",
4146 taskdata, tid));
4147
4148 // if this deque is bigger than the pass ratio give a chance to another
4149 // thread
4150 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4151 goto release_and_exit;
4152
4153 __kmp_realloc_task_deque(thread, thread_data);
4154 }
4155 }
4156
4157 // lock is held here, and there is space in the deque
4158
4159 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4160 // Wrap index.
4161 thread_data->td.td_deque_tail =
4162 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
4163 TCW_4(thread_data->td.td_deque_ntasks,
4164 TCR_4(thread_data->td.td_deque_ntasks) + 1);
4165
4166 result = true;
4167 KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4168 taskdata, tid));
4169
4170release_and_exit:
4171 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
4172
4173 return result;
4174}
4175
4176#define PROXY_TASK_FLAG 0x40000000
4177/* The finish of the proxy tasks is divided in two pieces:
4178 - the top half is the one that can be done from a thread outside the team
4179 - the bottom half must be run from a thread within the team
4180
4181 In order to run the bottom half the task gets queued back into one of the
4182 threads of the team. Once the td_incomplete_child_task counter of the parent
4183 is decremented the threads can leave the barriers. So, the bottom half needs
4184 to be queued before the counter is decremented. The top half is therefore
4185 divided in two parts:
4186 - things that can be run before queuing the bottom half
4187 - things that must be run after queuing the bottom half
4188
4189 This creates a second race as the bottom half can free the task before the
4190 second top half is executed. To avoid this we use the
4191 td_incomplete_child_task of the proxy task to synchronize the top and bottom
4192 half. */
4193static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4194 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
4195 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4196 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
4197 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
4198
4199 taskdata->td_flags.complete = 1; // mark the task as completed
4200#if OMPX_TASKGRAPH
4201 taskdata->td_flags.onced = 1;
4202#endif
4203
4204 if (taskdata->td_taskgroup)
4205 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4206
4207 // Create an imaginary children for this task so the bottom half cannot
4208 // release the task before we have completed the second top half
4209 KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
4210}
4211
4212static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4213#if KMP_DEBUG
4214 kmp_int32 children = 0;
4215 // Predecrement simulated by "- 1" calculation
4216 children = -1 +
4217#endif
4218 KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
4219 KMP_DEBUG_ASSERT(children >= 0);
4220
4221 // Remove the imaginary children
4222 KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
4223}
4224
4225static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
4226 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4227 kmp_info_t *thread = __kmp_threads[gtid];
4228
4229 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4230 KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
4231 1); // top half must run before bottom half
4232
4233 // We need to wait to make sure the top half is finished
4234 // Spinning here should be ok as this should happen quickly
4235 while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4236 PROXY_TASK_FLAG) > 0)
4237 ;
4238
4239 __kmp_release_deps(gtid, taskdata);
4240 __kmp_free_task_and_ancestors(gtid, taskdata, thread);
4241}
4242
4251void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
4252 KMP_DEBUG_ASSERT(ptask != NULL);
4253 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4254 KA_TRACE(
4255 10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4256 gtid, taskdata));
4257 __kmp_assert_valid_gtid(gtid);
4258 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4259
4260 __kmp_first_top_half_finish_proxy(taskdata);
4261 __kmp_second_top_half_finish_proxy(taskdata);
4262 __kmp_bottom_half_finish_proxy(gtid, ptask);
4263
4264 KA_TRACE(10,
4265 ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4266 gtid, taskdata));
4267}
4268
4269void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
4270 KMP_DEBUG_ASSERT(ptask != NULL);
4271 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4272
4273 // Enqueue task to complete bottom half completion from a thread within the
4274 // corresponding team
4275 kmp_team_t *team = taskdata->td_team;
4276 kmp_int32 nthreads = team->t.t_nproc;
4277 kmp_info_t *thread;
4278
4279 // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4280 // but we cannot use __kmp_get_random here
4281 kmp_int32 start_k = start % nthreads;
4282 kmp_int32 pass = 1;
4283 kmp_int32 k = start_k;
4284
4285 do {
4286 // For now we're just linearly trying to find a thread
4287 thread = team->t.t_threads[k];
4288 k = (k + 1) % nthreads;
4289
4290 // we did a full pass through all the threads
4291 if (k == start_k)
4292 pass = pass << 1;
4293
4294 } while (!__kmp_give_task(thread, k, ptask, pass));
4295
4296 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && __kmp_wpolicy_passive) {
4297 // awake at least one thread to execute given task
4298 for (int i = 0; i < nthreads; ++i) {
4299 thread = team->t.t_threads[i];
4300 if (thread->th.th_sleep_loc != NULL) {
4301 __kmp_null_resume_wrapper(thread);
4302 break;
4303 }
4304 }
4305 }
4306}
4307
4315void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
4316 KMP_DEBUG_ASSERT(ptask != NULL);
4317 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4318
4319 KA_TRACE(
4320 10,
4321 ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4322 taskdata));
4323
4324 KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4325
4326 __kmp_first_top_half_finish_proxy(taskdata);
4327
4328 __kmpc_give_task(ptask);
4329
4330 __kmp_second_top_half_finish_proxy(taskdata);
4331
4332 KA_TRACE(
4333 10,
4334 ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4335 taskdata));
4336}
4337
4338kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
4339 kmp_task_t *task) {
4340 kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4341 if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4342 td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4343 td->td_allow_completion_event.ed.task = task;
4344 __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4345 }
4346 return &td->td_allow_completion_event;
4347}
4348
4349void __kmp_fulfill_event(kmp_event_t *event) {
4350 if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4351 kmp_task_t *ptask = event->ed.task;
4352 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4353 bool detached = false;
4354 int gtid = __kmp_get_gtid();
4355
4356 // The associated task might have completed or could be completing at this
4357 // point.
4358 // We need to take the lock to avoid races
4359 __kmp_acquire_tas_lock(&event->lock, gtid);
4360 if (taskdata->td_flags.proxy == TASK_PROXY) {
4361 detached = true;
4362 } else {
4363#if OMPT_SUPPORT
4364 // The OMPT event must occur under mutual exclusion,
4365 // otherwise the tool might access ptask after free
4366 if (UNLIKELY(ompt_enabled.enabled))
4367 __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4368#endif
4369 }
4370 event->type = KMP_EVENT_UNINITIALIZED;
4371 __kmp_release_tas_lock(&event->lock, gtid);
4372
4373 if (detached) {
4374#if OMPT_SUPPORT
4375 // We free ptask afterwards and know the task is finished,
4376 // so locking is not necessary
4377 if (UNLIKELY(ompt_enabled.enabled))
4378 __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4379#endif
4380 // If the task detached complete the proxy task
4381 if (gtid >= 0) {
4382 kmp_team_t *team = taskdata->td_team;
4383 kmp_info_t *thread = __kmp_get_thread();
4384 if (thread->th.th_team == team) {
4385 __kmpc_proxy_task_completed(gtid, ptask);
4386 return;
4387 }
4388 }
4389
4390 // fallback
4392 }
4393 }
4394}
4395
4396// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4397// for taskloop
4398//
4399// thread: allocating thread
4400// task_src: pointer to source task to be duplicated
4401// taskloop_recur: used only when dealing with taskgraph,
4402// indicating whether we need to update task->td_task_id
4403// returns: a pointer to the allocated kmp_task_t structure (task).
4404kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
4405#if OMPX_TASKGRAPH
4406 , int taskloop_recur
4407#endif
4408) {
4409 kmp_task_t *task;
4410 kmp_taskdata_t *taskdata;
4411 kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4412 kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4413 size_t shareds_offset;
4414 size_t task_size;
4415
4416 KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4417 task_src));
4418 KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4419 TASK_FULL); // it should not be proxy task
4420 KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4421 task_size = taskdata_src->td_size_alloc;
4422
4423 // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4424 KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4425 task_size));
4426#if USE_FAST_MEMORY
4427 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4428#else
4429 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4430#endif /* USE_FAST_MEMORY */
4431 KMP_MEMCPY(taskdata, taskdata_src, task_size);
4432
4433 task = KMP_TASKDATA_TO_TASK(taskdata);
4434
4435 // Initialize new task (only specific fields not affected by memcpy)
4436#if OMPX_TASKGRAPH
4437 if (taskdata->is_taskgraph && !taskloop_recur &&
4438 __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
4439 taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
4440#endif
4441 taskdata->td_task_id = KMP_GEN_TASK_ID();
4442 if (task->shareds != NULL) { // need setup shareds pointer
4443 shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4444 task->shareds = &((char *)taskdata)[shareds_offset];
4445 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4446 0);
4447 }
4448 taskdata->td_alloc_thread = thread;
4449 taskdata->td_parent = parent_task;
4450 // task inherits the taskgroup from the parent task
4451 taskdata->td_taskgroup = parent_task->td_taskgroup;
4452 // tied task needs to initialize the td_last_tied at creation,
4453 // untied one does this when it is scheduled for execution
4454 if (taskdata->td_flags.tiedness == TASK_TIED)
4455 taskdata->td_last_tied = taskdata;
4456
4457 // Only need to keep track of child task counts if team parallel and tasking
4458 // not serialized
4459 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4460 KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4461 if (parent_task->td_taskgroup)
4462 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4463 // Only need to keep track of allocated child tasks for explicit tasks since
4464 // implicit not deallocated
4465 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4466 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4467 }
4468
4469 KA_TRACE(20,
4470 ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4471 thread, taskdata, taskdata->td_parent));
4472#if OMPT_SUPPORT
4473 if (UNLIKELY(ompt_enabled.enabled))
4474 __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4475#endif
4476 return task;
4477}
4478
4479// Routine optionally generated by the compiler for setting the lastprivate flag
4480// and calling needed constructors for private/firstprivate objects
4481// (used to form taskloop tasks from pattern task)
4482// Parameters: dest task, src task, lastprivate flag.
4483typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4484
4485KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4486
4487// class to encapsulate manipulating loop bounds in a taskloop task.
4488// this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4489// the loop bound variables.
4490class kmp_taskloop_bounds_t {
4491 kmp_task_t *task;
4492 const kmp_taskdata_t *taskdata;
4493 size_t lower_offset;
4494 size_t upper_offset;
4495
4496public:
4497 kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4498 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4499 lower_offset((char *)lb - (char *)task),
4500 upper_offset((char *)ub - (char *)task) {
4501 KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4502 KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4503 }
4504 kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4505 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4506 lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4507 size_t get_lower_offset() const { return lower_offset; }
4508 size_t get_upper_offset() const { return upper_offset; }
4509 kmp_uint64 get_lb() const {
4510 kmp_int64 retval;
4511#if defined(KMP_GOMP_COMPAT)
4512 // Intel task just returns the lower bound normally
4513 if (!taskdata->td_flags.native) {
4514 retval = *(kmp_int64 *)((char *)task + lower_offset);
4515 } else {
4516 // GOMP task has to take into account the sizeof(long)
4517 if (taskdata->td_size_loop_bounds == 4) {
4518 kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4519 retval = (kmp_int64)*lb;
4520 } else {
4521 kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4522 retval = (kmp_int64)*lb;
4523 }
4524 }
4525#else
4526 (void)taskdata;
4527 retval = *(kmp_int64 *)((char *)task + lower_offset);
4528#endif // defined(KMP_GOMP_COMPAT)
4529 return retval;
4530 }
4531 kmp_uint64 get_ub() const {
4532 kmp_int64 retval;
4533#if defined(KMP_GOMP_COMPAT)
4534 // Intel task just returns the upper bound normally
4535 if (!taskdata->td_flags.native) {
4536 retval = *(kmp_int64 *)((char *)task + upper_offset);
4537 } else {
4538 // GOMP task has to take into account the sizeof(long)
4539 if (taskdata->td_size_loop_bounds == 4) {
4540 kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4541 retval = (kmp_int64)*ub;
4542 } else {
4543 kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4544 retval = (kmp_int64)*ub;
4545 }
4546 }
4547#else
4548 retval = *(kmp_int64 *)((char *)task + upper_offset);
4549#endif // defined(KMP_GOMP_COMPAT)
4550 return retval;
4551 }
4552 void set_lb(kmp_uint64 lb) {
4553#if defined(KMP_GOMP_COMPAT)
4554 // Intel task just sets the lower bound normally
4555 if (!taskdata->td_flags.native) {
4556 *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4557 } else {
4558 // GOMP task has to take into account the sizeof(long)
4559 if (taskdata->td_size_loop_bounds == 4) {
4560 kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4561 *lower = (kmp_uint32)lb;
4562 } else {
4563 kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4564 *lower = (kmp_uint64)lb;
4565 }
4566 }
4567#else
4568 *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4569#endif // defined(KMP_GOMP_COMPAT)
4570 }
4571 void set_ub(kmp_uint64 ub) {
4572#if defined(KMP_GOMP_COMPAT)
4573 // Intel task just sets the upper bound normally
4574 if (!taskdata->td_flags.native) {
4575 *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4576 } else {
4577 // GOMP task has to take into account the sizeof(long)
4578 if (taskdata->td_size_loop_bounds == 4) {
4579 kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4580 *upper = (kmp_uint32)ub;
4581 } else {
4582 kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4583 *upper = (kmp_uint64)ub;
4584 }
4585 }
4586#else
4587 *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4588#endif // defined(KMP_GOMP_COMPAT)
4589 }
4590};
4591
4592// __kmp_taskloop_linear: Start tasks of the taskloop linearly
4593//
4594// loc Source location information
4595// gtid Global thread ID
4596// task Pattern task, exposes the loop iteration range
4597// lb Pointer to loop lower bound in task structure
4598// ub Pointer to loop upper bound in task structure
4599// st Loop stride
4600// ub_glob Global upper bound (used for lastprivate check)
4601// num_tasks Number of tasks to execute
4602// grainsize Number of loop iterations per task
4603// extras Number of chunks with grainsize+1 iterations
4604// last_chunk Reduction of grainsize for last task
4605// tc Iterations count
4606// task_dup Tasks duplication routine
4607// codeptr_ra Return address for OMPT events
4608void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4609 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4610 kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4611 kmp_uint64 grainsize, kmp_uint64 extras,
4612 kmp_int64 last_chunk, kmp_uint64 tc,
4613#if OMPT_SUPPORT
4614 void *codeptr_ra,
4615#endif
4616 void *task_dup) {
4617 KMP_COUNT_BLOCK(OMP_TASKLOOP);
4618 KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4619 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4620 // compiler provides global bounds here
4621 kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4622 kmp_uint64 lower = task_bounds.get_lb();
4623 kmp_uint64 upper = task_bounds.get_ub();
4624 kmp_uint64 i;
4625 kmp_info_t *thread = __kmp_threads[gtid];
4626 kmp_taskdata_t *current_task = thread->th.th_current_task;
4627 kmp_task_t *next_task;
4628 kmp_int32 lastpriv = 0;
4629
4630 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4631 (last_chunk < 0 ? last_chunk : extras));
4632 KMP_DEBUG_ASSERT(num_tasks > extras);
4633 KMP_DEBUG_ASSERT(num_tasks > 0);
4634 KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4635 "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4636 gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4637 ub_glob, st, task_dup));
4638
4639 // Launch num_tasks tasks, assign grainsize iterations each task
4640 for (i = 0; i < num_tasks; ++i) {
4641 kmp_uint64 chunk_minus_1;
4642 if (extras == 0) {
4643 chunk_minus_1 = grainsize - 1;
4644 } else {
4645 chunk_minus_1 = grainsize;
4646 --extras; // first extras iterations get bigger chunk (grainsize+1)
4647 }
4648 upper = lower + st * chunk_minus_1;
4649 if (upper > *ub) {
4650 upper = *ub;
4651 }
4652 if (i == num_tasks - 1) {
4653 // schedule the last task, set lastprivate flag if needed
4654 if (st == 1) { // most common case
4655 KMP_DEBUG_ASSERT(upper == *ub);
4656 if (upper == ub_glob)
4657 lastpriv = 1;
4658 } else if (st > 0) { // positive loop stride
4659 KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4660 if ((kmp_uint64)st > ub_glob - upper)
4661 lastpriv = 1;
4662 } else { // negative loop stride
4663 KMP_DEBUG_ASSERT(upper + st < *ub);
4664 if (upper - ub_glob < (kmp_uint64)(-st))
4665 lastpriv = 1;
4666 }
4667 }
4668
4669#if OMPX_TASKGRAPH
4670 next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
4671#else
4672 next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4673#endif
4674
4675 kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4676 kmp_taskloop_bounds_t next_task_bounds =
4677 kmp_taskloop_bounds_t(next_task, task_bounds);
4678
4679 // adjust task-specific bounds
4680 next_task_bounds.set_lb(lower);
4681 if (next_taskdata->td_flags.native) {
4682 next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4683 } else {
4684 next_task_bounds.set_ub(upper);
4685 }
4686 if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4687 // etc.
4688 ptask_dup(next_task, task, lastpriv);
4689 KA_TRACE(40,
4690 ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4691 "upper %lld stride %lld, (offsets %p %p)\n",
4692 gtid, i, next_task, lower, upper, st,
4693 next_task_bounds.get_lower_offset(),
4694 next_task_bounds.get_upper_offset()));
4695#if OMPT_SUPPORT
4696 __kmp_omp_taskloop_task(NULL, gtid, next_task,
4697 codeptr_ra); // schedule new task
4698#if OMPT_OPTIONAL
4699 if (ompt_enabled.ompt_callback_dispatch) {
4700 OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
4701 lower, upper, st);
4702 }
4703#endif // OMPT_OPTIONAL
4704#else
4705 __kmp_omp_task(gtid, next_task, true); // schedule new task
4706#endif
4707 lower = upper + st; // adjust lower bound for the next iteration
4708 }
4709 // free the pattern task and exit
4710 __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
4711 // do not execute the pattern task, just do internal bookkeeping
4712 __kmp_task_finish<false>(gtid, task, current_task);
4713}
4714
4715// Structure to keep taskloop parameters for auxiliary task
4716// kept in the shareds of the task structure.
4717typedef struct __taskloop_params {
4718 kmp_task_t *task;
4719 kmp_uint64 *lb;
4720 kmp_uint64 *ub;
4721 void *task_dup;
4722 kmp_int64 st;
4723 kmp_uint64 ub_glob;
4724 kmp_uint64 num_tasks;
4725 kmp_uint64 grainsize;
4726 kmp_uint64 extras;
4727 kmp_int64 last_chunk;
4728 kmp_uint64 tc;
4729 kmp_uint64 num_t_min;
4730#if OMPT_SUPPORT
4731 void *codeptr_ra;
4732#endif
4733} __taskloop_params_t;
4734
4735void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4736 kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4737 kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
4738 kmp_uint64,
4739#if OMPT_SUPPORT
4740 void *,
4741#endif
4742 void *);
4743
4744// Execute part of the taskloop submitted as a task.
4745int __kmp_taskloop_task(int gtid, void *ptask) {
4746 __taskloop_params_t *p =
4747 (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4748 kmp_task_t *task = p->task;
4749 kmp_uint64 *lb = p->lb;
4750 kmp_uint64 *ub = p->ub;
4751 void *task_dup = p->task_dup;
4752 // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4753 kmp_int64 st = p->st;
4754 kmp_uint64 ub_glob = p->ub_glob;
4755 kmp_uint64 num_tasks = p->num_tasks;
4756 kmp_uint64 grainsize = p->grainsize;
4757 kmp_uint64 extras = p->extras;
4758 kmp_int64 last_chunk = p->last_chunk;
4759 kmp_uint64 tc = p->tc;
4760 kmp_uint64 num_t_min = p->num_t_min;
4761#if OMPT_SUPPORT
4762 void *codeptr_ra = p->codeptr_ra;
4763#endif
4764#if KMP_DEBUG
4765 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4766 KMP_DEBUG_ASSERT(task != NULL);
4767 KA_TRACE(20,
4768 ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4769 " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4770 gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4771 st, task_dup));
4772#endif
4773 KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4774 if (num_tasks > num_t_min)
4775 __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4776 grainsize, extras, last_chunk, tc, num_t_min,
4777#if OMPT_SUPPORT
4778 codeptr_ra,
4779#endif
4780 task_dup);
4781 else
4782 __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4783 grainsize, extras, last_chunk, tc,
4784#if OMPT_SUPPORT
4785 codeptr_ra,
4786#endif
4787 task_dup);
4788
4789 KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4790 return 0;
4791}
4792
4793// Schedule part of the taskloop as a task,
4794// execute the rest of the taskloop.
4795//
4796// loc Source location information
4797// gtid Global thread ID
4798// task Pattern task, exposes the loop iteration range
4799// lb Pointer to loop lower bound in task structure
4800// ub Pointer to loop upper bound in task structure
4801// st Loop stride
4802// ub_glob Global upper bound (used for lastprivate check)
4803// num_tasks Number of tasks to execute
4804// grainsize Number of loop iterations per task
4805// extras Number of chunks with grainsize+1 iterations
4806// last_chunk Reduction of grainsize for last task
4807// tc Iterations count
4808// num_t_min Threshold to launch tasks recursively
4809// task_dup Tasks duplication routine
4810// codeptr_ra Return address for OMPT events
4811void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4812 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4813 kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4814 kmp_uint64 grainsize, kmp_uint64 extras,
4815 kmp_int64 last_chunk, kmp_uint64 tc,
4816 kmp_uint64 num_t_min,
4817#if OMPT_SUPPORT
4818 void *codeptr_ra,
4819#endif
4820 void *task_dup) {
4821 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4822 KMP_DEBUG_ASSERT(task != NULL);
4823 KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4824 KA_TRACE(20,
4825 ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4826 " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4827 gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4828 st, task_dup));
4829 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4830 kmp_uint64 lower = *lb;
4831 kmp_info_t *thread = __kmp_threads[gtid];
4832 // kmp_taskdata_t *current_task = thread->th.th_current_task;
4833 kmp_task_t *next_task;
4834 size_t lower_offset =
4835 (char *)lb - (char *)task; // remember offset of lb in the task structure
4836 size_t upper_offset =
4837 (char *)ub - (char *)task; // remember offset of ub in the task structure
4838
4839 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4840 (last_chunk < 0 ? last_chunk : extras));
4841 KMP_DEBUG_ASSERT(num_tasks > extras);
4842 KMP_DEBUG_ASSERT(num_tasks > 0);
4843
4844 // split the loop in two halves
4845 kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4846 kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
4847 kmp_uint64 gr_size0 = grainsize;
4848 kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
4849 kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
4850 if (last_chunk < 0) {
4851 ext0 = ext1 = 0;
4852 last_chunk1 = last_chunk;
4853 tc0 = grainsize * n_tsk0;
4854 tc1 = tc - tc0;
4855 } else if (n_tsk0 <= extras) {
4856 gr_size0++; // integrate extras into grainsize
4857 ext0 = 0; // no extra iters in 1st half
4858 ext1 = extras - n_tsk0; // remaining extras
4859 tc0 = gr_size0 * n_tsk0;
4860 tc1 = tc - tc0;
4861 } else { // n_tsk0 > extras
4862 ext1 = 0; // no extra iters in 2nd half
4863 ext0 = extras;
4864 tc1 = grainsize * n_tsk1;
4865 tc0 = tc - tc1;
4866 }
4867 ub0 = lower + st * (tc0 - 1);
4868 lb1 = ub0 + st;
4869
4870 // create pattern task for 2nd half of the loop
4871#if OMPX_TASKGRAPH
4872 next_task = __kmp_task_dup_alloc(thread, task,
4873 /* taskloop_recur */ 1);
4874#else
4875 next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
4876#endif
4877 // adjust lower bound (upper bound is not changed) for the 2nd half
4878 *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4879 if (ptask_dup != NULL) // construct firstprivates, etc.
4880 ptask_dup(next_task, task, 0);
4881 *ub = ub0; // adjust upper bound for the 1st half
4882
4883 // create auxiliary task for 2nd half of the loop
4884 // make sure new task has same parent task as the pattern task
4885 kmp_taskdata_t *current_task = thread->th.th_current_task;
4886 thread->th.th_current_task = taskdata->td_parent;
4887 kmp_task_t *new_task =
4888 __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4889 sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4890 // restore current task
4891 thread->th.th_current_task = current_task;
4892 __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4893 p->task = next_task;
4894 p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4895 p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4896 p->task_dup = task_dup;
4897 p->st = st;
4898 p->ub_glob = ub_glob;
4899 p->num_tasks = n_tsk1;
4900 p->grainsize = grainsize;
4901 p->extras = ext1;
4902 p->last_chunk = last_chunk1;
4903 p->tc = tc1;
4904 p->num_t_min = num_t_min;
4905#if OMPT_SUPPORT
4906 p->codeptr_ra = codeptr_ra;
4907#endif
4908
4909#if OMPX_TASKGRAPH
4910 kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
4911 new_task_data->tdg = taskdata->tdg;
4912 new_task_data->is_taskgraph = 0;
4913#endif
4914
4915#if OMPT_SUPPORT
4916 // schedule new task with correct return address for OMPT events
4917 __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4918#else
4919 __kmp_omp_task(gtid, new_task, true); // schedule new task
4920#endif
4921
4922 // execute the 1st half of current subrange
4923 if (n_tsk0 > num_t_min)
4924 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4925 ext0, last_chunk0, tc0, num_t_min,
4926#if OMPT_SUPPORT
4927 codeptr_ra,
4928#endif
4929 task_dup);
4930 else
4931 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4932 gr_size0, ext0, last_chunk0, tc0,
4933#if OMPT_SUPPORT
4934 codeptr_ra,
4935#endif
4936 task_dup);
4937
4938 KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
4939}
4940
4941static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4942 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4943 int nogroup, int sched, kmp_uint64 grainsize,
4944 int modifier, void *task_dup) {
4945 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4946 KMP_DEBUG_ASSERT(task != NULL);
4947 if (nogroup == 0) {
4948#if OMPT_SUPPORT && OMPT_OPTIONAL
4949 OMPT_STORE_RETURN_ADDRESS(gtid);
4950#endif
4951 __kmpc_taskgroup(loc, gtid);
4952 }
4953
4954#if OMPX_TASKGRAPH
4955 KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
4956#endif
4957 // =========================================================================
4958 // calculate loop parameters
4959 kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4960 kmp_uint64 tc;
4961 // compiler provides global bounds here
4962 kmp_uint64 lower = task_bounds.get_lb();
4963 kmp_uint64 upper = task_bounds.get_ub();
4964 kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
4965 kmp_uint64 num_tasks = 0, extras = 0;
4966 kmp_int64 last_chunk =
4967 0; // reduce grainsize of last task by last_chunk in strict mode
4968 kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4969 kmp_info_t *thread = __kmp_threads[gtid];
4970 kmp_taskdata_t *current_task = thread->th.th_current_task;
4971
4972 KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
4973 "grain %llu(%d, %d), dup %p\n",
4974 gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
4975 task_dup));
4976
4977 // compute trip count
4978 if (st == 1) { // most common case
4979 tc = upper - lower + 1;
4980 } else if (st < 0) {
4981 tc = (lower - upper) / (-st) + 1;
4982 } else { // st > 0
4983 tc = (upper - lower) / st + 1;
4984 }
4985 if (tc == 0) {
4986 KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
4987 // free the pattern task and exit
4988 __kmp_task_start(gtid, task, current_task);
4989 // do not execute anything for zero-trip loop
4990 __kmp_task_finish<false>(gtid, task, current_task);
4991 return;
4992 }
4993
4994#if OMPT_SUPPORT && OMPT_OPTIONAL
4995 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
4996 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
4997 if (ompt_enabled.ompt_callback_work) {
4998 ompt_callbacks.ompt_callback(ompt_callback_work)(
4999 ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
5000 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5001 }
5002#endif
5003
5004 if (num_tasks_min == 0)
5005 // TODO: can we choose better default heuristic?
5006 num_tasks_min =
5007 KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
5008
5009 // compute num_tasks/grainsize based on the input provided
5010 switch (sched) {
5011 case 0: // no schedule clause specified, we can choose the default
5012 // let's try to schedule (team_size*10) tasks
5013 grainsize = thread->th.th_team_nproc * static_cast<kmp_uint64>(10);
5014 KMP_FALLTHROUGH();
5015 case 2: // num_tasks provided
5016 if (grainsize > tc) {
5017 num_tasks = tc; // too big num_tasks requested, adjust values
5018 grainsize = 1;
5019 extras = 0;
5020 } else {
5021 num_tasks = grainsize;
5022 grainsize = tc / num_tasks;
5023 extras = tc % num_tasks;
5024 }
5025 break;
5026 case 1: // grainsize provided
5027 if (grainsize > tc) {
5028 num_tasks = 1;
5029 grainsize = tc; // too big grainsize requested, adjust values
5030 extras = 0;
5031 } else {
5032 if (modifier) {
5033 num_tasks = (tc + grainsize - 1) / grainsize;
5034 last_chunk = tc - (num_tasks * grainsize);
5035 extras = 0;
5036 } else {
5037 num_tasks = tc / grainsize;
5038 // adjust grainsize for balanced distribution of iterations
5039 grainsize = tc / num_tasks;
5040 extras = tc % num_tasks;
5041 }
5042 }
5043 break;
5044 default:
5045 KMP_ASSERT2(0, "unknown scheduling of taskloop");
5046 }
5047
5048 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5049 (last_chunk < 0 ? last_chunk : extras));
5050 KMP_DEBUG_ASSERT(num_tasks > extras);
5051 KMP_DEBUG_ASSERT(num_tasks > 0);
5052 // =========================================================================
5053
5054 // check if clause value first
5055 // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
5056 if (if_val == 0) { // if(0) specified, mark task as serial
5057 taskdata->td_flags.task_serial = 1;
5058 taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
5059 // always start serial tasks linearly
5060 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5061 grainsize, extras, last_chunk, tc,
5062#if OMPT_SUPPORT
5063 OMPT_GET_RETURN_ADDRESS(0),
5064#endif
5065 task_dup);
5066 // !taskdata->td_flags.native => currently force linear spawning of tasks
5067 // for GOMP_taskloop
5068 } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
5069 KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
5070 "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5071 gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5072 last_chunk));
5073 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5074 grainsize, extras, last_chunk, tc, num_tasks_min,
5075#if OMPT_SUPPORT
5076 OMPT_GET_RETURN_ADDRESS(0),
5077#endif
5078 task_dup);
5079 } else {
5080 KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
5081 "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5082 gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5083 last_chunk));
5084 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5085 grainsize, extras, last_chunk, tc,
5086#if OMPT_SUPPORT
5087 OMPT_GET_RETURN_ADDRESS(0),
5088#endif
5089 task_dup);
5090 }
5091
5092#if OMPT_SUPPORT && OMPT_OPTIONAL
5093 if (ompt_enabled.ompt_callback_work) {
5094 ompt_callbacks.ompt_callback(ompt_callback_work)(
5095 ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
5096 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5097 }
5098#endif
5099
5100 if (nogroup == 0) {
5101#if OMPT_SUPPORT && OMPT_OPTIONAL
5102 OMPT_STORE_RETURN_ADDRESS(gtid);
5103#endif
5104 __kmpc_end_taskgroup(loc, gtid);
5105 }
5106 KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
5107}
5108
5125void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5126 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
5127 int sched, kmp_uint64 grainsize, void *task_dup) {
5128 __kmp_assert_valid_gtid(gtid);
5129 KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
5130 __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5131 0, task_dup);
5132 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
5133}
5134
5152void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5153 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5154 int nogroup, int sched, kmp_uint64 grainsize,
5155 int modifier, void *task_dup) {
5156 __kmp_assert_valid_gtid(gtid);
5157 KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
5158 __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5159 modifier, task_dup);
5160 KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
5161}
5162
5172 if (gtid == KMP_GTID_DNE)
5173 return NULL;
5174
5175 kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5176 kmp_taskdata_t *taskdata = thread->th.th_current_task;
5177
5178 if (!taskdata)
5179 return NULL;
5180
5181 return &taskdata->td_target_data.async_handle;
5182}
5183
5192bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
5193 if (gtid == KMP_GTID_DNE)
5194 return FALSE;
5195
5196 kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5197 kmp_taskdata_t *taskdata = thread->th.th_current_task;
5198
5199 if (!taskdata)
5200 return FALSE;
5201
5202 return taskdata->td_task_team != NULL;
5203}
5204
5205#if OMPX_TASKGRAPH
5206// __kmp_find_tdg: identify a TDG through its ID
5207// tdg_id: ID of the TDG
5208// returns: If a TDG corresponding to this ID is found and not
5209// its initial state, return the pointer to it, otherwise nullptr
5210static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
5211 kmp_tdg_info_t *res = nullptr;
5212 if (__kmp_max_tdgs == 0)
5213 return res;
5214
5215 if (__kmp_global_tdgs == NULL)
5216 __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
5217 sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
5218
5219 if ((__kmp_global_tdgs[tdg_id]) &&
5220 (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
5221 res = __kmp_global_tdgs[tdg_id];
5222 return res;
5223}
5224
5225// __kmp_print_tdg_dot: prints the TDG to a dot file
5226// tdg: ID of the TDG
5227// gtid: Global Thread ID
5228void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg, kmp_int32 gtid) {
5229 kmp_int32 tdg_id = tdg->tdg_id;
5230 KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id));
5231
5232 char file_name[20];
5233 sprintf(file_name, "tdg_%d.dot", tdg_id);
5234 kmp_safe_raii_file_t tdg_file(file_name, "w");
5235
5236 kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5237 fprintf(tdg_file,
5238 "digraph TDG {\n"
5239 " compound=true\n"
5240 " subgraph cluster {\n"
5241 " label=TDG_%d\n",
5242 tdg_id);
5243 for (kmp_int32 i = 0; i < num_tasks; i++) {
5244 fprintf(tdg_file, " %d[style=bold]\n", i);
5245 }
5246 fprintf(tdg_file, " }\n");
5247 for (kmp_int32 i = 0; i < num_tasks; i++) {
5248 kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
5249 kmp_int32 *successors = tdg->record_map[i].successors;
5250 if (nsuccessors > 0) {
5251 for (kmp_int32 j = 0; j < nsuccessors; j++)
5252 fprintf(tdg_file, " %d -> %d \n", i, successors[j]);
5253 }
5254 }
5255 fprintf(tdg_file, "}");
5256 KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id));
5257}
5258
5259// __kmp_exec_tdg: launch the execution of a previous
5260// recorded TDG
5261// gtid: Global Thread ID
5262// tdg: ID of the TDG
5263void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5264 KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
5265 KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
5266 tdg->tdg_id, tdg->num_roots));
5267 kmp_node_info_t *this_record_map = tdg->record_map;
5268 kmp_int32 *this_root_tasks = tdg->root_tasks;
5269 kmp_int32 this_num_roots = tdg->num_roots;
5270 kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5271
5272 kmp_info_t *thread = __kmp_threads[gtid];
5273 kmp_taskdata_t *parent_task = thread->th.th_current_task;
5274
5275 if (tdg->rec_taskred_data) {
5276 __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
5277 }
5278
5279 for (kmp_int32 j = 0; j < this_num_tasks; j++) {
5280 kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
5281
5282 td->td_parent = parent_task;
5283 this_record_map[j].parent_task = parent_task;
5284
5285 kmp_taskgroup_t *parent_taskgroup =
5286 this_record_map[j].parent_task->td_taskgroup;
5287
5288 KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
5289 this_record_map[j].npredecessors);
5290 KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
5291
5292 if (parent_taskgroup) {
5293 KMP_ATOMIC_INC(&parent_taskgroup->count);
5294 // The taskgroup is different so we must update it
5295 td->td_taskgroup = parent_taskgroup;
5296 } else if (td->td_taskgroup != nullptr) {
5297 // If the parent doesnt have a taskgroup, remove it from the task
5298 td->td_taskgroup = nullptr;
5299 }
5300 if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
5301 KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
5302 }
5303
5304 for (kmp_int32 j = 0; j < this_num_roots; ++j) {
5305 __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
5306 }
5307 KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
5308 tdg->tdg_id, tdg->num_roots));
5309}
5310
5311// __kmp_start_record: set up a TDG structure and turn the
5312// recording flag to true
5313// gtid: Global Thread ID of the encountering thread
5314// input_flags: Flags associated with the TDG
5315// tdg_id: ID of the TDG to record
5316static inline void __kmp_start_record(kmp_int32 gtid,
5317 kmp_taskgraph_flags_t *flags,
5318 kmp_int32 tdg_id) {
5319 kmp_tdg_info_t *tdg =
5320 (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
5321 __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
5322 // Initializing the TDG structure
5323 tdg->tdg_id = tdg_id;
5324 tdg->map_size = INIT_MAPSIZE;
5325 tdg->num_roots = -1;
5326 tdg->root_tasks = nullptr;
5327 tdg->tdg_status = KMP_TDG_RECORDING;
5328 tdg->rec_num_taskred = 0;
5329 tdg->rec_taskred_data = nullptr;
5330 KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
5331
5332 // Initializing the list of nodes in this TDG
5333 kmp_node_info_t *this_record_map =
5334 (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
5335 for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
5336 kmp_int32 *successorsList =
5337 (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
5338 this_record_map[i].task = nullptr;
5339 this_record_map[i].successors = successorsList;
5340 this_record_map[i].nsuccessors = 0;
5341 this_record_map[i].npredecessors = 0;
5342 this_record_map[i].successors_size = __kmp_successors_size;
5343 KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
5344 }
5345
5346 __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
5347}
5348
5349// __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
5350// the beginning of the record process of a task region
5351// loc_ref: Location of TDG, not used yet
5352// gtid: Global Thread ID of the encountering thread
5353// input_flags: Flags associated with the TDG
5354// tdg_id: ID of the TDG to record, for now, incremental integer
5355// returns: 1 if we record, otherwise, 0
5356kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
5357 kmp_int32 input_flags, kmp_int32 tdg_id) {
5358
5359 kmp_int32 res;
5360 kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
5361 KA_TRACE(10,
5362 ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
5363 gtid, loc_ref, input_flags, tdg_id));
5364
5365 if (__kmp_max_tdgs == 0) {
5366 KA_TRACE(
5367 10,
5368 ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
5369 "__kmp_max_tdgs = 0\n",
5370 gtid, loc_ref, input_flags, tdg_id));
5371 return 1;
5372 }
5373
5374 __kmpc_taskgroup(loc_ref, gtid);
5375 if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
5376 // TODO: use re_record flag
5377 __kmp_exec_tdg(gtid, tdg);
5378 res = 0;
5379 } else {
5380 __kmp_curr_tdg_idx = tdg_id;
5381 KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
5382 __kmp_start_record(gtid, flags, tdg_id);
5383 __kmp_num_tdg++;
5384 res = 1;
5385 }
5386 KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
5387 gtid, tdg_id, res ? "record" : "execute"));
5388 return res;
5389}
5390
5391// __kmp_end_record: set up a TDG after recording it
5392// gtid: Global thread ID
5393// tdg: Pointer to the TDG
5394void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5395 // Store roots
5396 kmp_node_info_t *this_record_map = tdg->record_map;
5397 kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5398 kmp_int32 *this_root_tasks =
5399 (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
5400 kmp_int32 this_map_size = tdg->map_size;
5401 kmp_int32 this_num_roots = 0;
5402 kmp_info_t *thread = __kmp_threads[gtid];
5403
5404 for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5405 if (this_record_map[i].npredecessors == 0) {
5406 this_root_tasks[this_num_roots++] = i;
5407 }
5408 }
5409
5410 // Update with roots info and mapsize
5411 tdg->map_size = this_map_size;
5412 tdg->num_roots = this_num_roots;
5413 tdg->root_tasks = this_root_tasks;
5414 KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
5415 tdg->tdg_status = KMP_TDG_READY;
5416
5417 if (thread->th.th_current_task->td_dephash) {
5418 __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
5419 thread->th.th_current_task->td_dephash = NULL;
5420 }
5421
5422 // Reset predecessor counter
5423 for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5424 KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
5425 this_record_map[i].npredecessors);
5426 }
5427 KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
5428
5429 if (__kmp_tdg_dot)
5430 __kmp_print_tdg_dot(tdg, gtid);
5431}
5432
5433// __kmpc_end_record_task: wrapper around __kmp_end_record to mark
5434// the end of recording phase
5435//
5436// loc_ref: Source location information
5437// gtid: Global thread ID
5438// input_flags: Flags attached to the graph
5439// tdg_id: ID of the TDG just finished recording
5440void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
5441 kmp_int32 input_flags, kmp_int32 tdg_id) {
5442 kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
5443
5444 KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
5445 " tdg=%d with flags=%d\n",
5446 gtid, loc_ref, tdg_id, input_flags));
5447 if (__kmp_max_tdgs) {
5448 // TODO: use input_flags->nowait
5449 __kmpc_end_taskgroup(loc_ref, gtid);
5450 if (__kmp_tdg_is_recording(tdg->tdg_status))
5451 __kmp_end_record(gtid, tdg);
5452 }
5453 KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
5454 " tdg=%d, its status is now READY\n",
5455 gtid, loc_ref, tdg_id));
5456}
5457#endif
struct kmp_taskred_data kmp_taskred_data_t
struct kmp_task_red_input kmp_task_red_input_t
struct kmp_taskred_flags kmp_taskred_flags_t
struct kmp_taskred_input kmp_taskred_input_t
#define KMP_COUNT_BLOCK(name)
Increments specified counter (name).
Definition kmp_stats.h:911
void * __kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data)
void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int sched, kmp_uint64 grainsize, void *task_dup)
void * __kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, int num, void *data)
void * __kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num, void *data)
bool __kmpc_omp_has_task_team(kmp_int32 gtid)
void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask)
void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws)
kmp_int32 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 naffins, kmp_task_affinity_info_t *affin_list)
void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int sched, kmp_uint64 grainsize, int modifier, void *task_dup)
void * __kmpc_task_reduction_init(int gtid, int num, void *data)
void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask)
void * __kmpc_taskred_init(int gtid, int num, void *data)
void ** __kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid)
Definition kmp.h:227
kmp_taskred_flags_t flags
kmp_taskred_flags_t flags
kmp_taskred_flags_t flags