LLVM OpenMP
kmp_tasking.cpp
Go to the documentation of this file.
1/*
2 * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_i18n.h"
15#include "kmp_itt.h"
16#include "kmp_stats.h"
17#include "kmp_wait_release.h"
18#include "kmp_taskdeps.h"
19
20#if OMPT_SUPPORT
21#include "ompt-specific.h"
22#endif
23
#if ENABLE_LIBOMPTARGET
// Entry point into libomptarget used to query/progress deferred ("nowait")
// target tasks; resolved at runtime via KMP_DLSYM.
static void (*tgt_target_nowait_query)(void **);

// __kmp_init_target_task: resolve __tgt_target_nowait_query from the
// dynamically loaded libomptarget. The store through (void **) avoids
// pedantic warnings about converting object pointers to function pointers.
void __kmp_init_target_task() {
  *(void **)(&tgt_target_nowait_query) = KMP_DLSYM("__tgt_target_nowait_query");
}
#endif
31
32/* forward declaration */
33static void __kmp_enable_tasking(kmp_task_team_t *task_team,
34 kmp_info_t *this_thr);
35static void __kmp_alloc_task_deque(kmp_info_t *thread,
36 kmp_thread_data_t *thread_data);
38 kmp_task_team_t *task_team);
40#if OMPX_TASKGRAPH
41static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
42int __kmp_taskloop_task(int gtid, void *ptask);
43#endif
44
45// returns 1 if new task is allowed to execute, 0 otherwise
46// checks Task Scheduling constraint (if requested) and
47// mutexinoutset dependencies if any
48static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
49 const kmp_taskdata_t *tasknew,
50 const kmp_taskdata_t *taskcurr) {
51 if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
52 // Check if the candidate obeys the Task Scheduling Constraints (TSC)
53 // only descendant of all deferred tied tasks can be scheduled, checking
54 // the last one is enough, as it in turn is the descendant of all others
55 kmp_taskdata_t *current = taskcurr->td_last_tied;
56 KMP_DEBUG_ASSERT(current != NULL);
57 // check if the task is not suspended on barrier
58 if (current->td_flags.tasktype == TASK_EXPLICIT ||
59 current->td_taskwait_thread > 0) { // <= 0 on barrier
60 kmp_int32 level = current->td_level;
62 while (parent != current && parent->td_level > level) {
63 // check generation up to the level of the current task
64 parent = parent->td_parent;
65 KMP_DEBUG_ASSERT(parent != NULL);
66 }
67 if (parent != current)
68 return false;
69 }
70 }
71 // Check mutexinoutset dependencies, acquire locks
72 kmp_depnode_t *node = tasknew->td_depnode;
73#if OMPX_TASKGRAPH
74 if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
75#else
76 if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
77#endif
78 for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
79 KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
80 if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
81 continue;
82 // could not get the lock, release previous locks
83 for (int j = i - 1; j >= 0; --j)
84 __kmp_release_lock(node->dn.mtx_locks[j], gtid);
85 return false;
86 }
87 // negative num_locks means all locks acquired successfully
88 node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
89 }
90 return true;
91}
92
93// __kmp_realloc_task_deque:
94// Re-allocates a task deque for a particular thread, copies the content from
95// the old deque and adjusts the necessary data structures relating to the
96// deque. This operation must be done with the deque_lock being held
98 kmp_thread_data_t *thread_data) {
99 kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
100 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
101 kmp_int32 new_size = 2 * size;
102
103 KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
104 "%d] for thread_data %p\n",
105 __kmp_gtid_from_thread(thread), size, new_size, thread_data));
106
107 kmp_taskdata_t **new_deque =
109
110 int i, j;
111 for (i = thread_data->td.td_deque_head, j = 0; j < size;
112 i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
113 new_deque[j] = thread_data->td.td_deque[i];
114
115 __kmp_free(thread_data->td.td_deque);
116
117 thread_data->td.td_deque_head = 0;
118 thread_data->td.td_deque_tail = size;
119 thread_data->td.td_deque = new_deque;
120 thread_data->td.td_deque_size = new_size;
121}
122
125 kmp_thread_data_t *thread_data = &l->td;
126 __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
127 thread_data->td.td_deque_last_stolen = -1;
128 KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
129 "for thread_data %p\n",
130 __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
131 thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
133 thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
134 return l;
135}
136
137// The function finds the deque of priority tasks with given priority, or
138// allocates a new deque and put it into sorted (high -> low) list of deques.
139// Deques of non-default priority tasks are shared between all threads in team,
140// as opposed to per-thread deques of tasks with default priority.
141// The function is called under the lock task_team->tt.tt_task_pri_lock.
142static kmp_thread_data_t *
144 kmp_thread_data_t *thread_data;
145 kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
146 if (lst->priority == pri) {
147 // Found queue of tasks with given priority.
148 thread_data = &lst->td;
149 } else if (lst->priority < pri) {
150 // All current priority queues contain tasks with lower priority.
151 // Allocate new one for given priority tasks.
153 thread_data = &list->td;
154 list->priority = pri;
155 list->next = lst;
156 task_team->tt.tt_task_pri_list = list;
157 } else { // task_team->tt.tt_task_pri_list->priority > pri
158 kmp_task_pri_t *next_queue = lst->next;
159 while (next_queue && next_queue->priority > pri) {
160 lst = next_queue;
161 next_queue = lst->next;
162 }
163 // lst->priority > pri && (next == NULL || pri >= next->priority)
164 if (next_queue == NULL) {
165 // No queue with pri priority, need to allocate new one.
167 thread_data = &list->td;
168 list->priority = pri;
169 list->next = NULL;
170 lst->next = list;
171 } else if (next_queue->priority == pri) {
172 // Found queue of tasks with given priority.
173 thread_data = &next_queue->td;
174 } else { // lst->priority > pri > next->priority
175 // insert newly allocated between existed queues
177 thread_data = &list->td;
178 list->priority = pri;
179 list->next = next_queue;
180 lst->next = list;
181 }
182 }
183 return thread_data;
184}
185
186// __kmp_push_priority_task: Add a task to the team's priority task deque
188 kmp_taskdata_t *taskdata,
189 kmp_task_team_t *task_team,
190 kmp_int32 pri) {
191 kmp_thread_data_t *thread_data = NULL;
192 KA_TRACE(20,
193 ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
194 gtid, taskdata, pri));
195
196 // Find task queue specific to priority value
197 kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
198 if (UNLIKELY(lst == NULL)) {
200 if (task_team->tt.tt_task_pri_list == NULL) {
201 // List of queues is still empty, allocate one.
203 thread_data = &list->td;
204 list->priority = pri;
205 list->next = NULL;
206 task_team->tt.tt_task_pri_list = list;
207 } else {
208 // Other thread initialized a queue. Check if it fits and get thread_data.
209 thread_data = __kmp_get_priority_deque_data(task_team, pri);
210 }
212 } else {
213 if (lst->priority == pri) {
214 // Found queue of tasks with given priority.
215 thread_data = &lst->td;
216 } else {
218 thread_data = __kmp_get_priority_deque_data(task_team, pri);
220 }
221 }
222 KMP_DEBUG_ASSERT(thread_data);
223
224 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
225 // Check if deque is full
226 if (TCR_4(thread_data->td.td_deque_ntasks) >=
227 TASK_DEQUE_SIZE(thread_data->td)) {
230 thread->th.th_current_task)) {
231 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
232 KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
233 "TASK_NOT_PUSHED for task %p\n",
234 gtid, taskdata));
235 return TASK_NOT_PUSHED;
236 } else {
237 // expand deque to push the task which is not allowed to execute
238 __kmp_realloc_task_deque(thread, thread_data);
239 }
240 }
241 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
242 TASK_DEQUE_SIZE(thread_data->td));
243 // Push taskdata.
244 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
245 // Wrap index.
246 thread_data->td.td_deque_tail =
247 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
248 TCW_4(thread_data->td.td_deque_ntasks,
249 TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
250 KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
251 KMP_FSYNC_RELEASING(taskdata); // releasing child
252 KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
253 "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
254 gtid, taskdata, thread_data->td.td_deque_ntasks,
255 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
256 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
257 task_team->tt.tt_num_task_pri++; // atomic inc
259}
260
261// __kmp_push_task: Add a task to the thread's deque
263 kmp_info_t *thread = __kmp_threads[gtid];
265
266 // If we encounter a hidden helper task, and the current thread is not a
267 // hidden helper thread, we have to give the task to any hidden helper thread
268 // starting from its shadow one.
269 if (UNLIKELY(taskdata->td_flags.hidden_helper &&
270 !KMP_HIDDEN_HELPER_THREAD(gtid))) {
271 kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
273 // Signal the hidden helper threads.
276 }
277
278 kmp_task_team_t *task_team = thread->th.th_task_team;
279 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
280 kmp_thread_data_t *thread_data;
281
282 KA_TRACE(20,
283 ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
284
285 if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
286 // untied task needs to increment counter so that the task structure is not
287 // freed prematurely
290 KA_TRACE(
291 20,
292 ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
293 gtid, counter, taskdata));
294 }
295
296 // The first check avoids building task_team thread data if serialized
297 if (UNLIKELY(taskdata->td_flags.task_serial)) {
298 KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
299 "TASK_NOT_PUSHED for task %p\n",
300 gtid, taskdata));
301 return TASK_NOT_PUSHED;
302 }
303
304 // Now that serialized tasks have returned, we can assume that we are not in
305 // immediate exec mode
307 if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
308 __kmp_enable_tasking(task_team, thread);
309 }
311 KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
312
313 if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
315 int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
316 return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
317 }
318
319 // Find tasking deque specific to encountering thread
320 thread_data = &task_team->tt.tt_threads_data[tid];
321
322 // No lock needed since only owner can allocate. If the task is hidden_helper,
323 // we don't need it either because we have initialized the dequeue for hidden
324 // helper thread data.
325 if (UNLIKELY(thread_data->td.td_deque == NULL)) {
326 __kmp_alloc_task_deque(thread, thread_data);
327 }
328
329 int locked = 0;
330 // Check if deque is full
331 if (TCR_4(thread_data->td.td_deque_ntasks) >=
332 TASK_DEQUE_SIZE(thread_data->td)) {
335 thread->th.th_current_task)) {
336 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
337 "TASK_NOT_PUSHED for task %p\n",
338 gtid, taskdata));
339 return TASK_NOT_PUSHED;
340 } else {
341 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
342 locked = 1;
343 if (TCR_4(thread_data->td.td_deque_ntasks) >=
344 TASK_DEQUE_SIZE(thread_data->td)) {
345 // expand deque to push the task which is not allowed to execute
346 __kmp_realloc_task_deque(thread, thread_data);
347 }
348 }
349 }
350 // Lock the deque for the task push operation
351 if (!locked) {
352 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
353 // Need to recheck as we can get a proxy task from thread outside of OpenMP
354 if (TCR_4(thread_data->td.td_deque_ntasks) >=
355 TASK_DEQUE_SIZE(thread_data->td)) {
358 thread->th.th_current_task)) {
359 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
360 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
361 "returning TASK_NOT_PUSHED for task %p\n",
362 gtid, taskdata));
363 return TASK_NOT_PUSHED;
364 } else {
365 // expand deque to push the task which is not allowed to execute
366 __kmp_realloc_task_deque(thread, thread_data);
367 }
368 }
369 }
370 // Must have room since no thread can add tasks but calling thread
371 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
372 TASK_DEQUE_SIZE(thread_data->td));
373
374 thread_data->td.td_deque[thread_data->td.td_deque_tail] =
375 taskdata; // Push taskdata
376 // Wrap index.
377 thread_data->td.td_deque_tail =
378 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
379 TCW_4(thread_data->td.td_deque_ntasks,
380 TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
381 KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
382 KMP_FSYNC_RELEASING(taskdata); // releasing child
383 KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
384 "task=%p ntasks=%d head=%u tail=%u\n",
385 gtid, taskdata, thread_data->td.td_deque_ntasks,
386 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
387
388 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
389
391}
392
393// __kmp_pop_current_task_from_thread: set up current task from called thread
394// when team ends
395//
396// this_thr: thread structure to set current_task in.
398 KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
399 "this_thread=%p, curtask=%p, "
400 "curtask_parent=%p\n",
401 0, this_thr, this_thr->th.th_current_task,
402 this_thr->th.th_current_task->td_parent));
403
404 this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
405
406 KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
407 "this_thread=%p, curtask=%p, "
408 "curtask_parent=%p\n",
409 0, this_thr, this_thr->th.th_current_task,
410 this_thr->th.th_current_task->td_parent));
411}
412
413// __kmp_push_current_task_to_thread: set up current task in called thread for a
414// new team
415//
416// this_thr: thread structure to set up
417// team: team for implicit task data
418// tid: thread within team to set up
420 int tid) {
421 // current task of the thread is a parent of the new just created implicit
422 // tasks of new team
423 KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
424 "curtask=%p "
425 "parent_task=%p\n",
426 tid, this_thr, this_thr->th.th_current_task,
427 team->t.t_implicit_task_taskdata[tid].td_parent));
428
429 KMP_DEBUG_ASSERT(this_thr != NULL);
430
431 if (tid == 0) {
432 if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
433 team->t.t_implicit_task_taskdata[0].td_parent =
434 this_thr->th.th_current_task;
435 this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
436 }
437 } else {
438 team->t.t_implicit_task_taskdata[tid].td_parent =
439 team->t.t_implicit_task_taskdata[0].td_parent;
440 this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
441 }
442
443 KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
444 "curtask=%p "
445 "parent_task=%p\n",
446 tid, this_thr, this_thr->th.th_current_task,
447 team->t.t_implicit_task_taskdata[tid].td_parent));
448}
449
450// __kmp_task_start: bookkeeping for a task starting execution
451//
452// GTID: global thread id of calling thread
453// task: task starting execution
454// current_task: task suspending
456 kmp_taskdata_t *current_task) {
458 kmp_info_t *thread = __kmp_threads[gtid];
459
460 KA_TRACE(10,
461 ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
462 gtid, taskdata, current_task));
463
465
466 // mark currently executing task as suspended
467 // TODO: GEH - make sure root team implicit task is initialized properly.
468 // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
469 current_task->td_flags.executing = 0;
470
471 // mark starting task as executing and as current task
472 thread->th.th_current_task = taskdata;
473
474 KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
475 taskdata->td_flags.tiedness == TASK_UNTIED);
476 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
477 taskdata->td_flags.tiedness == TASK_UNTIED);
478 taskdata->td_flags.started = 1;
479 taskdata->td_flags.executing = 1;
480 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
481 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
482
483 // GEH TODO: shouldn't we pass some sort of location identifier here?
484 // APT: yes, we will pass location here.
485 // need to store current thread state (in a thread or taskdata structure)
486 // before setting work_state, otherwise wrong state is set after end of task
487
488 KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
489
490 return;
491}
492
493#if OMPT_SUPPORT
494//------------------------------------------------------------------------------
495
496// __ompt_task_start:
497// Build and trigger task-begin event
498static inline void __ompt_task_start(kmp_task_t *task,
499 kmp_taskdata_t *current_task,
500 kmp_int32 gtid) {
502 ompt_task_status_t status = ompt_task_switch;
503 if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
504 status = ompt_task_yield;
505 __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
506 }
507 /* let OMPT know that we're about to run this task */
508 if (ompt_enabled.ompt_callback_task_schedule) {
509 ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
510 &(current_task->ompt_task_info.task_data), status,
511 &(taskdata->ompt_task_info.task_data));
512 }
513 taskdata->ompt_task_info.scheduling_parent = current_task;
514}
515
516// __ompt_task_finish:
517// Build and trigger final task-schedule event
518static inline void __ompt_task_finish(kmp_task_t *task,
519 kmp_taskdata_t *resumed_task,
520 ompt_task_status_t status) {
521 if (ompt_enabled.ompt_callback_task_schedule) {
523 if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
525 status = ompt_task_cancel;
526 }
527
528 /* let OMPT know that we're returning to the callee task */
529 ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
530 &(taskdata->ompt_task_info.task_data), status,
531 (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
532 }
533}
534#endif
535
536template <bool ompt>
539 void *frame_address,
540 void *return_address) {
542 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
543
544 KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
545 "current_task=%p\n",
546 gtid, loc_ref, taskdata, current_task));
547
548 if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
549 // untied task needs to increment counter so that the task structure is not
550 // freed prematurely
553 KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
554 "incremented for task %p\n",
555 gtid, counter, taskdata));
556 }
557
558 taskdata->td_flags.task_serial =
559 1; // Execute this task immediately, not deferred.
560 __kmp_task_start(gtid, task, current_task);
561
562#if OMPT_SUPPORT
563 if (ompt) {
564 if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
565 current_task->ompt_task_info.frame.enter_frame.ptr =
566 taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
567 current_task->ompt_task_info.frame.enter_frame_flags =
568 taskdata->ompt_task_info.frame.exit_frame_flags =
570 }
571 if (ompt_enabled.ompt_callback_task_create) {
572 ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
573 ompt_callbacks.ompt_callback(ompt_callback_task_create)(
574 &(parent_info->task_data), &(parent_info->frame),
575 &(taskdata->ompt_task_info.task_data),
576 TASK_TYPE_DETAILS_FORMAT(taskdata), 0, return_address);
577 }
578 __ompt_task_start(task, current_task, gtid);
579 }
580#endif // OMPT_SUPPORT
581
582 KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
583 loc_ref, taskdata));
584}
585
#if OMPT_SUPPORT
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}
#endif // OMPT_SUPPORT
596
597// __kmpc_omp_task_begin_if0: report that a given serialized task has started
598// execution
599//
600// loc_ref: source location information; points to beginning of task block.
601// gtid: global thread number.
602// task: task thunk for the started task.
603#ifdef __s390x__
604// This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x.
605// In order for it to work correctly, the caller also needs to be compiled with
606// backchain. If a caller is compiled without backchain,
607// OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not
608// crash.
609__attribute__((target("backchain")))
610#endif
612 kmp_task_t *task) {
613#if OMPT_SUPPORT
614 if (UNLIKELY(ompt_enabled.enabled)) {
615 OMPT_STORE_RETURN_ADDRESS(gtid);
616 __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
618 OMPT_LOAD_RETURN_ADDRESS(gtid));
619 return;
620 }
621#endif
622 __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
623}
624
#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED
643
644// __kmp_free_task: free the current task space and the space for shareds
645//
646// gtid: Global thread ID of calling thread
647// taskdata: task to free
648// thread: thread data structure of caller
649static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
650 kmp_info_t *thread) {
651 KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
652 taskdata));
653
654 // Check to make sure all flags and counters have the correct values
656 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
657 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
658 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
660 taskdata->td_flags.task_serial == 1);
663 // Clear data to not be re-used later by mistake.
664 task->data1.destructors = NULL;
665 task->data2.priority = 0;
666
667 taskdata->td_flags.freed = 1;
668#if OMPX_TASKGRAPH
669 // do not free tasks in taskgraph
670 if (!taskdata->is_taskgraph) {
671#endif
672// deallocate the taskdata and shared variable blocks associated with this task
673#if USE_FAST_MEMORY
674 __kmp_fast_free(thread, taskdata);
675#else /* ! USE_FAST_MEMORY */
676 __kmp_thread_free(thread, taskdata);
677#endif
678#if OMPX_TASKGRAPH
679 } else {
680 taskdata->td_flags.complete = 0;
681 taskdata->td_flags.started = 0;
682 taskdata->td_flags.freed = 0;
683 taskdata->td_flags.executing = 0;
684 taskdata->td_flags.task_serial =
685 (taskdata->td_parent->td_flags.final ||
686 taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);
687
688 // taskdata->td_allow_completion_event.pending_events_count = 1;
689 KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
691 // start at one because counts current task and children
693 }
694#endif
695
696 KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
697}
698
699// __kmp_free_task_and_ancestors: free the current task and ancestors without
700// children
701//
702// gtid: Global thread ID of calling thread
703// taskdata: task to free
704// thread: thread data structure of caller
706 kmp_taskdata_t *taskdata,
707 kmp_info_t *thread) {
708 // Proxy tasks must always be allowed to free their parents
709 // because they can be run in background even in serial mode.
710 kmp_int32 team_serial =
711 (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
712 !taskdata->td_flags.proxy;
714
715 kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
716 KMP_DEBUG_ASSERT(children >= 0);
717
718 // Now, go up the ancestor tree to see if any ancestors can now be freed.
719 while (children == 0) {
720 kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
721
722 KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
723 "and freeing itself\n",
724 gtid, taskdata));
725
726 // --- Deallocate my ancestor task ---
727 __kmp_free_task(gtid, taskdata, thread);
728
729 taskdata = parent_taskdata;
730
731 if (team_serial)
732 return;
733 // Stop checking ancestors at implicit task instead of walking up ancestor
734 // tree to avoid premature deallocation of ancestors.
735 if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
736 if (taskdata->td_dephash) { // do we need to cleanup dephash?
737 int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
738 kmp_tasking_flags_t flags_old = taskdata->td_flags;
739 if (children == 0 && flags_old.complete == 1) {
740 kmp_tasking_flags_t flags_new = flags_old;
741 flags_new.complete = 0;
743 RCAST(kmp_int32 *, &taskdata->td_flags),
744 *RCAST(kmp_int32 *, &flags_old),
745 *RCAST(kmp_int32 *, &flags_new))) {
746 KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
747 "dephash of implicit task %p\n",
748 gtid, taskdata));
749 // cleanup dephash of finished implicit task
750 __kmp_dephash_free_entries(thread, taskdata->td_dephash);
751 }
752 }
753 }
754 return;
755 }
756 // Predecrement simulated by "- 1" calculation
757 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
758 KMP_DEBUG_ASSERT(children >= 0);
759 }
760
761 KA_TRACE(
762 20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
763 "not freeing it yet\n",
764 gtid, taskdata, children));
765}
766
767// Only need to keep track of child task counts if any of the following:
768// 1. team parallel and tasking not serialized;
769// 2. it is a proxy or detachable or hidden helper task
770// 3. the children counter of its parent task is greater than 0.
771// The reason for the 3rd one is for serialized team that found detached task,
772// hidden helper task, T. In this case, the execution of T is still deferred,
773// and it is also possible that a regular task depends on T. In this case, if we
774// don't track the children, task synchronization will be broken.
776 kmp_tasking_flags_t flags = taskdata->td_flags;
777 bool ret = !(flags.team_serial || flags.tasking_ser);
778 ret = ret || flags.proxy == TASK_PROXY ||
779 flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
780 ret = ret ||
782#if OMPX_TASKGRAPH
783 if (taskdata->td_taskgroup && taskdata->is_taskgraph)
784 ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
785#endif
786 return ret;
787}
788
789// __kmp_task_finish: bookkeeping to do when a task finishes execution
790//
791// gtid: global thread ID for calling thread
792// task: task to be finished
793// resumed_task: task to be resumed. (may be NULL if task is serialized)
794//
795// template<ompt>: effectively ompt_enabled.enabled!=0
796// the version with ompt=false is inlined, allowing to optimize away all ompt
797// code in this case
798template <bool ompt>
800 kmp_taskdata_t *resumed_task) {
802 kmp_info_t *thread = __kmp_threads[gtid];
803 kmp_task_team_t *task_team =
804 thread->th.th_task_team; // might be NULL for serial teams...
805#if OMPX_TASKGRAPH
806 // to avoid seg fault when we need to access taskdata->td_flags after free when using vanilla taskloop
807 bool is_taskgraph;
808#endif
809#if KMP_DEBUG
810 kmp_int32 children = 0;
811#endif
812 KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
813 "task %p\n",
814 gtid, taskdata, resumed_task));
815
817
818#if OMPX_TASKGRAPH
819 is_taskgraph = taskdata->is_taskgraph;
820#endif
821
822 if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
823 // untied task needs to check the counter so that the task structure is not
824 // freed prematurely
826 KA_TRACE(
827 20,
828 ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
829 gtid, counter, taskdata));
830 if (counter > 0) {
831 // untied task is not done, to be continued possibly by other thread, do
832 // not free it now
833 if (resumed_task == NULL) {
835 resumed_task = taskdata->td_parent; // In a serialized task, the resumed
836 // task is the parent
837 }
838 thread->th.th_current_task = resumed_task; // restore current_task
839 resumed_task->td_flags.executing = 1; // resume previous task
840 KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
841 "resuming task %p\n",
842 gtid, taskdata, resumed_task));
843 return;
844 }
845 }
846
847 // bookkeeping for resuming task:
848 // GEH - note tasking_ser => task_serial
850 (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
851 taskdata->td_flags.task_serial);
852 if (taskdata->td_flags.task_serial) {
853 if (resumed_task == NULL) {
854 resumed_task = taskdata->td_parent; // In a serialized task, the resumed
855 // task is the parent
856 }
857 } else {
858 KMP_DEBUG_ASSERT(resumed_task !=
859 NULL); // verify that resumed task is passed as argument
860 }
861
862 /* If the tasks' destructor thunk flag has been set, we need to invoke the
863 destructor thunk that has been generated by the compiler. The code is
864 placed here, since at this point other tasks might have been released
865 hence overlapping the destructor invocations with some other work in the
866 released tasks. The OpenMP spec is not specific on when the destructors
867 are invoked, so we should be free to choose. */
868 if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
869 kmp_routine_entry_t destr_thunk = task->data1.destructors;
870 KMP_ASSERT(destr_thunk);
871 destr_thunk(gtid, task);
872 }
873
874 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
875 KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
876 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
877
878 bool completed = true;
879 if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
880 if (taskdata->td_allow_completion_event.type ==
882 // event hasn't been fulfilled yet. Try to detach task.
884 if (taskdata->td_allow_completion_event.type ==
886 // task finished execution
887 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
888 taskdata->td_flags.executing = 0; // suspend the finishing task
889
890#if OMPT_SUPPORT
891 // For a detached task, which is not completed, we switch back
892 // the omp_fulfill_event signals completion
893 // locking is necessary to avoid a race with ompt_task_late_fulfill
894 if (ompt)
895 __ompt_task_finish(task, resumed_task, ompt_task_detach);
896#endif
897
898 // no access to taskdata after this point!
899 // __kmp_fulfill_event might free taskdata at any time from now
900
901 taskdata->td_flags.proxy = TASK_PROXY; // proxify!
902 completed = false;
903 }
905 }
906 }
907
908 // Tasks with valid target async handles must be re-enqueued.
909 if (taskdata->td_target_data.async_handle != NULL) {
910 // Note: no need to translate gtid to its shadow. If the current thread is a
911 // hidden helper one, then the gtid is already correct. Otherwise, hidden
912 // helper threads are disabled, and gtid refers to a OpenMP thread.
913#if OMPT_SUPPORT
914 if (ompt) {
915 __ompt_task_finish(task, resumed_task, ompt_task_switch);
916 }
917#endif
919 if (KMP_HIDDEN_HELPER_THREAD(gtid))
921 completed = false;
922 }
923
924 if (completed) {
925 taskdata->td_flags.complete = 1; // mark the task as completed
926#if OMPX_TASKGRAPH
927 taskdata->td_flags.onced = 1; // mark the task as ran once already
928#endif
929
930#if OMPT_SUPPORT
931 // This is not a detached task, we are done here
932 if (ompt)
933 __ompt_task_finish(task, resumed_task, ompt_task_complete);
934#endif
935 // TODO: What would be the balance between the conditions in the function
936 // and an atomic operation?
937 if (__kmp_track_children_task(taskdata)) {
938 __kmp_release_deps(gtid, taskdata);
939 // Predecrement simulated by "- 1" calculation
940#if KMP_DEBUG
941 children = -1 +
942#endif
944 KMP_DEBUG_ASSERT(children >= 0);
945#if OMPX_TASKGRAPH
946 if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
947#else
948 if (taskdata->td_taskgroup)
949#endif
950 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
951 } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
953 // if we found proxy or hidden helper tasks there could exist a dependency
954 // chain with the proxy task as origin
955 __kmp_release_deps(gtid, taskdata);
956 }
957 // td_flags.executing must be marked as 0 after __kmp_release_deps has been
958 // called. Othertwise, if a task is executed immediately from the
959 // release_deps code, the flag will be reset to 1 again by this same
960 // function
961 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
962 taskdata->td_flags.executing = 0; // suspend the finishing task
963
964 // Decrement the counter of hidden helper tasks to be executed.
965 if (taskdata->td_flags.hidden_helper) {
966 // Hidden helper tasks can only be executed by hidden helper threads.
969 }
970 }
971
972 KA_TRACE(
973 20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
974 gtid, taskdata, children));
975
976 // Free this task and then ancestor tasks if they have no children.
977 // Restore th_current_task first as suggested by John:
978 // johnmc: if an asynchronous inquiry peers into the runtime system
979 // it doesn't see the freed task as the current task.
980 thread->th.th_current_task = resumed_task;
981 if (completed)
982 __kmp_free_task_and_ancestors(gtid, taskdata, thread);
983
984 // TODO: GEH - make sure root team implicit task is initialized properly.
985 // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
986 resumed_task->td_flags.executing = 1; // resume previous task
987
988#if OMPX_TASKGRAPH
989 if (is_taskgraph && __kmp_track_children_task(taskdata) &&
990 taskdata->td_taskgroup) {
991 // TDG: we only release taskgroup barrier here because
992 // free_task_and_ancestors will call
993 // __kmp_free_task, which resets all task parameters such as
994 // taskdata->started, etc. If we release the barrier earlier, these
995 // parameters could be read before being reset. This is not an issue for
996 // non-TDG implementation because we never reuse a task(data) structure
997 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
998 }
999#endif
1000
1001 KA_TRACE(
1002 10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
1003 gtid, taskdata, resumed_task));
1004
1005 return;
1006}
1007
// __kmpc_omp_task_complete_if0_template: report completion of an if0 task,
// templated over whether OMPT frame bookkeeping is required (the <true>
// instantiation additionally clears the task's OMPT enter frame on exit).
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
template <bool ompt>
// NOTE(review): the line carrying the function name and first parameter was
// lost in extraction; only the remaining parameter lines survive below.
// Confirm the exact signature against upstream kmp_tasking.cpp.
                                          kmp_int32 gtid,
                                          kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  KMP_DEBUG_ASSERT(gtid >= 0);
  // this routine will provide task to resume
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (ompt) {
    // Reset the current task's enter frame so the tool no longer sees this
    // runtime entry point on the task's frame stack.
    ompt_frame_t *ompt_frame;
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame = ompt_data_none;
    ompt_frame->enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
  }
#endif

  return;
}
1032
#if OMPT_SUPPORT
// __kmpc_omp_task_complete_if0_ompt: OMPT-enabled entry point for reporting
// if0 task completion; only compiled when tool support is built in.
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  // NOTE(review): the body line was lost in extraction -- presumably a
  // forward to __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid,
  // task); confirm against upstream kmp_tasking.cpp.
}
#endif // OMPT_SUPPORT
1040
// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
// NOTE(review): the first signature line was lost in extraction; only the
// final parameter line survives below. Confirm against upstream.
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  // When a tool is attached, dispatch to the OMPT-enabled entry point above.
  if (UNLIKELY(ompt_enabled.enabled)) {
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
    return;
  }
#endif
  // NOTE(review): the non-OMPT forwarding call (presumably to the <false>
  // template instantiation) was lost in extraction.
}
1056
#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  // NOTE(review): the call line (presumably __kmp_task_finish with a NULL
  // resumed task) was lost in extraction; only its trailing argument remains.
                              NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED
1073
// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref: reference to source location of parallel region
// this_thr: thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVS. This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref. Value passed in may be NULL
// NOTE(review): the first signature line was lost in extraction; only the
// trailing parameters survive below. Confirm against upstream.
                              kmp_team_t *team, int tid, int set_curr_task) {
  // The implicit task lives inside the team structure, indexed by thread id.
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));

  task->td_task_id = KMP_GEN_TASK_ID();
  task->td_team = team;
  // task->td_parent = NULL;  // fix for CQ230101 (broken parent task info
  // in debugger)
  task->td_ident = loc_ref;
  task->td_taskwait_ident = NULL;
  task->td_taskwait_counter = 0;
  task->td_taskwait_thread = 0;

  // Implicit tasks are tied, non-proxy tasks by construction.
  task->td_flags.tiedness = TASK_TIED;
  task->td_flags.tasktype = TASK_IMPLICIT;
  task->td_flags.proxy = TASK_FULL;

  // All implicit tasks are executed immediately, not deferred
  task->td_flags.task_serial = 1;
  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // An implicit task starts out already running (started/executing set).
  task->td_flags.started = 1;
  task->td_flags.executing = 1;
  task->td_flags.complete = 0;
  task->td_flags.freed = 0;
#if OMPX_TASKGRAPH
  task->td_flags.onced = 0;
#endif

  task->td_depnode = NULL;
  task->td_last_tied = task;
  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;

  if (set_curr_task) { // only do this init first time thread is created
    KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
    // Not used: don't need to deallocate implicit task
    KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
    task->td_taskgroup = NULL; // An implicit task does not have taskgroup
    task->td_dephash = NULL;
    __kmp_push_current_task_to_thread(this_thr, team, tid);
  } else {
    // Reinitialization path: the child-task counters must already be drained.
    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
  }

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(task, tid);
#endif

  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
                team, task));
}
1144
// __kmp_finish_implicit_task: Release resources associated to implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the next
// parallel region.
//
// thread: thread data structure corresponding to implicit task
// NOTE(review): the signature line was lost in extraction -- presumably
// void __kmp_finish_implicit_task(kmp_info_t *thread); confirm upstream.
  kmp_taskdata_t *task = thread->th.th_current_task;
#if ENABLE_LIBOMPTARGET
  // Give an opportunity to the offload runtime to synchronize any unfinished
  // target async regions before finishing the implicit task
  if (UNLIKELY(kmp_target_sync_cb != NULL))
    (*kmp_target_sync_cb)(NULL, thread->th.th_info.ds.ds_gtid,
                          KMP_TASKDATA_TO_TASK(task), NULL);
#endif // ENABLE_LIBOMPTARGET
  if (task->td_dephash) {
    int children;
    task->td_flags.complete = 1;
#if OMPX_TASKGRAPH
    task->td_flags.onced = 1;
#endif
    children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
    kmp_tasking_flags_t flags_old = task->td_flags;
    if (children == 0 && flags_old.complete == 1) {
      kmp_tasking_flags_t flags_new = flags_old;
      flags_new.complete = 0;
      // NOTE(review): the compare-and-swap call on td_flags that guards the
      // cleanup below was lost in extraction; only its argument lines remain.
          *RCAST(kmp_int32 *, &flags_old),
          *RCAST(kmp_int32 *, &flags_new))) {
        KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
                       "dephash of implicit task %p\n",
                       thread->th.th_info.ds.ds_gtid, task));
        // Only the entries are freed; the hash table itself is kept so it can
        // be reused by the next parallel region.
        __kmp_dephash_free_entries(thread, task->td_dephash);
      }
    }
  }
}
1181
// __kmp_free_implicit_task: Release resources associated to implicit tasks
// when these are destroyed regions
//
// thread: thread data structure corresponding to implicit task
// NOTE(review): the signature line was lost in extraction -- presumably
// void __kmp_free_implicit_task(kmp_info_t *thread); confirm upstream.
  kmp_taskdata_t *task = thread->th.th_current_task;
  // Free the dependency hash (if any) and null the pointer so a later call
  // cannot free it twice.
  if (task && task->td_dephash) {
    __kmp_dephash_free(thread, task->td_dephash);
    task->td_dephash = NULL;
  }
}
1193
1194// Round up a size to a power of two specified by val: Used to insert padding
1195// between structures co-allocated using a single malloc() call
1196static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1197 if (size & (val - 1)) {
1198 size &= ~(val - 1);
1199 if (size <= KMP_SIZE_T_MAX - val) {
1200 size += val; // Round up if there is no overflow.
1201 }
1202 }
1203 return size;
1204} // __kmp_round_up_to_va
1205
// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
//
// loc_ref: source location information
// gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
// sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
// private vars accessed in task.
// sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
// in task.
// task_entry: Pointer to task code entry point generated by compiler.
// returns: a pointer to the allocated kmp_task_t structure (task).
// NOTE(review): several source lines of this function were lost in extraction
// (the first signature line, the local `task` declaration, parts of the
// hidden-helper setup, and some atomic-increment statements). Gaps are
// flagged inline; confirm against upstream kmp_tasking.cpp before editing.
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;


  if (flags->hidden_helper) {
    // NOTE(review): the hidden-helper-enabled branch body was lost in
    // extraction; only the fallback branch below survives.
  } else {
      // If the hidden helper task is not enabled, reset the flag to FALSE.
      flags->hidden_helper = FALSE;
    }
  }

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  KMP_DEBUG_ASSERT(parent_task);
  // A task nested inside a final task is itself final.
  if (parent_task->td_flags.final) {
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }

  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
    // Untied task encountered causes the TSC algorithm to check entire deque of
    // the victim thread. If no untied task encountered, then checking the head
    // of the deque should be enough.
    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
  }

  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
  // the tasking setup
  // when that happens is too late.
  if (UNLIKELY(flags->proxy == TASK_PROXY ||
               flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
    if (flags->proxy == TASK_PROXY) {
      flags->tiedness = TASK_UNTIED;
      flags->merged_if0 = 1;
    }
    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
          setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      __kmp_task_team_setup(thread, team);
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
        task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
    if (flags->hidden_helper &&
    // NOTE(review): the rest of this hidden-helper condition and its body
    // were lost in extraction.
  }

  // Calculate shared structure offset including padding after kmp_task_t struct
  // to align pointers in shared struct
  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(kmp_uint64));

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
                shareds_offset));
  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
                sizeof_shareds));

  // Avoid double allocation here by combining shareds with taskdata
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                               sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                               sizeof_shareds);
#endif /* USE_FAST_MEMORY */

  task = KMP_TASKDATA_TO_TASK(taskdata);

// Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
  if (sizeof_shareds > 0) {
    // Avoid double allocation here by combining shareds with taskdata
    task->shareds = &((char *)taskdata)[shareds_offset];
    // Make sure shareds struct is aligned to pointer size
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  } else {
    task->shareds = NULL;
  }
  task->part_id = 0; // AC: Always start with 0 part id

  taskdata->td_task_id = KMP_GEN_TASK_ID();
  taskdata->td_team = thread->th.th_team;
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
  taskdata->td_ident = loc_ref;
  taskdata->td_taskwait_ident = NULL;
  taskdata->td_taskwait_counter = 0;
  taskdata->td_taskwait_thread = 0;
  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
  // avoid copying icvs for proxy tasks
  if (flags->proxy == TASK_FULL)
    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);

  taskdata->td_flags = *flags;
  taskdata->td_task_team = thread->th.th_task_team;
  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
  taskdata->td_flags.tasktype = TASK_EXPLICIT;
  // If it is hidden helper task, we need to set the team and task team
  // correspondingly.
  if (flags->hidden_helper) {
    kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
    taskdata->td_team = shadow_thread->th.th_team;
    taskdata->td_task_team = shadow_thread->th.th_task_team;
  }

  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag

  // GEH - TODO: fix this to copy parent task's value of team_serial flag
  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // GEH - Note we serialize the task if the team is serialized to make sure
  // implicit parallel region tasks are not left until program termination to
  // execute. Also, it helps locality to execute immediately.

  taskdata->td_flags.task_serial =
      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
       taskdata->td_flags.tasking_ser || flags->merged_if0);

  // A freshly allocated explicit task has not run yet.
  taskdata->td_flags.started = 0;
  taskdata->td_flags.executing = 0;
  taskdata->td_flags.complete = 0;
  taskdata->td_flags.freed = 0;
#if OMPX_TASKGRAPH
  taskdata->td_flags.onced = 0;
  taskdata->is_taskgraph = 0;
  taskdata->tdg = nullptr;
#endif
  // start at one because counts current task and children
  taskdata->td_taskgroup =
      parent_task->td_taskgroup; // task inherits taskgroup from the parent task
  taskdata->td_dephash = NULL;
  taskdata->td_depnode = NULL;
  taskdata->td_target_data.async_handle = NULL;
  if (flags->tiedness == TASK_UNTIED)
    taskdata->td_last_tied = NULL; // will be set when the task is scheduled
  else
    taskdata->td_last_tied = taskdata;
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, gtid);
#endif
  // TODO: What would be the balance between the conditions in the function and
  // an atomic operation?
  if (__kmp_track_children_task(taskdata)) {
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
      // NOTE(review): the allocated-child-tasks increment was lost in
      // extraction.
    }
    if (flags->hidden_helper) {
      taskdata->td_flags.task_serial = FALSE;
      // Increment the number of hidden helper tasks to be executed
      // NOTE(review): the increment statement itself was lost in extraction.
    }
  }

#if OMPX_TASKGRAPH
  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
  // NOTE(review): the last line of this recording condition was lost in
  // extraction.
  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
    taskdata->is_taskgraph = 1;
    taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
    taskdata->td_task_id = KMP_GEN_TASK_ID();
    taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
  }
#endif
  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                gtid, taskdata, taskdata->td_parent));

  return task;
}
1448
// __kmpc_omp_task_alloc: public entry point that unpacks the compiler-supplied
// kmp_int32 flag word and forwards to __kmp_task_alloc().
// NOTE(review): the first signature line (function name, loc_ref, gtid) and
// the task_entry parameter line were lost in extraction; confirm upstream.
                               kmp_int32 flags, size_t sizeof_kmp_task_t,
                               size_t sizeof_shareds,
  kmp_task_t *retval;
  // Reinterpret the packed kmp_int32 flag word as the tasking-flags bitfield.
  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
  input_flags->native = FALSE;
  // __kmp_task_alloc() sets up all other runtime flags
  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
                input_flags->proxy ? "proxy" : "",
                input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
                            sizeof_shareds, task_entry);

  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));

  return retval;
}
1472
// __kmpc_omp_target_task_alloc: variant of __kmpc_omp_task_alloc for target
// tasks; forces the untied/target flags before delegating.
// NOTE(review): the first signature line (function name, loc_ref, gtid), the
// task_entry parameter line, and the guard condition in front of the
// hidden_helper assignment were lost in extraction; confirm upstream.
                                          kmp_int32 flags,
                                          size_t sizeof_kmp_task_t,
                                          size_t sizeof_shareds,
                                          kmp_int64 device_id) {
  auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
  // target task is untied defined in the specification
  input_flags.tiedness = TASK_UNTIED;
  input_flags.target = 1;

    input_flags.hidden_helper = TRUE;

  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
                               sizeof_shareds, task_entry);
}
1490
/*!
@ingroup TASKING
@param loc_ref location of the original task directive
@param gtid Global Thread ID of encountering thread
@param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
task''
@param naffins Number of affinity items
@param affin_list List of affinity items
@return Returns non-zero if registering affinity information was not successful.
 Returns 0 if registration was successful
This entry registers the affinity information attached to a task with the task
thunk structure kmp_taskdata_t.
*/
// NOTE(review): the signature lines (function name, loc_ref, gtid) were lost
// in extraction; confirm against upstream.
                                        kmp_task_t *new_task, kmp_int32 naffins,
                                        kmp_task_affinity_info_t *affin_list) {
  // In the visible implementation the affinity items are only traced and the
  // routine always reports success; nothing is stored on the task.
  if (naffins > 0)
    KMP_DEBUG_ASSERT(affin_list != NULL);

  for (kmp_int32 i = 0; i < naffins; ++i) {
    KA_TRACE(30, ("__kmpc_omp_reg_task_with_affinity: T#%d aff[%d] "
                  "base_addr=0x%llx len=%zu flags={%d,%d,%d}\n",
                  gtid, i, (unsigned long long)affin_list[i].base_addr,
                  affin_list[i].len, (int)affin_list[i].flags.flag1,
                  (int)affin_list[i].flags.flag2,
                  (int)affin_list[i].flags.reserved));
  }

  return 0;
}
1522
// __kmp_invoke_task: invoke the specified task
//
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invocation
// NOTE(review): several lines of this function were lost in extraction (the
// signature continuation, the local `taskdata` declaration, the cancellation
// guard, the proxy bottom-half call, and a timer pop). Gaps are flagged
// inline; confirm against upstream kmp_tasking.cpp before editing.
#ifdef __s390x__
// s390x needs a stack backchain so tools can unwind through the task call.
__attribute__((target("backchain")))
#endif
static void
                                     kmp_taskdata_t *current_task) {
  kmp_info_t *thread;
  int discard = 0 /* false */;
  KA_TRACE(
      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
           gtid, taskdata, current_task));
  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
               taskdata->td_flags.complete == 1)) {
    // This is a proxy task that was already completed but it needs to run
    // its bottom-half finish
    KA_TRACE(
        30,
        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
         gtid, taskdata));

    // NOTE(review): the bottom-half finish call was lost in extraction.

    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
                  "proxy task %p, resuming task %p\n",
                  gtid, taskdata, current_task));

    return;
  }

#if OMPT_SUPPORT
  // For untied tasks, the first task executed only calls __kmpc_omp_task and
  // does not execute code.
  ompt_thread_info_t oldInfo;
  if (UNLIKELY(ompt_enabled.enabled)) {
    // Store the threads states and restore them after the task
    thread = __kmp_threads[gtid];
    oldInfo = thread->th.ompt_thread_info;
    thread->th.ompt_thread_info.wait_id = 0;
    thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
                                            ? ompt_state_work_serial
                                            : ompt_state_work_parallel;
    taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
#endif

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
    __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
  }

  // TODO: cancel tasks if the parallel region has also been cancelled
  // TODO: check if this sequence can be hoisted above __kmp_task_start
  // if cancellation has been enabled for this run ...
  // NOTE(review): the guarding `if` around this cancellation block was lost
  // in extraction (its closing brace survives below).
    thread = __kmp_threads[gtid];
    kmp_team_t *this_team = thread->th.th_team;
    kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
    if ((taskgroup && taskgroup->cancel_request) ||
        (this_team->t.t_cancel_request == cancel_parallel)) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
      ompt_data_t *task_data;
      if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
        __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
        ompt_callbacks.ompt_callback(ompt_callback_cancel)(
            task_data,
            ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
                                                      : ompt_cancel_parallel) |
                ompt_cancel_discarded_task,
            NULL);
      }
#endif
      KMP_COUNT_BLOCK(TASK_cancelled);
      // this task belongs to a task group and we need to cancel it
      discard = 1 /* true */;
    }
  }

  // Invoke the task routine and pass in relevant data.
  // Thunks generated by gcc take a different argument list.
  if (!discard) {
    if (taskdata->td_flags.tiedness == TASK_UNTIED) {
      // An untied task inherits the TSC anchor from the task it preempts.
      taskdata->td_last_tied = current_task->td_last_tied;
      KMP_DEBUG_ASSERT(taskdata->td_last_tied);
    }
#if KMP_STATS_ENABLED
    KMP_COUNT_BLOCK(TASK_executed);
    // Attribute the task's execution time to the construct it was stolen from.
    switch (KMP_GET_THREAD_STATE()) {
    case FORK_JOIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
      break;
    case PLAIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
      break;
    case TASKYIELD:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
      break;
    case TASKWAIT:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
      break;
    case TASKGROUP:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
      break;
    default:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
      break;
    }
#endif // KMP_STATS_ENABLED

// OMPT task begin
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled))
      __ompt_task_start(task, current_task, gtid);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
                 taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) {
      ompt_data_t instance = ompt_data_none;
      instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
      ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
          &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
          ompt_dispatch_taskloop_chunk, instance);
      taskdata->ompt_task_info.dispatch_chunk = {0, 0};
    }
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

#if OMPD_SUPPORT
    if (ompd_state & OMPD_ENABLE_BP)
      ompd_bp_task_begin();
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    kmp_uint64 cur_time;
    kmp_int32 kmp_itt_count_task =
        __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
        current_task->td_flags.tasktype == TASK_IMPLICIT;
    if (kmp_itt_count_task) {
      thread = __kmp_threads[gtid];
      // Time outer level explicit task on barrier for adjusting imbalance time
      if (thread->th.th_bar_arrive_time)
        cur_time = __itt_get_timestamp();
      else
        kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
    }
    KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
#endif

#if ENABLE_LIBOMPTARGET
    if (taskdata->td_target_data.async_handle != NULL) {
      // If we have a valid target async handle, that means that we have already
      // executed the task routine once. We must query for the handle completion
      // instead of re-executing the routine.
      KMP_ASSERT(tgt_target_nowait_query);
      tgt_target_nowait_query(&taskdata->td_target_data.async_handle);
    } else
#endif
    if (task->routine != NULL) {
#ifdef KMP_GOMP_COMPAT
      if (taskdata->td_flags.native) {
        // GOMP-style thunks take only the shareds pointer, no gtid.
        ((void (*)(void *))(*(task->routine)))(task->shareds);
      } else
#endif /* KMP_GOMP_COMPAT */
      {
        (*(task->routine))(gtid, task);
      }
    }

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (kmp_itt_count_task) {
      // Barrier imbalance - adjust arrive time with the task duration
      thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
    }
    KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
    KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
#endif
  }

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_task_end();
#endif

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled)) {
      thread->th.ompt_thread_info = oldInfo;
      if (taskdata->td_flags.tiedness == TASK_TIED) {
        taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
      }
      __kmp_task_finish<true>(gtid, task, current_task);
    } else
#endif
      __kmp_task_finish<false>(gtid, task, current_task);
  }
#if OMPT_SUPPORT
  else if (UNLIKELY(ompt_enabled.enabled && taskdata->td_flags.target)) {
    __ompt_task_finish(task, current_task, ompt_task_switch);
  }
#endif

  KA_TRACE(
      30,
      ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
       gtid, taskdata, current_task));
  return;
}
1738
// __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
//    be resumed later.
//    TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
//    resumed later.
// NOTE(review): the first signature line, the local `parent` declaration, the
// tail of the task_create callback argument list, and the final return
// statement were lost in extraction; confirm against upstream.
                                kmp_task_t *new_task) {
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);

  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, new_taskdata));

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    parent = new_taskdata->td_parent;
    if (ompt_enabled.ompt_callback_task_create) {
      // Notify the tool that a new explicit task was created.
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
          &(new_taskdata->ompt_task_info.task_data),
          TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
    }
  }
#endif

  /* Should we execute the new task or queue it? For now, let's just always try
     to queue it. If the queue fills up, then we'll execute it. */

  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
  { // Execute this task immediately
    kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
    new_taskdata->td_flags.task_serial = 1;
    __kmp_invoke_task(gtid, new_task, current_task);
  }

  KA_TRACE(
      10,
      ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
       "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
       gtid, loc_ref, new_taskdata));

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    // The runtime entry is done; clear the parent's enter frame for the tool.
    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
    parent->ompt_task_info.frame.enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
  }
#endif
}
1794
// __kmp_omp_task: Schedule a non-thread-switchable task for execution
//
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
// serialize_immediate: if TRUE then if the task is executed immediately its
// execution will be serialized
// Returns:
// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
// be resumed later.
// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
// resumed later.
1807 bool serialize_immediate) {
1808 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1809
1810#if OMPX_TASKGRAPH
1811 if (new_taskdata->is_taskgraph &&
1812 __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
1813 kmp_tdg_info_t *tdg = new_taskdata->tdg;
1814 // extend the record_map if needed
1815 if (new_taskdata->td_tdg_task_id >= new_taskdata->tdg->map_size) {
1816 __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
1817 // map_size could have been updated by another thread if recursive
1818 // taskloop
1819 if (new_taskdata->td_tdg_task_id >= tdg->map_size) {
1820 kmp_uint old_size = tdg->map_size;
1821 kmp_uint new_size = old_size * 2;
1822 kmp_node_info_t *old_record = tdg->record_map;
1823 kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
1824 new_size * sizeof(kmp_node_info_t));
1825
1826 KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
1827 tdg->record_map = new_record;
1828
1829 __kmp_free(old_record);
1830
1831 for (kmp_int i = old_size; i < new_size; i++) {
1832 kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
1833 __kmp_successors_size * sizeof(kmp_int32));
1834 new_record[i].task = nullptr;
1835 new_record[i].successors = successorsList;
1836 new_record[i].nsuccessors = 0;
1837 new_record[i].npredecessors = 0;
1838 new_record[i].successors_size = __kmp_successors_size;
1839 KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
1840 }
1841 // update the size at the end, so that we avoid other
1842 // threads use old_record while map_size is already updated
1843 tdg->map_size = new_size;
1844 }
1845 __kmp_release_bootstrap_lock(&tdg->graph_lock);
1846 }
1847 // record a task
1848 if (tdg->record_map[new_taskdata->td_tdg_task_id].task == nullptr) {
1849 tdg->record_map[new_taskdata->td_tdg_task_id].task = new_task;
1850 tdg->record_map[new_taskdata->td_tdg_task_id].parent_task =
1851 new_taskdata->td_parent;
1852 KMP_ATOMIC_INC(&tdg->num_tasks);
1853 }
1854 }
1855#endif
1856
1857 /* Should we execute the new task or queue it? For now, let's just always try
1858 to queue it. If the queue fills up, then we'll execute it. */
1859 if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1860 __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1861 { // Execute this task immediately
1862 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1863 if (serialize_immediate)
1864 new_taskdata->td_flags.task_serial = 1;
1865 __kmp_invoke_task(gtid, new_task, current_task);
1868 kmp_info_t *this_thr = __kmp_threads[gtid];
1869 kmp_team_t *team = this_thr->th.th_team;
1870 kmp_int32 nthreads = this_thr->th.th_team_nproc;
1871 for (int i = 0; i < nthreads; ++i) {
1872 kmp_info_t *thread = team->t.t_threads[i];
1873 if (thread == this_thr)
1874 continue;
1875 if (thread->th.th_sleep_loc != NULL) {
1877 break; // awake one thread at a time
1878 }
1879 }
1880 }
1882}
1883
// __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
// non-thread-switchable task from the parent thread only!
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmp_omp_task_alloc()
// Returns:
// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
// be resumed later.
// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
// resumed later.
1897 kmp_task_t *new_task) {
1898 kmp_int32 res;
1899 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1900
1901#if KMP_DEBUG || OMPT_SUPPORT
1902 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1903#endif
1904 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1905 new_taskdata));
1907
1908#if OMPT_SUPPORT
1909 kmp_taskdata_t *parent = NULL;
1910 if (UNLIKELY(ompt_enabled.enabled)) {
1911 if (!new_taskdata->td_flags.started) {
1912 OMPT_STORE_RETURN_ADDRESS(gtid);
1913 parent = new_taskdata->td_parent;
1914 if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1915 parent->ompt_task_info.frame.enter_frame.ptr =
1917 }
1918 if (ompt_enabled.ompt_callback_task_create) {
1919 ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1920 &(parent->ompt_task_info.task_data),
1921 &(parent->ompt_task_info.frame),
1922 &(new_taskdata->ompt_task_info.task_data),
1923 TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1924 OMPT_LOAD_RETURN_ADDRESS(gtid));
1925 }
1926 } else {
1927 // We are scheduling the continuation of an UNTIED task.
1928 // Scheduling back to the parent task.
1929 __ompt_task_finish(new_task,
1930 new_taskdata->ompt_task_info.scheduling_parent,
1931 ompt_task_switch);
1932 new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1933 }
1934 }
1935#endif
1936
1937 res = __kmp_omp_task(gtid, new_task, true);
1938
1939 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1940 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1941 gtid, loc_ref, new_taskdata));
1942#if OMPT_SUPPORT
1943 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1944 parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1945 }
1946#endif
1947 return res;
1948}
1949
// __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
// a taskloop task with the correct OMPT return address
//
// loc_ref: location of original task pragma (ignored)
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmp_omp_task_alloc()
// codeptr_ra: return address for OMPT callback
// Returns:
// TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
// be resumed later.
// TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
// resumed later.
1964 kmp_task_t *new_task, void *codeptr_ra) {
1965 kmp_int32 res;
1966 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1967
1968#if KMP_DEBUG || OMPT_SUPPORT
1969 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1970#endif
1971 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1972 new_taskdata));
1973
1974#if OMPT_SUPPORT
1975 kmp_taskdata_t *parent = NULL;
1976 if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1977 parent = new_taskdata->td_parent;
1978 if (!parent->ompt_task_info.frame.enter_frame.ptr)
1979 parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1980 if (ompt_enabled.ompt_callback_task_create) {
1981 ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1982 &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1983 &(new_taskdata->ompt_task_info.task_data),
1984 TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, codeptr_ra);
1985 }
1986 }
1987#endif
1988
1989 res = __kmp_omp_task(gtid, new_task, true);
1990
1991 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1992 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1993 gtid, loc_ref, new_taskdata));
1994#if OMPT_SUPPORT
1995 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1996 parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1997 }
1998#endif
1999 return res;
2000}
2001
2002template <bool ompt>
2004 void *frame_address,
2005 void *return_address) {
2006 kmp_taskdata_t *taskdata = nullptr;
2007 kmp_info_t *thread;
2008 int thread_finished = FALSE;
2010
2011 KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
2012 KMP_DEBUG_ASSERT(gtid >= 0);
2013
2015 thread = __kmp_threads[gtid];
2016 taskdata = thread->th.th_current_task;
2017
2018#if OMPT_SUPPORT && OMPT_OPTIONAL
2019 ompt_data_t *my_task_data;
2020 ompt_data_t *my_parallel_data;
2021
2022 if (ompt) {
2023 my_task_data = &(taskdata->ompt_task_info.task_data);
2024 my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2025
2026 taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2027
2028 if (ompt_enabled.ompt_callback_sync_region) {
2029 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2030 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2031 my_task_data, return_address);
2032 }
2033
2034 if (ompt_enabled.ompt_callback_sync_region_wait) {
2035 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2036 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2037 my_task_data, return_address);
2038 }
2039 }
2040#endif // OMPT_SUPPORT && OMPT_OPTIONAL
2041
2042#if ENABLE_LIBOMPTARGET
2043 // Give an opportunity to the offload runtime to make progress and create
2044 // any necessary proxy tasks
2045 if (UNLIKELY(kmp_target_sync_cb))
2046 (*kmp_target_sync_cb)(loc_ref, gtid, KMP_TASKDATA_TO_TASK(taskdata),
2047 NULL);
2048#endif // ENABLE_LIBOMPTARGET
2049
2050// Debugger: The taskwait is active. Store location and thread encountered the
2051// taskwait.
2052#if USE_ITT_BUILD
2053// Note: These values are used by ITT events as well.
2054#endif /* USE_ITT_BUILD */
2055 taskdata->td_taskwait_counter += 1;
2056 taskdata->td_taskwait_ident = loc_ref;
2057 taskdata->td_taskwait_thread = gtid + 1;
2058
2059#if USE_ITT_BUILD
2060 void *itt_sync_obj = NULL;
2061#if USE_ITT_NOTIFY
2062 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2063#endif /* USE_ITT_NOTIFY */
2064#endif /* USE_ITT_BUILD */
2065
2066 bool must_wait =
2067 !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2068
2069 must_wait = must_wait || (thread->th.th_task_team != NULL &&
2070 thread->th.th_task_team->tt.tt_found_proxy_tasks);
2071 // If hidden helper thread is encountered, we must enable wait here.
2072 must_wait =
2073 must_wait ||
2074 (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2075 thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2076
2077 if (must_wait) {
2079 RCAST(std::atomic<kmp_uint32> *,
2080 &(taskdata->td_incomplete_child_tasks)),
2081 0U);
2082 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
2083 flag.execute_tasks(thread, gtid, FALSE,
2084 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2086 }
2087 }
2088#if USE_ITT_BUILD
2089 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2090 KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2091#endif /* USE_ITT_BUILD */
2092
2093 // Debugger: The taskwait is completed. Location remains, but thread is
2094 // negated.
2095 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2096
2097#if OMPT_SUPPORT && OMPT_OPTIONAL
2098 if (ompt) {
2099 if (ompt_enabled.ompt_callback_sync_region_wait) {
2100 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2101 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2102 my_task_data, return_address);
2103 }
2104 if (ompt_enabled.ompt_callback_sync_region) {
2105 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2106 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2107 my_task_data, return_address);
2108 }
2109 taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2110 }
2111#endif // OMPT_SUPPORT && OMPT_OPTIONAL
2112 }
2113
2114 KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2115 "returning TASK_CURRENT_NOT_QUEUED\n",
2116 gtid, taskdata));
2117
2119}
2120
2121#if OMPT_SUPPORT && OMPT_OPTIONAL
2123static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
2124 void *frame_address,
2125 void *return_address) {
2126 return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
2127 return_address);
2128}
2129#endif // OMPT_SUPPORT && OMPT_OPTIONAL
2130
// __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
// complete
2134#if OMPT_SUPPORT && OMPT_OPTIONAL
2135 if (UNLIKELY(ompt_enabled.enabled)) {
2136 OMPT_STORE_RETURN_ADDRESS(gtid);
2137 return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2138 OMPT_LOAD_RETURN_ADDRESS(gtid));
2139 }
2140#endif
2141 return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2142}
2143
// __kmpc_omp_taskyield: switch to a different task
2145kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2146 kmp_taskdata_t *taskdata = NULL;
2147 kmp_info_t *thread;
2148 int thread_finished = FALSE;
2149
2150 KMP_COUNT_BLOCK(OMP_TASKYIELD);
2151 KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2152
2153 KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2154 gtid, loc_ref, end_part));
2156
2158 thread = __kmp_threads[gtid];
2159 taskdata = thread->th.th_current_task;
2160// Should we model this as a task wait or not?
2161// Debugger: The taskwait is active. Store location and thread encountered the
2162// taskwait.
2163#if USE_ITT_BUILD
2164// Note: These values are used by ITT events as well.
2165#endif /* USE_ITT_BUILD */
2166 taskdata->td_taskwait_counter += 1;
2167 taskdata->td_taskwait_ident = loc_ref;
2168 taskdata->td_taskwait_thread = gtid + 1;
2169
2170#if USE_ITT_BUILD
2171 void *itt_sync_obj = NULL;
2172#if USE_ITT_NOTIFY
2173 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2174#endif /* USE_ITT_NOTIFY */
2175#endif /* USE_ITT_BUILD */
2176 if (!taskdata->td_flags.team_serial) {
2177 kmp_task_team_t *task_team = thread->th.th_task_team;
2178 if (task_team != NULL) {
2179 if (KMP_TASKING_ENABLED(task_team)) {
2180#if OMPT_SUPPORT
2181 if (UNLIKELY(ompt_enabled.enabled))
2182 thread->th.ompt_thread_info.ompt_task_yielded = 1;
2183#endif
2185 thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2186 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2188#if OMPT_SUPPORT
2189 if (UNLIKELY(ompt_enabled.enabled))
2190 thread->th.ompt_thread_info.ompt_task_yielded = 0;
2191#endif
2192 }
2193 }
2194 }
2195#if USE_ITT_BUILD
2196 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2197#endif /* USE_ITT_BUILD */
2198
2199 // Debugger: The taskwait is completed. Location remains, but thread is
2200 // negated.
2201 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2202 }
2203
2204 KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2205 "returning TASK_CURRENT_NOT_QUEUED\n",
2206 gtid, taskdata));
2207
2209}
2210
// Task Reduction implementation
//
// Note: initial implementation didn't take into account the possibility
// to specify omp_orig for initializer of the UDR (user defined reduction).
// Corrected implementation takes into account the omp_orig object.
// Compiler is free to use old implementation if omp_orig is not specified.

/*!
@ingroup BASIC_TYPES
@{
*/

/*!
Flags for special info per task reduction item.
*/
typedef struct kmp_taskred_flags {
  /*! 1 - use lazy alloc/init (e.g. big objects, num tasks < num threads) */
  unsigned lazy_priv : 1;
  unsigned reserved31 : 31; // unused; pads the bitfield to 32 bits
} kmp_taskred_flags_t;
/*!
Internal struct for reduction data item related info set up by compiler.
*/
2235typedef struct kmp_task_red_input {
2236 void *reduce_shar; /**< shared between tasks item to reduce into */
2237 size_t reduce_size; /**< size of data item in bytes */
2238 // three compiler-generated routines (init, fini are optional):
2239 void *reduce_init; /**< data initialization routine (single parameter) */
2240 void *reduce_fini; /**< data finalization routine */
2241 void *reduce_comb; /**< data combiner routine */
2242 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2244
/*!
Internal struct for reduction data item related info saved by the library.
*/
2248typedef struct kmp_taskred_data {
2249 void *reduce_shar; /**< shared between tasks item to reduce into */
2250 size_t reduce_size; /**< size of data item */
2251 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2252 void *reduce_priv; /**< array of thread specific items */
2253 void *reduce_pend; /**< end of private data for faster comparison op */
2254 // three compiler-generated routines (init, fini are optional):
2255 void *reduce_comb; /**< data combiner routine */
2256 void *reduce_init; /**< data initialization routine (two parameters) */
2257 void *reduce_fini; /**< data finalization routine */
2258 void *reduce_orig; /**< original item (can be used in UDR initializer) */
2260
/*!
Internal struct for reduction data item related info set up by compiler.

New interface: added reduce_orig field to provide omp_orig for UDR initializer.
*/
2266typedef struct kmp_taskred_input {
2267 void *reduce_shar; /**< shared between tasks item to reduce into */
2268 void *reduce_orig; /**< original reduction item used for initialization */
2269 size_t reduce_size; /**< size of data item */
2270 // three compiler-generated routines (init, fini are optional):
2271 void *reduce_init; /**< data initialization routine (two parameters) */
2272 void *reduce_fini; /**< data finalization routine */
2273 void *reduce_comb; /**< data combiner routine */
2274 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
/*!
@}
*/

2280template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2281template <>
2286template <>
2288 kmp_taskred_input_t &src) {
2289 if (src.reduce_orig != NULL) {
2290 item.reduce_orig = src.reduce_orig;
2291 } else {
2292 item.reduce_orig = src.reduce_shar;
2293 } // non-NULL reduce_orig means new interface used
2294}
2295
2296template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2297template <>
2299 size_t offset) {
2300 ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2301}
2302template <>
2304 size_t offset) {
2305 ((void (*)(void *, void *))item.reduce_init)(
2306 (char *)(item.reduce_priv) + offset, item.reduce_orig);
2307}
2308
2309template <typename T>
2310void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2312 kmp_info_t *thread = __kmp_threads[gtid];
2313 kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2314 kmp_uint32 nth = thread->th.th_team_nproc;
2316
2317 // check input data just in case
2318 KMP_ASSERT(tg != NULL);
2319 KMP_ASSERT(data != NULL);
2320 KMP_ASSERT(num > 0);
2321 if (nth == 1 && !__kmp_enable_hidden_helper) {
2322 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2323 gtid, tg));
2324 return (void *)tg;
2325 }
2326 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2327 gtid, tg, num));
2329 thread, num * sizeof(kmp_taskred_data_t));
2330 for (int i = 0; i < num; ++i) {
2331 size_t size = data[i].reduce_size - 1;
2332 // round the size up to cache line per thread-specific item
2334 KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2335 arr[i].reduce_shar = data[i].reduce_shar;
2336 arr[i].reduce_size = size;
2337 arr[i].flags = data[i].flags;
2338 arr[i].reduce_comb = data[i].reduce_comb;
2339 arr[i].reduce_init = data[i].reduce_init;
2340 arr[i].reduce_fini = data[i].reduce_fini;
2342 if (!arr[i].flags.lazy_priv) {
2343 // allocate cache-line aligned block and fill it with zeros
2344 arr[i].reduce_priv = __kmp_allocate(nth * size);
2345 arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2346 if (arr[i].reduce_init != NULL) {
2347 // initialize all thread-specific items
2348 for (size_t j = 0; j < nth; ++j) {
2350 }
2351 }
2352 } else {
2353 // only allocate space for pointers now,
2354 // objects will be lazily allocated/initialized if/when requested
2355 // note that __kmp_allocate zeroes the allocated memory
2356 arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2357 }
2358 }
2359 tg->reduce_data = (void *)arr;
2360 tg->reduce_num_data = num;
2361 return (void *)tg;
2362}
2363
/*!
@ingroup TASKING
@param gtid Global thread ID
@param num Number of data items to reduce
@param data Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for the taskgroup.

Note: this entry supposes the optional compiler-generated initializer routine
has single parameter - pointer to object to be initialized. That means
the reduction either does not use omp_orig object, or the omp_orig is accessible
without help of the runtime library.
*/
2378void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2379#if OMPX_TASKGRAPH
2380 kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2381 if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2382 kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2383 this_tdg->rec_taskred_data =
2384 __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2385 this_tdg->rec_num_taskred = num;
2386 KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2387 sizeof(kmp_task_red_input_t) * num);
2388 }
2389#endif
2391}
2392
/*!
@ingroup TASKING
@param gtid Global thread ID
@param num Number of data items to reduce
@param data Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for the taskgroup.

Note: this entry supposes the optional compiler-generated initializer routine
has two parameters, pointer to object to be initialized and pointer to omp_orig
*/
2405void *__kmpc_taskred_init(int gtid, int num, void *data) {
2406#if OMPX_TASKGRAPH
2407 kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2408 if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2409 kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2410 this_tdg->rec_taskred_data =
2411 __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2412 this_tdg->rec_num_taskred = num;
2413 KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2414 sizeof(kmp_task_red_input_t) * num);
2415 }
2416#endif
2418}
2419
2420// Copy task reduction data (except for shared pointers).
2421template <typename T>
2423 kmp_taskgroup_t *tg, void *reduce_data) {
2425 KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2426 " from data %p\n",
2427 thr, tg, reduce_data));
2429 thr, num * sizeof(kmp_taskred_data_t));
2430 // threads will share private copies, thunk routines, sizes, flags, etc.:
2431 KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2432 for (int i = 0; i < num; ++i) {
2433 arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2434 }
2435 tg->reduce_data = (void *)arr;
2436 tg->reduce_num_data = num;
2437}
2438
/*!
@ingroup TASKING
@param gtid Global thread ID
@param tskgrp The taskgroup ID (optional)
@param data Shared location of the item
@return The pointer to per-thread data

Get thread-specific location of data item
*/
2448void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2450 kmp_info_t *thread = __kmp_threads[gtid];
2451 kmp_int32 nth = thread->th.th_team_nproc;
2452 if (nth == 1)
2453 return data; // nothing to do
2454
2455 kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2456 if (tg == NULL)
2457 tg = thread->th.th_current_task->td_taskgroup;
2458 KMP_ASSERT(tg != NULL);
2460 kmp_int32 num;
2461 kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2462
2463#if OMPX_TASKGRAPH
2464 if ((thread->th.th_current_task->is_taskgraph) &&
2465 (!__kmp_tdg_is_recording(
2466 __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
2467 tg = thread->th.th_current_task->td_taskgroup;
2468 KMP_ASSERT(tg != NULL);
2469 KMP_ASSERT(tg->reduce_data != NULL);
2471 num = tg->reduce_num_data;
2472 }
2473#endif
2474
2475 KMP_ASSERT(data != NULL);
2476 while (tg != NULL) {
2478 num = tg->reduce_num_data;
2479 for (int i = 0; i < num; ++i) {
2480 if (!arr[i].flags.lazy_priv) {
2481 if (data == arr[i].reduce_shar ||
2482 (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2483 return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2484 } else {
2485 // check shared location first
2486 void **p_priv = (void **)(arr[i].reduce_priv);
2487 if (data == arr[i].reduce_shar)
2488 goto found;
2489 // check if we get some thread specific location as parameter
2490 for (int j = 0; j < nth; ++j)
2491 if (data == p_priv[j])
2492 goto found;
2493 continue; // not found, continue search
2494 found:
2495 if (p_priv[tid] == NULL) {
2496 // allocate thread specific object lazily
2497 p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2498 if (arr[i].reduce_init != NULL) {
2499 if (arr[i].reduce_orig != NULL) { // new interface
2500 ((void (*)(void *, void *))arr[i].reduce_init)(
2501 p_priv[tid], arr[i].reduce_orig);
2502 } else { // old interface (single parameter)
2503 ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2504 }
2505 }
2506 }
2507 return p_priv[tid];
2508 }
2509 }
2510 KMP_ASSERT(tg->parent);
2511 tg = tg->parent;
2512 }
2513 KMP_ASSERT2(0, "Unknown task reduction item");
2514 return NULL; // ERROR, this line never executed
2515}
2516
// Finalize task reduction.
// Called from __kmpc_end_taskgroup()
2520 kmp_int32 nth = th->th.th_team_nproc;
2522 nth > 1 ||
2523 __kmp_enable_hidden_helper); // should not be called if nth == 1 unless we
2524 // are using hidden helper threads
2526 kmp_int32 num = tg->reduce_num_data;
2527 for (int i = 0; i < num; ++i) {
2528 void *sh_data = arr[i].reduce_shar;
2529 void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2530 void (*f_comb)(void *, void *) =
2531 (void (*)(void *, void *))(arr[i].reduce_comb);
2532 if (!arr[i].flags.lazy_priv) {
2533 void *pr_data = arr[i].reduce_priv;
2534 size_t size = arr[i].reduce_size;
2535 for (int j = 0; j < nth; ++j) {
2536 void *priv_data = (char *)pr_data + j * size;
2537 f_comb(sh_data, priv_data); // combine results
2538 if (f_fini)
2539 f_fini(priv_data); // finalize if needed
2540 }
2541 } else {
2542 void **pr_data = (void **)(arr[i].reduce_priv);
2543 for (int j = 0; j < nth; ++j) {
2544 if (pr_data[j] != NULL) {
2545 f_comb(sh_data, pr_data[j]); // combine results
2546 if (f_fini)
2547 f_fini(pr_data[j]); // finalize if needed
2548 __kmp_free(pr_data[j]);
2549 }
2550 }
2551 }
2552 __kmp_free(arr[i].reduce_priv);
2553 }
2555 tg->reduce_data = NULL;
2556 tg->reduce_num_data = 0;
2557}
2558
// Cleanup task reduction data for parallel or worksharing,
// do not touch task private data other threads still working with.
// Called from __kmpc_end_taskgroup()
2564 tg->reduce_data = NULL;
2565 tg->reduce_num_data = 0;
2566}
2567
2568template <typename T>
2570 int num, T *data) {
2572 kmp_info_t *thr = __kmp_threads[gtid];
2573 kmp_int32 nth = thr->th.th_team_nproc;
2574 __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2575 if (nth == 1) {
2576 KA_TRACE(10,
2577 ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2578 gtid, thr->th.th_current_task->td_taskgroup));
2579 return (void *)thr->th.th_current_task->td_taskgroup;
2580 }
2581 kmp_team_t *team = thr->th.th_team;
2582 void *reduce_data;
2583 kmp_taskgroup_t *tg;
2584 reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2585 if (reduce_data == NULL &&
2586 __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2587 (void *)1)) {
2588 // single thread enters this block to initialize common reduction data
2589 KMP_DEBUG_ASSERT(reduce_data == NULL);
2590 // first initialize own data, then make a copy other threads can use
2592 reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2593 KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2594 // fini counters should be 0 at this point
2595 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2596 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2597 KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2598 } else {
2599 while (
2600 (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2601 (void *)1) { // wait for task reduction initialization
2602 KMP_CPU_PAUSE();
2603 }
2604 KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2605 tg = thr->th.th_current_task->td_taskgroup;
2606 __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2607 }
2608 return tg;
2609}
2610
/*!
@ingroup TASKING
@param loc Source location info
@param gtid Global thread ID
@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise
@param num Number of data items to reduce
@param data Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for a parallel or worksharing.

Note: this entry supposes the optional compiler-generated initializer routine
has single parameter - pointer to object to be initialized. That means
the reduction either does not use omp_orig object, or the omp_orig is accessible
without help of the runtime library.
*/
2628 int num, void *data) {
2629 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2631}
2632
/*!
@ingroup TASKING
@param loc Source location info
@param gtid Global thread ID
@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise
@param num Number of data items to reduce
@param data Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for a parallel or worksharing.

Note: this entry supposes the optional compiler-generated initializer routine
has two parameters, pointer to object to be initialized and pointer to omp_orig
*/
2647void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2648 void *data) {
2649 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2651}
2652
/*!
@ingroup TASKING
@param loc Source location info
@param gtid Global thread ID
@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise

Finalize task reduction for a parallel or worksharing.
*/
2663}
2664
// __kmpc_taskgroup: Start a new taskgroup
2666void __kmpc_taskgroup(ident_t *loc, int gtid) {
2668 kmp_info_t *thread = __kmp_threads[gtid];
2669 kmp_taskdata_t *taskdata = thread->th.th_current_task;
2670 kmp_taskgroup_t *tg_new =
2672 KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2673 KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2675 tg_new->parent = taskdata->td_taskgroup;
2676 tg_new->reduce_data = NULL;
2677 tg_new->reduce_num_data = 0;
2678 tg_new->gomp_data = NULL;
2679 taskdata->td_taskgroup = tg_new;
2680
2681#if OMPT_SUPPORT && OMPT_OPTIONAL
2682 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2683 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2684 if (!codeptr)
2685 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2686 kmp_team_t *team = thread->th.th_team;
2687 ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2688 // FIXME: I think this is wrong for lwt!
2689 ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2690
2691 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2692 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2693 &(my_task_data), codeptr);
2694 }
2695#endif
2696}
2697
// __kmpc_end_taskgroup: Wait until all tasks generated by the current task
// and its descendants are complete
2702 kmp_info_t *thread = __kmp_threads[gtid];
2703 kmp_taskdata_t *taskdata = thread->th.th_current_task;
2704 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2705 int thread_finished = FALSE;
2706
2707#if OMPT_SUPPORT && OMPT_OPTIONAL
2708 kmp_team_t *team;
2709 ompt_data_t my_task_data;
2710 ompt_data_t my_parallel_data;
2711 void *codeptr = nullptr;
2712 if (UNLIKELY(ompt_enabled.enabled)) {
2713 team = thread->th.th_team;
2714 my_task_data = taskdata->ompt_task_info.task_data;
2715 // FIXME: I think this is wrong for lwt!
2716 my_parallel_data = team->t.ompt_team_info.parallel_data;
2717 codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2718 if (!codeptr)
2719 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2720 }
2721#endif
2722
2723 KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2724 KMP_DEBUG_ASSERT(taskgroup != NULL);
2725 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2726
2728 // mark task as waiting not on a barrier
2729 taskdata->td_taskwait_counter += 1;
2730 taskdata->td_taskwait_ident = loc;
2731 taskdata->td_taskwait_thread = gtid + 1;
2732#if USE_ITT_BUILD
2733 // For ITT the taskgroup wait is similar to taskwait until we need to
2734 // distinguish them
2735 void *itt_sync_obj = NULL;
2736#if USE_ITT_NOTIFY
2737 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2738#endif /* USE_ITT_NOTIFY */
2739#endif /* USE_ITT_BUILD */
2740
2741#if OMPT_SUPPORT && OMPT_OPTIONAL
2742 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2743 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2744 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2745 &(my_task_data), codeptr);
2746 }
2747#endif
2748
2749#if ENABLE_LIBOMPTARGET
2750 // Give an opportunity to the offload runtime to make progress and create
2751 // any necessary proxy tasks
2752 if (UNLIKELY(kmp_target_sync_cb))
2753 (*kmp_target_sync_cb)(loc, gtid, KMP_TASKDATA_TO_TASK(taskdata), NULL);
2754#endif // ENABLE_LIBOMPTARGET
2755
2756 if (!taskdata->td_flags.team_serial ||
2757 (thread->th.th_task_team != NULL &&
2758 (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
2759 thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2761 RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2762 while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2763 flag.execute_tasks(thread, gtid, FALSE,
2764 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2766 }
2767 }
2768 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2769
2770#if OMPT_SUPPORT && OMPT_OPTIONAL
2771 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2772 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2773 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2774 &(my_task_data), codeptr);
2775 }
2776#endif
2777
2778#if USE_ITT_BUILD
2779 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2780 KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2781#endif /* USE_ITT_BUILD */
2782 }
2783 KMP_DEBUG_ASSERT(taskgroup->count == 0);
2784
2785 if (taskgroup->reduce_data != NULL &&
2786 !taskgroup->gomp_data) { // need to reduce?
2787 int cnt;
2788 void *reduce_data;
2789 kmp_team_t *t = thread->th.th_team;
2791 // check if <priv> data of the first reduction variable shared for the team
2792 void *priv0 = arr[0].reduce_priv;
2793 if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2794 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2795 // finishing task reduction on parallel
2796 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2797 if (cnt == thread->th.th_team_nproc - 1) {
2798 // we are the last thread passing __kmpc_reduction_modifier_fini()
2799 // finalize task reduction:
2800 __kmp_task_reduction_fini(thread, taskgroup);
2801 // cleanup fields in the team structure:
2802 // TODO: is relaxed store enough here (whole barrier should follow)?
2803 __kmp_thread_free(thread, reduce_data);
2804 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2805 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2806 } else {
2807 // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2808 // so do not finalize reduction, just clean own copy of the data
2809 __kmp_task_reduction_clean(thread, taskgroup);
2810 }
2811 } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2812 NULL &&
2813 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2814 // finishing task reduction on worksharing
2815 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2816 if (cnt == thread->th.th_team_nproc - 1) {
2817 // we are the last thread passing __kmpc_reduction_modifier_fini()
2818 __kmp_task_reduction_fini(thread, taskgroup);
2819 // cleanup fields in team structure:
2820 // TODO: is relaxed store enough here (whole barrier should follow)?
2821 __kmp_thread_free(thread, reduce_data);
2822 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2823 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2824 } else {
2825 // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2826 // so do not finalize reduction, just clean own copy of the data
2827 __kmp_task_reduction_clean(thread, taskgroup);
2828 }
2829 } else {
2830 // finishing task reduction on taskgroup
2831 __kmp_task_reduction_fini(thread, taskgroup);
2832 }
2833 }
2834 // Restore parent taskgroup for the current task
2835 taskdata->td_taskgroup = taskgroup->parent;
2836 __kmp_thread_free(thread, taskgroup);
2837
2838 KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2839 gtid, taskdata));
2840
2841#if OMPT_SUPPORT && OMPT_OPTIONAL
2842 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2843 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2844 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2845 &(my_task_data), codeptr);
2846 }
2847#endif
2848}
2849
// __kmp_get_priority_task: pop one task from the task team's priority-task
// lists. First "reserves" a task by atomically decrementing
// tt_num_task_pri, then walks the priority lists for a deque with tasks,
// honoring the Task Scheduling Constraint via __kmp_task_is_allowed.
// Returns the task, or NULL (restoring the reserved count) if none is
// eligible.
// NOTE(review): the first line of the signature (orig. 2850, presumably
// `static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,`) is missing
// from this extraction -- confirm upstream.
2851 kmp_task_team_t *task_team,
2852 kmp_int32 is_constrained) {
2853 kmp_task_t *task = NULL;
2854 kmp_taskdata_t *taskdata;
2855 kmp_taskdata_t *current;
2856 kmp_thread_data_t *thread_data;
2857 int ntasks = task_team->tt.tt_num_task_pri;
2858 if (ntasks == 0) {
2859 KA_TRACE(
2860 20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
2861 return NULL;
2862 }
// CAS loop: claim one task by decrementing the global priority-task count;
// retry if another thread raced us, give up if the count drains to zero.
2863 do {
2864 // decrement num_tasks to "reserve" one task to get for execution
2865 if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
2866 ntasks - 1))
2867 break;
2868 ntasks = task_team->tt.tt_num_task_pri;
2869 } while (ntasks > 0);
2870 if (ntasks == 0) {
2871 KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
2872 __kmp_get_gtid()));
2873 return NULL;
2874 }
2875 // We got a "ticket" to get a "reserved" priority task
// Walk the priority lists (highest priority first -- presumably; list order
// is established elsewhere) until a non-empty deque is found. The loop can
// assert list != NULL because the reserved count guarantees a task exists.
2876 int deque_ntasks;
2877 kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
2878 do {
2879 KMP_ASSERT(list != NULL);
2880 thread_data = &list->td;
2881 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2882 deque_ntasks = thread_data->td.td_deque_ntasks;
2883 if (deque_ntasks == 0) {
2884 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2885 KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
2886 __kmp_get_gtid(), thread_data));
2887 list = list->next;
2888 }
2889 } while (deque_ntasks == 0);
2890 KMP_DEBUG_ASSERT(deque_ntasks);
2891 int target = thread_data->td.td_deque_head;
2892 current = __kmp_threads[gtid]->th.th_current_task;
2893 taskdata = thread_data->td.td_deque[target];
2894 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2895 // Bump head pointer and Wrap.
2896 thread_data->td.td_deque_head =
2897 (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2898 } else {
2899 if (!task_team->tt.tt_untied_task_encountered) {
2900 // The TSC does not allow to steal victim task
2901 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2902 KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
2903 "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
2904 gtid, thread_data, task_team, deque_ntasks, target,
2905 thread_data->td.td_deque_tail));
2906 task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2907 return NULL;
2908 }
2909 int i;
2910 // walk through the deque trying to steal any task
2911 taskdata = NULL;
2912 for (i = 1; i < deque_ntasks; ++i) {
2913 target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2914 taskdata = thread_data->td.td_deque[target];
2915 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2916 break; // found task to execute
2917 } else {
2918 taskdata = NULL;
2919 }
2920 }
2921 if (taskdata == NULL) {
2922 // No appropriate candidate found to execute
2923 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2924 KA_TRACE(
2925 10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
2926 "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
2927 gtid, thread_data, task_team, deque_ntasks,
2928 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2929 task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2930 return NULL;
2931 }
// Close the gap left by the removed middle element: shift the tail portion
// of the deque left by one slot, then pull the tail index back.
2932 int prev = target;
2933 for (i = i + 1; i < deque_ntasks; ++i) {
2934 // shift remaining tasks in the deque left by 1
2935 target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2936 thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
2937 prev = target;
2938 }
// NOTE(review): orig. line 2939 (head of a KMP_DEBUG_ASSERT) is missing
// from this extraction; the two lines below are its continuation.
2940 thread_data->td.td_deque_tail ==
2941 (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
2942 thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped))
2943 }
2944 thread_data->td.td_deque_ntasks = deque_ntasks - 1;
2945 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2946 task = KMP_TASKDATA_TO_TASK(taskdata);
2947 return task;
2948}
2949
2950// __kmp_remove_my_task: remove a task from my own deque
// Pops from the TAIL of the calling thread's deque (LIFO for the owner;
// thieves take from the head), subject to the Task Scheduling Constraint.
// Returns the task or NULL. The cheap unlocked ntasks check is repeated
// under the lock before committing.
// NOTE(review): orig. lines 2951, 2954, 2957 and 2959 (first signature line
// `static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32
// gtid,`, plus local declarations such as `kmp_task_t *task;` and
// `kmp_uint32 tail;`, and a leading KMP_DEBUG_ASSERT) are missing from this
// extraction -- confirm upstream.
2952 kmp_task_team_t *task_team,
2953 kmp_int32 is_constrained) {
2955 kmp_taskdata_t *taskdata;
2956 kmp_thread_data_t *thread_data;
2958
2960 KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2961 NULL); // Caller should check this condition
2962
2963 thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2964
2965 KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2966 gtid, thread_data->td.td_deque_ntasks,
2967 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2968
// Fast path: empty deque, no need to take the lock.
2969 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2970 KA_TRACE(10,
2971 ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2972 "ntasks=%d head=%u tail=%u\n",
2973 gtid, thread_data->td.td_deque_ntasks,
2974 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2975 return NULL;
2976 }
2977
2978 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2979
// Re-check under the lock: a thief may have emptied the deque in between.
2980 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2981 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2982 KA_TRACE(10,
2983 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2984 "ntasks=%d head=%u tail=%u\n",
2985 gtid, thread_data->td.td_deque_ntasks,
2986 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2987 return NULL;
2988 }
2989
2990 tail = (thread_data->td.td_deque_tail - 1) &
2991 TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2992 taskdata = thread_data->td.td_deque[tail];
2993
2994 if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2995 thread->th.th_current_task)) {
2996 // The TSC does not allow to steal victim task
2997 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2998 KA_TRACE(10,
2999 ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
3000 "ntasks=%d head=%u tail=%u\n",
3001 gtid, thread_data->td.td_deque_ntasks,
3002 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3003 return NULL;
3004 }
3005
3006 thread_data->td.td_deque_tail = tail;
3007 TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
3008
3009 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3010
3011 KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
3012 "ntasks=%d head=%u tail=%u\n",
3013 gtid, taskdata, thread_data->td.td_deque_ntasks,
3014 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3015
3016 task = KMP_TASKDATA_TO_TASK(taskdata);
3017 return task;
3018}
3019
3020// __kmp_steal_task: remove a task from another thread's deque
3021// Assume that calling thread has already checked existence of
3022// task_team thread_data before calling this routine.
// Steals from the HEAD of the victim's deque (owner pops from the tail).
// If the head task is blocked by the Task Scheduling Constraint, may scan
// the rest of the deque for any eligible task, removing it from the middle
// and compacting the deque. Also un-finishes this thread at the barrier
// (re-increments unfinished_threads) when it steals after having reported
// itself finished.
// NOTE(review): orig. lines 3023, 3028, 3032 and 3035 (first signature line
// `static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32
// gtid,`, plus local declarations such as `kmp_task_t *task;` and
// `kmp_int32 target;`, and a leading KMP_DEBUG_ASSERT) are missing from
// this extraction -- confirm upstream.
3024 kmp_task_team_t *task_team,
3025 std::atomic<kmp_int32> *unfinished_threads,
3026 int *thread_finished,
3027 kmp_int32 is_constrained) {
3029 kmp_taskdata_t *taskdata;
3030 kmp_taskdata_t *current;
3031 kmp_thread_data_t *victim_td, *threads_data;
3033 kmp_info_t *victim_thr;
3034
3036
3037 threads_data = task_team->tt.tt_threads_data;
3038 KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3039 KMP_DEBUG_ASSERT(victim_tid >= 0);
3040 KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_max_threads);
3041
3042 victim_td = &threads_data[victim_tid];
3043 victim_thr = victim_td->td.td_thr;
3044 (void)victim_thr; // Use in TRACE messages which aren't always enabled.
3045
3046 KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3047 "task_team=%p ntasks=%d head=%u tail=%u\n",
3048 gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3049 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3050 victim_td->td.td_deque_tail));
3051
// Fast path: victim deque empty, skip the lock entirely.
3052 if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
3053 KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3054 "task_team=%p ntasks=%d head=%u tail=%u\n",
3055 gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3056 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3057 victim_td->td.td_deque_tail));
3058 return NULL;
3059 }
3060
3061 __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
3062
3063 int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3064 // Check again after we acquire the lock
3065 if (ntasks == 0) {
3066 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3067 KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3068 "task_team=%p ntasks=%d head=%u tail=%u\n",
3069 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3070 victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3071 return NULL;
3072 }
3073
3074 KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3075 current = __kmp_threads[gtid]->th.th_current_task;
3076 taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3077 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3078 // Bump head pointer and Wrap.
3079 victim_td->td.td_deque_head =
3080 (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
3081 } else {
3082 if (!task_team->tt.tt_untied_task_encountered) {
3083 // The TSC does not allow to steal victim task
3084 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3085 KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3086 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3087 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3088 victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3089 return NULL;
3090 }
3091 int i;
3092 // walk through victim's deque trying to steal any task
3093 target = victim_td->td.td_deque_head;
3094 taskdata = NULL;
3095 for (i = 1; i < ntasks; ++i) {
3096 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3097 taskdata = victim_td->td.td_deque[target];
3098 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3099 break; // found victim task
3100 } else {
3101 taskdata = NULL;
3102 }
3103 }
3104 if (taskdata == NULL) {
3105 // No appropriate candidate to steal found
3106 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3107 KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3108 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3109 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3110 victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3111 return NULL;
3112 }
// Compact the deque over the removed middle element, then pull tail back.
3113 int prev = target;
3114 for (i = i + 1; i < ntasks; ++i) {
3115 // shift remaining tasks in the deque left by 1
3116 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3117 victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3118 prev = target;
3119 }
// NOTE(review): orig. line 3120 (head of a KMP_DEBUG_ASSERT) is missing
// from this extraction; the two lines below are its continuation.
3121 victim_td->td.td_deque_tail ==
3122 (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
3123 victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped))
3124 }
3125 if (*thread_finished) {
3126 // We need to un-mark this victim as a finished victim. This must be done
3127 // before releasing the lock, or else other threads (starting with the
3128 // primary thread victim) might be prematurely released from the barrier!!!
3129#if KMP_DEBUG
// NOTE(review): orig. line 3130 (debug-only capture of the counter,
// presumably `kmp_int32 count = ...`) is missing from this extraction;
// `count` is referenced by the KA_TRACE below.
3131#endif
3132 KMP_ATOMIC_INC(unfinished_threads);
3133 KA_TRACE(
3134 20,
3135 ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3136 gtid, count + 1, task_team));
3137 *thread_finished = FALSE;
3138 }
3139 TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
3140
3141 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3142
3143 KMP_COUNT_BLOCK(TASK_stolen);
3144 KA_TRACE(10,
3145 ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3146 "task_team=%p ntasks=%d head=%u tail=%u\n",
3147 gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3148 ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3149
3150 task = KMP_TASKDATA_TO_TASK(taskdata);
3151 return task;
3152}
3153
3154// __kmp_execute_tasks_template: Choose and execute tasks until either the
3155// condition is satisfied (return true) or there are none left (return false).
3156//
3157// final_spin is TRUE if this is the spin at the release barrier.
3158// thread_finished indicates whether the thread is finished executing all
3159// the tasks it has on its deque, and is at the release barrier.
3160// spinner is the location on which to spin.
3161// spinner == NULL means only execute a single task and return.
3162// checker is the value to check to terminate the spin.
//
// Task selection order: priority tasks first, then the thread's own deque,
// then stealing from other threads (preferring the last successful victim,
// otherwise a random victim, waking it if asleep).
3163template <class C>
// NOTE(review): orig. line 3164 (the function name/signature head,
// presumably `static inline int __kmp_execute_tasks_template(`) is missing
// from this extraction -- confirm upstream.
3165 kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
3166 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3167 kmp_int32 is_constrained) {
3168 kmp_task_team_t *task_team = thread->th.th_task_team;
3169 kmp_thread_data_t *threads_data;
// NOTE(review): orig. line 3170 (a local declaration, presumably
// `kmp_task_t *task;`) is missing from this extraction.
3171 kmp_info_t *other_thread;
3172 kmp_taskdata_t *current_task = thread->th.th_current_task;
3173 std::atomic<kmp_int32> *unfinished_threads;
// victim_tid == -2 means "no victim chosen yet"; -1 means "no last-stolen
// victim recorded".
3174 kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
3175 tid = thread->th.th_info.ds.ds_tid;
3176
// NOTE(review): orig. line 3177 (a leading KMP_DEBUG_ASSERT) is missing
// from this extraction.
3178 KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3179
3180 if (task_team == NULL || current_task == NULL)
3181 return FALSE;
3182
3183 KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3184 "*thread_finished=%d\n",
3185 gtid, final_spin, *thread_finished));
3186
3187 thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3188 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3189
3190 KMP_DEBUG_ASSERT(threads_data != NULL);
3191
3192 nthreads = task_team->tt.tt_nproc;
3193 unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3194 KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
3195
3196 while (1) { // Outer loop keeps trying to find tasks in case of single thread
3197 // getting tasks from target constructs
3198 while (1) { // Inner loop to find a task and execute it
3199#if ENABLE_LIBOMPTARGET
3200 // Give an opportunity to the offload runtime to make progress
3201 if (UNLIKELY(kmp_target_sync_cb))
3202 (*kmp_target_sync_cb)(NULL, gtid, KMP_TASKDATA_TO_TASK(current_task),
3203 NULL);
3204#endif // ENABLE_LIBOMPTARGET
3205
3206 task = NULL;
3207 if (task_team->tt.tt_num_task_pri) { // get priority task first
3208 task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3209 }
3210 if (task == NULL && use_own_tasks) { // check own queue next
3211 task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3212 }
3213 if ((task == NULL) && (nthreads > 1)) { // Steal a task finally
3214 int asleep = 1;
3215 use_own_tasks = 0;
3216 // Try to steal from the last place I stole from successfully.
3217 if (victim_tid == -2) { // haven't stolen anything yet
3218 victim_tid = threads_data[tid].td.td_deque_last_stolen;
3219 if (victim_tid !=
3220 -1) // if we have a last stolen from victim, get the thread
3221 other_thread = threads_data[victim_tid].td.td_thr;
3222 }
3223 if (victim_tid != -1) { // found last victim
3224 asleep = 0;
3225 } else if (!new_victim) { // no recent steals and we haven't already
3226 // used a new victim; select a random thread
3227 do { // Find a different thread to steal work from.
3228 // Pick a random thread. Initial plan was to cycle through all the
3229 // threads, and only return if we tried to steal from every thread,
3230 // and failed. Arch says that's not such a great idea.
3231 victim_tid = __kmp_get_random(thread) % (nthreads - 1);
3232 if (victim_tid >= tid) {
3233 ++victim_tid; // Adjusts random distribution to exclude self
3234 }
3235 // Found a potential victim
3236 other_thread = threads_data[victim_tid].td.td_thr;
3237 // There is a slight chance that __kmp_enable_tasking() did not wake
3238 // up all threads waiting at the barrier. If victim is sleeping,
3239 // then wake it up. Since we were going to pay the cache miss
3240 // penalty for referencing another thread's kmp_info_t struct
3241 // anyway,
3242 // the check shouldn't cost too much performance at this point. In
3243 // extra barrier mode, tasks do not sleep at the separate tasking
3244 // barrier, so this isn't a problem.
3245 asleep = 0;
// NOTE(review): orig. lines 3246-3247 (the condition guarding the sleeping
// check, presumably an `if ((__kmp_tasking_mode == tskm_task_teams) && ...`)
// are missing from this extraction; the lines below are its continuation.
3248 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3249 NULL)) {
3250 asleep = 1;
3251 __kmp_null_resume_wrapper(other_thread);
3252 // A sleeping thread should not have any tasks on it's queue.
3253 // There is a slight possibility that it resumes, steals a task
3254 // from another thread, which spawns more tasks, all in the time
3255 // that it takes this thread to check => don't write an assertion
3256 // that the victim's queue is empty. Try stealing from a
3257 // different thread.
3258 }
3259 } while (asleep);
3260 }
3261
3262 if (!asleep) {
3263 // We have a victim to try to steal from
3264 task =
3265 __kmp_steal_task(victim_tid, gtid, task_team, unfinished_threads,
3266 thread_finished, is_constrained);
3267 }
3268 if (task != NULL) { // set last stolen to victim
3269 if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3270 threads_data[tid].td.td_deque_last_stolen = victim_tid;
3271 // The pre-refactored code did not try more than 1 successful new
3272 // victim, unless the last one generated more local tasks;
3273 // new_victim keeps track of this
3274 new_victim = 1;
3275 }
3276 } else { // No tasks found; unset last_stolen
3277 KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3278 victim_tid = -2; // no successful victim found
3279 }
3280 }
3281
3282 if (task == NULL)
3283 break; // break out of tasking loop
3284
3285// Found a task; execute it
3286#if USE_ITT_BUILD && USE_ITT_NOTIFY
3287 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3288 if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3289 // get the object reliably
3290 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3291 }
3292 __kmp_itt_task_starting(itt_sync_obj);
3293 }
3294#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3295 __kmp_invoke_task(gtid, task, current_task);
3296#if USE_ITT_BUILD
3297 if (itt_sync_obj != NULL)
3298 __kmp_itt_task_finished(itt_sync_obj);
3299#endif /* USE_ITT_BUILD */
3300 // If this thread is only partway through the barrier and the condition is
3301 // met, then return now, so that the barrier gather/release pattern can
3302 // proceed. If this thread is in the last spin loop in the barrier,
3303 // waiting to be released, we know that the termination condition will not
3304 // be satisfied, so don't waste any cycles checking it.
3305 if (flag == NULL || (!final_spin && flag->done_check())) {
3306 KA_TRACE(
3307 15,
3308 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3309 gtid));
3310 return TRUE;
3311 }
3312 if (thread->th.th_task_team == NULL) {
3313 break;
3314 }
3315 KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3316 // If execution of a stolen task results in more tasks being placed on our
3317 // run queue, reset use_own_tasks
3318 if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3319 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3320 "other tasks, restart\n",
3321 gtid));
3322 use_own_tasks = 1;
3323 new_victim = 0;
3324 }
3325 }
3326
3327 // The task source has been exhausted. If in final spin loop of barrier,
3328 // check if termination condition is satisfied. The work queue may be empty
3329 // but there might be proxy tasks still executing.
3330 if (final_spin &&
3331 KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3332 // First, decrement the #unfinished threads, if that has not already been
3333 // done. This decrement might be to the spin location, and result in the
3334 // termination condition being satisfied.
3335 if (!*thread_finished) {
3336#if KMP_DEBUG
3337 kmp_int32 count = -1 +
3338#endif
3339 KMP_ATOMIC_DEC(unfinished_threads);
3340 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3341 "unfinished_threads to %d task_team=%p\n",
3342 gtid, count, task_team));
3343 *thread_finished = TRUE;
3344 }
3345
3346 // It is now unsafe to reference thread->th.th_team !!!
3347 // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3348 // thread to pass through the barrier, where it might reset each thread's
3349 // th.th_team field for the next parallel region. If we can steal more
3350 // work, we know that this has not happened yet.
3351 if (flag != NULL && flag->done_check()) {
3352 KA_TRACE(
3353 15,
3354 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3355 gtid));
3356 return TRUE;
3357 }
3358 }
3359
3360 // If this thread's task team is NULL, primary thread has recognized that
3361 // there are no more tasks; bail out
3362 if (thread->th.th_task_team == NULL) {
3363 KA_TRACE(15,
3364 ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3365 return FALSE;
3366 }
3367
3368 // Check the flag again to see if it has already been satisfied, to avoid
3369 // being trapped in an infinite loop when an if0 task depends on a hidden
3370 // helper task outside any parallel region. Detached tasks are not impacted
3371 // in this case because the only thread executing this function has to
3372 // execute the proxy task so it is in another code path that has the same
3373 // check.
3373 if (flag == NULL || (!final_spin && flag->done_check())) {
3374 KA_TRACE(15,
3375 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3376 gtid));
3377 return TRUE;
3378 }
3379
3380 // We could be getting tasks from target constructs; if this is the only
3381 // thread, keep trying to execute tasks from own queue
3382 if (nthreads == 1 &&
// NOTE(review): orig. line 3383 (the second operand of this condition,
// presumably a check of current_task's incomplete child tasks) is missing
// from this extraction -- confirm upstream.
3384 use_own_tasks = 1;
3385 else {
3386 KA_TRACE(15,
3387 ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3388 return FALSE;
3389 }
3390 }
3391}
3392
// Thin 32-bit-flag wrapper forwarding to __kmp_execute_tasks_template.
// NOTE(review): orig. lines 3394 and 3398 (function name line, presumably
// `int __kmp_execute_tasks_32(`, and the `return
// __kmp_execute_tasks_template(` line) are missing from this extraction.
3393template <bool C, bool S>
3395 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3396 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3397 kmp_int32 is_constrained) {
3399 thread, gtid, flag, final_spin,
3400 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3401}
3402
// Thin 64-bit-flag wrapper forwarding to __kmp_execute_tasks_template.
// NOTE(review): orig. lines 3404 and 3408 (function name line, presumably
// `int __kmp_execute_tasks_64(`, and the `return
// __kmp_execute_tasks_template(` line) are missing from this extraction.
3403template <bool C, bool S>
3405 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3406 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3407 kmp_int32 is_constrained) {
3409 thread, gtid, flag, final_spin,
3410 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3411}
3412
// Thin atomic-64-bit-flag wrapper forwarding to __kmp_execute_tasks_template.
// NOTE(review): orig. lines 3414-3415 and 3418 (function name/first
// parameter lines, presumably `int __kmp_atomic_execute_tasks_64(...)`, and
// the `return __kmp_execute_tasks_template(` line) are missing from this
// extraction.
3413template <bool C, bool S>
3416 int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3417 kmp_int32 is_constrained) {
3419 thread, gtid, flag, final_spin,
3420 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3421}
3422
// Thin oncore-flag wrapper forwarding to __kmp_execute_tasks_template.
// NOTE(review): orig. lines 3423 and 3427 (function name line, presumably
// `int __kmp_execute_tasks_oncore(`, and the `return
// __kmp_execute_tasks_template(` line) are missing from this extraction.
3424 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3425 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3426 kmp_int32 is_constrained) {
3428 thread, gtid, flag, final_spin,
3429 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3430}
3431
// Explicit template instantiations of the __kmp_execute_tasks_* wrappers
// for the flag-type combinations used elsewhere in the runtime.
// NOTE(review): several orig. lines (3433-3434, 3437-3438, 3443-3444,
// 3449-3450, 3453-3454 -- the instantiated function names and flag
// template arguments) are missing from this extraction; only the trailing
// parameter lists survive below. Confirm against upstream kmp_tasking.cpp.
3432template int
3435 int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3436
3439 int,
3440 int *USE_ITT_BUILD_ARG(void *),
3441 kmp_int32);
3442
3445 int,
3446 int *USE_ITT_BUILD_ARG(void *),
3447 kmp_int32);
3448
3451 int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3452
3455 int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3456
3457// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3458// next barrier so they can assist in executing enqueued tasks.
3459// First thread in allocates the task team atomically.
// NOTE(review): orig. line 3460 (the first line of the signature, presumably
// `static void __kmp_enable_tasking(kmp_task_team_t *task_team,`) is missing
// from this extraction -- confirm upstream.
3461 kmp_info_t *this_thr) {
3462 kmp_thread_data_t *threads_data;
3463 int nthreads, i, is_init_thread;
3464
3465 KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3466 __kmp_gtid_from_thread(this_thr)));
3467
3468 KMP_DEBUG_ASSERT(task_team != NULL);
3469 KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3470
3471 nthreads = task_team->tt.tt_nproc;
3472 KMP_DEBUG_ASSERT(nthreads > 0);
3473 KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3474
3475 // Allocate or increase the size of threads_data if necessary
3476 is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3477
// Only the single thread that won the allocation race proceeds to wake the
// others; everyone else returns immediately.
3478 if (!is_init_thread) {
3479 // Some other thread already set up the array.
3480 KA_TRACE(
3481 20,
3482 ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3483 __kmp_gtid_from_thread(this_thr)));
3484 return;
3485 }
3486 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3487 KMP_DEBUG_ASSERT(threads_data != NULL);
3488
// NOTE(review): orig. lines 3489-3490 (presumably an `if` guard on the
// tasking/barrier mode opening the block closed at orig. 3519 below) are
// missing from this extraction -- confirm upstream.
3491 // Release any threads sleeping at the barrier, so that they can steal
3492 // tasks and execute them. In extra barrier mode, tasks do not sleep
3493 // at the separate tasking barrier, so this isn't a problem.
3494 for (i = 0; i < nthreads; i++) {
3495 void *sleep_loc;
3496 kmp_info_t *thread = threads_data[i].td.td_thr;
3497
// Don't try to wake ourselves.
3498 if (i == this_thr->th.th_info.ds.ds_tid) {
3499 continue;
3500 }
3501 // Since we haven't locked the thread's suspend mutex lock at this
3502 // point, there is a small window where a thread might be putting
3503 // itself to sleep, but hasn't set the th_sleep_loc field yet.
3504 // To work around this, __kmp_execute_tasks_template() periodically checks
3505 // see if other threads are sleeping (using the same random mechanism that
3506 // is used for task stealing) and awakens them if they are.
3507 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3508 NULL) {
3509 KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3510 __kmp_gtid_from_thread(this_thr),
3511 __kmp_gtid_from_thread(thread)));
// NOTE(review): orig. line 3512 (the wake-up call, presumably
// `__kmp_null_resume_wrapper(thread);`) is missing from this extraction.
3513 } else {
3514 KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3515 __kmp_gtid_from_thread(this_thr),
3516 __kmp_gtid_from_thread(thread)));
3517 }
3518 }
3519 }
3520
3521 KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3522 __kmp_gtid_from_thread(this_thr)));
3523}
3524
3525/* // TODO: Check the comment consistency
3526 * Utility routines for "task teams". A task team (kmp_task_t) is kind of
3527 * like a shadow of the kmp_team_t data struct, with a different lifetime.
3528 * After a child thread checks into a barrier and calls __kmp_release() from
3529 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3530 * longer assume that the kmp_team_t structure is intact (at any moment, the
3531 * primary thread may exit the barrier code and free the team data structure,
3532 * and return the threads to the thread pool).
3533 *
3534 * This does not work with the tasking code, as the thread is still
3535 * expected to participate in the execution of any tasks that may have been
3536 * spawned by a member of the team, and the thread still needs access to
3537 * each thread in the team, so that it can steal work from it.
3538 *
3539 * Enter the existence of the kmp_task_team_t struct. It employs a reference
3540 * counting mechanism, and is allocated by the primary thread before calling
3542 * __kmp_<barrier_kind>_release, and then is released by the last thread to
3542 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3543 * of the kmp_task_team_t structs for consecutive barriers can overlap
3544 * (and will, unless the primary thread is the last thread to exit the barrier
3545 * release phase, which is not typical). The existence of such a struct is
3546 * useful outside the context of tasking.
3547 *
3548 * We currently use the existence of the threads array as an indicator that
3549 * tasks were spawned since the last barrier. If the structure is to be
3550 * useful outside the context of tasking, then this will have to change, but
3551 * not setting the field minimizes the performance impact of tasking on
3552 * barriers, when no explicit tasks were spawned (pushed, actually).
3553 */
3554
3556 NULL; // Free list for task_team data structures
3557// Lock for task team data structures
3560
3561// __kmp_alloc_task_deque:
3562// Allocates a task deque for a particular thread, and initialize the necessary
3563// data structures relating to the deque. This only happens once per thread
3564// per task team since task teams are recycled. No lock is needed during
3565// allocation since each thread allocates its own deque.
3567 kmp_thread_data_t *thread_data) {
3568 __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3569 KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3570
3571 // Initialize last stolen task field to "none"
3572 thread_data->td.td_deque_last_stolen = -1;
3573
3574 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3575 KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3576 KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3577
3578 KE_TRACE(
3579 10,
3580 ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3581 __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3582 // Allocate space for task deque, and zero the deque
3583 // Cannot use __kmp_thread_calloc() because threads not around for
3584 // kmp_reap_task_team( ).
3585 thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3587 thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3588}
3589
3590// __kmp_free_task_deque:
3591// Deallocates a task deque for a particular thread. Happens at library
3592// deallocation so don't need to reset all thread data fields.
3593static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3594 if (thread_data->td.td_deque != NULL) {
3595 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3596 TCW_4(thread_data->td.td_deque_ntasks, 0);
3597 __kmp_free(thread_data->td.td_deque);
3598 thread_data->td.td_deque = NULL;
3599 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3600 }
3601}
3602
3603// __kmp_realloc_task_threads_data:
3604// Allocates a threads_data array for a task team, either by allocating an
3605// initial array or enlarging an existing array. Only the first thread to get
3606// the lock allocs or enlarges the array and re-initializes the array elements.
3607// That thread returns "TRUE", the rest return "FALSE".
3608// Assumes that the new array size is given by task_team -> tt.tt_nproc.
3609// The current size is given by task_team -> tt.tt_max_threads.
3611 kmp_task_team_t *task_team) {
3612 kmp_thread_data_t **threads_data_p;
3613 kmp_int32 nthreads, maxthreads;
3614 int is_init_thread = FALSE;
3615
3616 if (TCR_4(task_team->tt.tt_found_tasks)) {
3617 // Already reallocated and initialized.
3618 return FALSE;
3619 }
3620
3621 threads_data_p = &task_team->tt.tt_threads_data;
3622 nthreads = task_team->tt.tt_nproc;
3623 maxthreads = task_team->tt.tt_max_threads;
3624
3625 // All threads must lock when they encounter the first task of the implicit
3626 // task region to make sure threads_data fields are (re)initialized before
3627 // used.
3629
3630 if (!TCR_4(task_team->tt.tt_found_tasks)) {
3631 // first thread to enable tasking
3632 kmp_team_t *team = thread->th.th_team;
3633 int i;
3634
3635 is_init_thread = TRUE;
3636 if (maxthreads < nthreads) {
3637
3638 if (*threads_data_p != NULL) {
3639 kmp_thread_data_t *old_data = *threads_data_p;
3640 kmp_thread_data_t *new_data = NULL;
3641
3642 KE_TRACE(
3643 10,
3644 ("__kmp_realloc_task_threads_data: T#%d reallocating "
3645 "threads data for task_team %p, new_size = %d, old_size = %d\n",
3646 __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3647 // Reallocate threads_data to have more elements than current array
3648 // Cannot use __kmp_thread_realloc() because threads not around for
3649 // kmp_reap_task_team( ). Note all new array entries are initialized
3650 // to zero by __kmp_allocate().
3651 new_data = (kmp_thread_data_t *)__kmp_allocate(
3652 nthreads * sizeof(kmp_thread_data_t));
3653 // copy old data to new data
3654 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3655 (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3656
3657 // Install the new data and free the old data
3658 (*threads_data_p) = new_data;
3659 __kmp_free(old_data);
3660 } else {
3661 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3662 "threads data for task_team %p, size = %d\n",
3663 __kmp_gtid_from_thread(thread), task_team, nthreads));
3664 // Make the initial allocate for threads_data array, and zero entries
3665 // Cannot use __kmp_thread_calloc() because threads not around for
3666 // kmp_reap_task_team( ).
3667 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3668 nthreads * sizeof(kmp_thread_data_t));
3669 }
3670 task_team->tt.tt_max_threads = nthreads;
3671 } else {
3672 // If array has (more than) enough elements, go ahead and use it
3673 KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3674 }
3675
3676 // initialize threads_data pointers back to thread_info structures
3677 for (i = 0; i < nthreads; i++) {
3678 kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3679 thread_data->td.td_thr = team->t.t_threads[i];
3680
3681 if (thread_data->td.td_deque_last_stolen >= nthreads) {
3682 // The last stolen field survives across teams / barrier, and the number
3683 // of threads may have changed. It's possible (likely?) that a new
3684 // parallel region will exhibit the same behavior as previous region.
3685 thread_data->td.td_deque_last_stolen = -1;
3686 }
3687 }
3688
3689 KMP_MB();
3690 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3691 }
3692
3694 return is_init_thread;
3695}
3696
3697// __kmp_free_task_threads_data:
3698// Deallocates a threads_data array for a task team, including any attached
3699// tasking deques. Only occurs at library shutdown.
3702 if (task_team->tt.tt_threads_data != NULL) {
3703 int i;
3704 for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3706 }
3707 __kmp_free(task_team->tt.tt_threads_data);
3708 task_team->tt.tt_threads_data = NULL;
3709 }
3711}
3712
3713// __kmp_free_task_pri_list:
3714// Deallocates tasking deques used for priority tasks.
3715// Only occurs at library shutdown.
3718 if (task_team->tt.tt_task_pri_list != NULL) {
3719 kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3720 while (list != NULL) {
3721 kmp_task_pri_t *next = list->next;
3722 __kmp_free_task_deque(&list->td);
3723 __kmp_free(list);
3724 list = next;
3725 }
3726 task_team->tt.tt_task_pri_list = NULL;
3727 }
3729}
3730
3731static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
3732 kmp_team_t *team) {
3733 int team_nth = team->t.t_nproc;
3734 // Only need to init if task team is isn't active or team size changed
3735 if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) {
3736 TCW_4(task_team->tt.tt_found_tasks, FALSE);
3737 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3739 TCW_4(task_team->tt.tt_nproc, team_nth);
3740 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
3741 TCW_4(task_team->tt.tt_active, TRUE);
3742 }
3743}
3744
3745// __kmp_allocate_task_team:
3746// Allocates a task team associated with a specific team, taking it from
3747// the global task team free list if possible. Also initializes data
3748// structures.
3750 kmp_team_t *team) {
3751 kmp_task_team_t *task_team = NULL;
3752
3753 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3754 (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3755
3756 if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3757 // Take a task team from the task team pool
3759 if (__kmp_free_task_teams != NULL) {
3760 task_team = __kmp_free_task_teams;
3762 task_team->tt.tt_next = NULL;
3763 }
3765 }
3766
3767 if (task_team == NULL) {
3768 KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3769 "task team for team %p\n",
3770 __kmp_gtid_from_thread(thread), team));
3771 // Allocate a new task team if one is not available. Cannot use
3772 // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3773 task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3776#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3777 // suppress race conditions detection on synchronization flags in debug mode
3778 // this helps to analyze library internals eliminating false positives
3779 __itt_suppress_mark_range(
3780 __itt_suppress_range, __itt_suppress_threading_errors,
3781 &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3782 __itt_suppress_mark_range(__itt_suppress_range,
3783 __itt_suppress_threading_errors,
3784 CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3785 sizeof(task_team->tt.tt_active));
3786#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3787 // Note: __kmp_allocate zeroes returned memory, othewise we would need:
3788 // task_team->tt.tt_threads_data = NULL;
3789 // task_team->tt.tt_max_threads = 0;
3790 // task_team->tt.tt_next = NULL;
3791 }
3792
3793 __kmp_task_team_init(task_team, team);
3794
3795 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3796 "unfinished_threads init'd to %d\n",
3797 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3799 return task_team;
3800}
3801
3802// __kmp_free_task_team:
3803// Frees the task team associated with a specific thread, and adds it
3804// to the global task team free list.
3806 KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3807 thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3808
3809 // Put task team back on free list
3811
3812 KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3813 task_team->tt.tt_next = __kmp_free_task_teams;
3814 TCW_PTR(__kmp_free_task_teams, task_team);
3815
3817}
3818
3819// __kmp_reap_task_teams:
3820// Free all the task teams on the task team free list.
3821// Should only be done during library shutdown.
3822// Cannot do anything that needs a thread structure or gtid since they are
3823// already gone.
3825 kmp_task_team_t *task_team;
3826
3827 if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3828 // Free all task_teams on the free list
3830 while ((task_team = __kmp_free_task_teams) != NULL) {
3831 __kmp_free_task_teams = task_team->tt.tt_next;
3832 task_team->tt.tt_next = NULL;
3833
3834 // Free threads_data if necessary
3835 if (task_team->tt.tt_threads_data != NULL) {
3837 }
3838 if (task_team->tt.tt_task_pri_list != NULL) {
3839 __kmp_free_task_pri_list(task_team);
3840 }
3841 __kmp_free(task_team);
3842 }
3844 }
3845}
3846
3847// View the array of two task team pointers as a pair of pointers:
3848// 1) a single task_team pointer
3849// 2) next pointer for stack
3850// Serial teams can create a stack of task teams for nested serial teams.
3852 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3853 kmp_task_team_list_t *current =
3854 (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
3855 kmp_task_team_list_t *node =
3857 node->task_team = current->task_team;
3858 node->next = current->next;
3859 thread->th.th_task_team = current->task_team = NULL;
3860 current->next = node;
3861}
3862
3863// Serial team pops a task team off the stack
3865 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3866 kmp_task_team_list_t *current =
3867 (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
3868 if (current->task_team) {
3869 __kmp_free_task_team(thread, current->task_team);
3870 }
3871 kmp_task_team_list_t *next = current->next;
3872 if (next) {
3873 current->task_team = next->task_team;
3874 current->next = next->next;
3875 KMP_DEBUG_ASSERT(next != current);
3876 __kmp_free(next);
3877 thread->th.th_task_team = current->task_team;
3878 }
3879}
3880
3881// __kmp_wait_to_unref_task_teams:
3882// Some threads could still be in the fork barrier release code, possibly
3883// trying to steal tasks. Wait for each thread to unreference its task team.
3885 kmp_info_t *thread;
3886 kmp_uint32 spins;
3887 kmp_uint64 time;
3888 int done;
3889
3890 KMP_INIT_YIELD(spins);
3891 KMP_INIT_BACKOFF(time);
3892
3893 for (;;) {
3894 done = TRUE;
3895
3896 // TODO: GEH - this may be is wrong because some sync would be necessary
3897 // in case threads are added to the pool during the traversal. Need to
3898 // verify that lock for thread pool is held when calling this routine.
3899 for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3900 thread = thread->th.th_next_pool) {
3901#if KMP_OS_WINDOWS
3902 DWORD exit_val;
3903#endif
3904 if (TCR_PTR(thread->th.th_task_team) == NULL) {
3905 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3906 __kmp_gtid_from_thread(thread)));
3907 continue;
3908 }
3909#if KMP_OS_WINDOWS
3910 // TODO: GEH - add this check for Linux* OS / OS X* as well?
3911 if (!__kmp_is_thread_alive(thread, &exit_val)) {
3912 thread->th.th_task_team = NULL;
3913 continue;
3914 }
3915#endif
3916
3917 done = FALSE; // Because th_task_team pointer is not NULL for this thread
3918
3919 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3920 "unreference task_team\n",
3921 __kmp_gtid_from_thread(thread)));
3922
3924 void *sleep_loc;
3925 // If the thread is sleeping, awaken it.
3926 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3927 NULL) {
3928 KA_TRACE(
3929 10,
3930 ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3933 }
3934 }
3935 }
3936 if (done) {
3937 break;
3938 }
3939
3940 // If oversubscribed or have waited a bit, yield.
3941 KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3942 }
3943}
3944
3945// __kmp_task_team_setup: Create a task_team for the current team, but use
3946// an already created, unused one if it already exists.
3949
3950 // For the serial and root teams, setup the first task team pointer to point
3951 // to task team. The other pointer is a stack of task teams from previous
3952 // serial levels.
3953 if (team == this_thr->th.th_serial_team ||
3954 team == this_thr->th.th_root->r.r_root_team) {
3955 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3956 if (team->t.t_task_team[0] == NULL) {
3957 team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
3958 KA_TRACE(
3959 20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3960 " for serial/root team %p\n",
3961 __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
3962
3963 } else
3964 __kmp_task_team_init(team->t.t_task_team[0], team);
3965 return;
3966 }
3967
3968 // If this task_team hasn't been created yet, allocate it. It will be used in
3969 // the region after the next.
3970 // If it exists, it is the current task team and shouldn't be touched yet as
3971 // it may still be in use.
3972 if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
3973 team->t.t_task_team[this_thr->th.th_task_state] =
3974 __kmp_allocate_task_team(this_thr, team);
3975 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3976 " for team %d at parity=%d\n",
3977 __kmp_gtid_from_thread(this_thr),
3978 team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
3979 this_thr->th.th_task_state));
3980 }
3981
3982 // After threads exit the release, they will call sync, and then point to this
3983 // other task_team; make sure it is allocated and properly initialized. As
3984 // threads spin in the barrier release phase, they will continue to use the
3985 // previous task_team struct(above), until they receive the signal to stop
3986 // checking for tasks (they can't safely reference the kmp_team_t struct,
3987 // which could be reallocated by the primary thread).
3988 int other_team = 1 - this_thr->th.th_task_state;
3989 KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
3990 if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3991 team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team);
3992 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
3993 "task_team %p for team %d at parity=%d\n",
3994 __kmp_gtid_from_thread(this_thr),
3995 team->t.t_task_team[other_team], team->t.t_id, other_team));
3996 } else { // Leave the old task team struct in place for the upcoming region;
3997 // adjust as needed
3998 kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3999 __kmp_task_team_init(task_team, team);
4000 // if team size has changed, the first thread to enable tasking will
4001 // realloc threads_data if necessary
4002 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
4003 "%p for team %d at parity=%d\n",
4004 __kmp_gtid_from_thread(this_thr),
4005 team->t.t_task_team[other_team], team->t.t_id, other_team));
4006 }
4007
4008 // For regular thread, task enabling should be called when the task is going
4009 // to be pushed to a dequeue. However, for the hidden helper thread, we need
4010 // it ahead of time so that some operations can be performed without race
4011 // condition.
4012 if (this_thr == __kmp_hidden_helper_main_thread) {
4013 for (int i = 0; i < 2; ++i) {
4014 kmp_task_team_t *task_team = team->t.t_task_team[i];
4015 if (KMP_TASKING_ENABLED(task_team)) {
4016 continue;
4017 }
4018 __kmp_enable_tasking(task_team, this_thr);
4019 for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
4020 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
4021 if (thread_data->td.td_deque == NULL) {
4023 }
4024 }
4025 }
4026 }
4027}
4028
4029// __kmp_task_team_sync: Propagation of task team data from team to threads
4030// which happens just after the release phase of a team barrier. This may be
4031// called by any thread. This is not called for serial or root teams.
4034 KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
4035 KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
4036
4037 // Toggle the th_task_state field, to switch which task_team this thread
4038 // refers to
4039 this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
4040
4041 // It is now safe to propagate the task team pointer from the team struct to
4042 // the current thread.
4043 TCW_PTR(this_thr->th.th_task_team,
4044 team->t.t_task_team[this_thr->th.th_task_state]);
4045 KA_TRACE(20,
4046 ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
4047 "%p from Team #%d (parity=%d)\n",
4048 __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
4049 team->t.t_id, this_thr->th.th_task_state));
4050}
4051
4052// __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
4053// barrier gather phase. Only called by the primary thread.
4054//
4055// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
4056// by passing in 0 optionally as the last argument. When wait is zero, primary
4057// thread does not wait for unfinished_threads to reach 0.
4059 kmp_info_t *this_thr,
4060 kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
4061 kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
4062
4064 KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4065
4066 if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4067 if (wait) {
4068 KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4069 "(for unfinished_threads to reach 0) on task_team = %p\n",
4070 __kmp_gtid_from_thread(this_thr), task_team));
4071 // Worker threads may have dropped through to release phase, but could
4072 // still be executing tasks. Wait here for tasks to complete. To avoid
4073 // memory contention, only primary thread checks termination condition.
4075 RCAST(std::atomic<kmp_uint32> *,
4076 &task_team->tt.tt_unfinished_threads),
4077 0U);
4078 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4079 }
4080 // Deactivate the old task team, so that the worker threads will stop
4081 // referencing it while spinning.
4082 KA_TRACE(
4083 20,
4084 ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4085 "setting active to false, setting local and team's pointer to NULL\n",
4086 __kmp_gtid_from_thread(this_thr), task_team));
4090 TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4091 KMP_MB();
4092
4093 TCW_PTR(this_thr->th.th_task_team, NULL);
4094 }
4095}
4096
4097// __kmp_tasking_barrier:
4098// This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4099// Internal function to execute all tasks prior to a regular barrier or a join
4100// barrier. It is a full barrier itself, which unfortunately turns regular
4101// barriers into double barriers and join barriers into 1 1/2 barriers.
4102void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
4103 std::atomic<kmp_uint32> *spin = RCAST(
4104 std::atomic<kmp_uint32> *,
4105 &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4106 int flag = FALSE;
4108
4109#if USE_ITT_BUILD
4110 KMP_FSYNC_SPIN_INIT(spin, NULL);
4111#endif /* USE_ITT_BUILD */
4112 kmp_flag_32<false, false> spin_flag(spin, 0U);
4113 while (!spin_flag.execute_tasks(thread, gtid, TRUE,
4114 &flag USE_ITT_BUILD_ARG(NULL), 0)) {
4115#if USE_ITT_BUILD
4116 // TODO: What about itt_sync_obj??
4117 KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4118#endif /* USE_ITT_BUILD */
4119
4120 if (TCR_4(__kmp_global.g.g_done)) {
4121 if (__kmp_global.g.g_abort)
4123 break;
4124 }
4125 KMP_YIELD(TRUE);
4126 }
4127#if USE_ITT_BUILD
4128 KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4129#endif /* USE_ITT_BUILD */
4130}
4131
4132// __kmp_give_task puts a task into a given thread queue if:
4133// - the queue for that thread was created
4134// - there's space in that queue
4135// Because of this, __kmp_push_task needs to check if there's space after
4136// getting the lock
4138 kmp_int32 pass) {
4140 kmp_task_team_t *task_team = taskdata->td_task_team;
4141
4142 KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4143 taskdata, tid));
4144
4145 // If task_team is NULL something went really bad...
4146 KMP_DEBUG_ASSERT(task_team != NULL);
4147
4148 bool result = false;
4149 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4150
4151 if (thread_data->td.td_deque == NULL) {
4152 // There's no queue in this thread, go find another one
4153 // We're guaranteed that at least one thread has a queue
4154 KA_TRACE(30,
4155 ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4156 tid, taskdata));
4157 return result;
4158 }
4159
4160 if (TCR_4(thread_data->td.td_deque_ntasks) >=
4161 TASK_DEQUE_SIZE(thread_data->td)) {
4162 KA_TRACE(
4163 30,
4164 ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4165 taskdata, tid));
4166
4167 // if this deque is bigger than the pass ratio give a chance to another
4168 // thread
4169 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4170 return result;
4171
4172 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4173 if (TCR_4(thread_data->td.td_deque_ntasks) >=
4174 TASK_DEQUE_SIZE(thread_data->td)) {
4175 // expand deque to push the task which is not allowed to execute
4176 __kmp_realloc_task_deque(thread, thread_data);
4177 }
4178
4179 } else {
4180
4181 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4182
4183 if (TCR_4(thread_data->td.td_deque_ntasks) >=
4184 TASK_DEQUE_SIZE(thread_data->td)) {
4185 KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
4186 "thread %d.\n",
4187 taskdata, tid));
4188
4189 // if this deque is bigger than the pass ratio give a chance to another
4190 // thread
4191 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4192 goto release_and_exit;
4193
4194 __kmp_realloc_task_deque(thread, thread_data);
4195 }
4196 }
4197
4198 // lock is held here, and there is space in the deque
4199
4200 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4201 // Wrap index.
4202 thread_data->td.td_deque_tail =
4203 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
4204 TCW_4(thread_data->td.td_deque_ntasks,
4205 TCR_4(thread_data->td.td_deque_ntasks) + 1);
4206
4207 result = true;
4208 KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4209 taskdata, tid));
4210
4211release_and_exit:
4212 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
4213
4214 return result;
4215}
4216
4217#define PROXY_TASK_FLAG 0x40000000
4218/* The finish of the proxy tasks is divided in two pieces:
4219 - the top half is the one that can be done from a thread outside the team
4220 - the bottom half must be run from a thread within the team
4221
4222 In order to run the bottom half the task gets queued back into one of the
4223 threads of the team. Once the td_incomplete_child_task counter of the parent
4224 is decremented the threads can leave the barriers. So, the bottom half needs
4225 to be queued before the counter is decremented. The top half is therefore
4226 divided in two parts:
4227 - things that can be run before queuing the bottom half
4228 - things that must be run after queuing the bottom half
4229
4230 This creates a second race as the bottom half can free the task before the
4231 second top half is executed. To avoid this we use the
4232 td_incomplete_child_task of the proxy task to synchronize the top and bottom
4233 half. */
4237 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
4238 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
4239
4240 taskdata->td_flags.complete = 1; // mark the task as completed
4241#if OMPX_TASKGRAPH
4242 taskdata->td_flags.onced = 1;
4243#endif
4244
4245 if (taskdata->td_taskgroup)
4246 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4247
4248 // Create an imaginary children for this task so the bottom half cannot
4249 // release the task before we have completed the second top half
4251}
4252
4254#if KMP_DEBUG
4255 kmp_int32 children = 0;
4256 // Predecrement simulated by "- 1" calculation
4257 children = -1 +
4258#endif
4260 KMP_DEBUG_ASSERT(children >= 0);
4261
4262 // Remove the imaginary children
4264}
4265
4268 kmp_info_t *thread = __kmp_threads[gtid];
4269
4272 1); // top half must run before bottom half
4273
4274 // We need to wait to make sure the top half is finished
4275 // Spinning here should be ok as this should happen quickly
4276 while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4277 PROXY_TASK_FLAG) > 0)
4278 ;
4279
4280 __kmp_release_deps(gtid, taskdata);
4281 __kmp_free_task_and_ancestors(gtid, taskdata, thread);
4282}
4283
4284/*!
4285@ingroup TASKING
4286@param gtid Global Thread ID of encountering thread
4287@param ptask Task which execution is completed
4288
4289Execute the completion of a proxy task from a thread of that is part of the
4290team. Run first and bottom halves directly.
4291*/
4293 KMP_DEBUG_ASSERT(ptask != NULL);
4295 KA_TRACE(
4296 10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4297 gtid, taskdata));
4300
4304
4305 KA_TRACE(10,
4306 ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4307 gtid, taskdata));
4308}
4309
4311 KMP_DEBUG_ASSERT(ptask != NULL);
4313
4314 // Enqueue task to complete bottom half completion from a thread within the
4315 // corresponding team
4316 kmp_team_t *team = taskdata->td_team;
4317 kmp_int32 nthreads = team->t.t_nproc;
4318 kmp_info_t *thread;
4319
4320 // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4321 // but we cannot use __kmp_get_random here
4322 kmp_int32 start_k = start % nthreads;
4323 kmp_int32 pass = 1;
4324 kmp_int32 k = start_k;
4325
4326 do {
4327 // For now we're just linearly trying to find a thread
4328 thread = team->t.t_threads[k];
4329 k = (k + 1) % nthreads;
4330
4331 // we did a full pass through all the threads
4332 if (k == start_k)
4333 pass = pass << 1;
4334
4335 } while (!__kmp_give_task(thread, k, ptask, pass));
4336
4338 // awake at least one thread to execute given task
4339 for (int i = 0; i < nthreads; ++i) {
4340 thread = team->t.t_threads[i];
4341 if (thread->th.th_sleep_loc != NULL) {
4343 break;
4344 }
4345 }
4346 }
4347}
4348
4349/*!
4350@ingroup TASKING
4351@param ptask Task which execution is completed
4352
4353Execute the completion of a proxy task from a thread that could not belong to
4354the team.
4355*/
4357 KMP_DEBUG_ASSERT(ptask != NULL);
4359
4360 KA_TRACE(
4361 10,
4362 ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4363 taskdata));
4364
4366
4368
4370
4372
4373 KA_TRACE(
4374 10,
4375 ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4376 taskdata));
4377}
4378
4389
4391 if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4392 kmp_task_t *ptask = event->ed.task;
4394 bool detached = false;
4395 int gtid = __kmp_get_gtid();
4396
4397 // The associated task might have completed or could be completing at this
4398 // point.
4399 // We need to take the lock to avoid races
4400 __kmp_acquire_tas_lock(&event->lock, gtid);
4401 if (taskdata->td_flags.proxy == TASK_PROXY) {
4402 detached = true;
4403 } else {
4404#if OMPT_SUPPORT
4405 // The OMPT event must occur under mutual exclusion,
4406 // otherwise the tool might access ptask after free
4407 if (UNLIKELY(ompt_enabled.enabled))
4408 __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4409#endif
4410 }
4412 __kmp_release_tas_lock(&event->lock, gtid);
4413
4414 if (detached) {
4415#if OMPT_SUPPORT
4416 // We free ptask afterwards and know the task is finished,
4417 // so locking is not necessary
4418 if (UNLIKELY(ompt_enabled.enabled))
4419 __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4420#endif
4421 // If the task detached complete the proxy task
4422 if (gtid >= 0) {
4423 kmp_team_t *team = taskdata->td_team;
4424 kmp_info_t *thread = __kmp_get_thread();
4425 if (thread->th.th_team == team) {
4427 return;
4428 }
4429 }
4430
4431 // fallback
4433 }
4434 }
4435}
4436
// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
// for taskloop
//
// thread: allocating thread
// task_src: pointer to source task to be duplicated
// taskloop_recur: used only when dealing with taskgraph,
// indicating whether we need to update task->td_task_id
// returns: a pointer to the allocated kmp_task_t structure (task).
#if OMPX_TASKGRAPH
                                 , int taskloop_recur
#endif
) {
  kmp_taskdata_t *taskdata;
  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
  size_t shareds_offset;
  size_t task_size;

  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
                task_src));
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
                   TASK_FULL); // it should not be proxy task
  task_size = taskdata_src->td_size_alloc;

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
                task_size));
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
#else
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
#endif /* USE_FAST_MEMORY */
  // Bitwise copy of the entire source block; fields that must differ in the
  // copy are patched individually below.
  KMP_MEMCPY(taskdata, taskdata_src, task_size);

  task = KMP_TASKDATA_TO_TASK(taskdata);

  // Initialize new task (only specific fields not affected by memcpy)
#if OMPX_TASKGRAPH
  // While a taskgraph is being recorded, each non-recursive duplicate gets
  // its own TDG task id.
  if (taskdata->is_taskgraph && !taskloop_recur &&
      __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
    taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
#endif
  // The copy needs a fresh unique task id.
  taskdata->td_task_id = KMP_GEN_TASK_ID();
  if (task->shareds != NULL) { // need setup shareds pointer
    // shareds live inside the same allocation; re-point them into the copy at
    // the same relative offset they had in the source block.
    shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
    task->shareds = &((char *)taskdata)[shareds_offset];
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  }
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  // task inherits the taskgroup from the parent task
  taskdata->td_taskgroup = parent_task->td_taskgroup;
  // tied task needs to initialize the td_last_tied at creation,
  // untied one does this when it is scheduled for execution
  if (taskdata->td_flags.tiedness == TASK_TIED)
    taskdata->td_last_tied = taskdata;

  // Only need to keep track of child task counts if team parallel and tasking
  // not serialized
  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
  }

  KA_TRACE(20,
           ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
            thread, taskdata, taskdata->td_parent));
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
#endif
  return task;
}
4519
// Routine optionally generated by the compiler for setting the lastprivate flag
// and calling needed constructors for private/firstprivate objects
// (used to form taskloop tasks from pattern task)
// Parameters: dest task, src task, lastprivate flag.

// The GOMP bound layout below relies on 'long' being either 4 or 8 bytes.
KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);

// class to encapsulate manipulating loop bounds in a taskloop task.
// this abstracts away the Intel vs GOMP taskloop interface for setting/getting
// the loop bound variables.
  kmp_task_t *task; // task whose loop bounds are manipulated
  const kmp_taskdata_t *taskdata; // internal task data of 'task'
  size_t lower_offset; // byte offset of the lower bound inside 'task'
  size_t upper_offset; // byte offset of the upper bound inside 'task'

public:
  // Construct from explicit pointers to the bound variables inside _task;
  // only their offsets relative to the task are retained.
      : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
        lower_offset((char *)lb - (char *)task),
        upper_offset((char *)ub - (char *)task) {
    KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
    KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
  }
  // Construct for a new task reusing the offsets captured in 'bounds'.
      : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
        lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
  size_t get_lower_offset() const { return lower_offset; }
  size_t get_upper_offset() const { return upper_offset; }
  // Read the lower bound, accounting for the GOMP long-sized layout.
    kmp_int64 retval;
#if defined(KMP_GOMP_COMPAT)
    // Intel task just returns the lower bound normally
    if (!taskdata->td_flags.native) {
      retval = *(kmp_int64 *)((char *)task + lower_offset);
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
        retval = (kmp_int64)*lb;
      } else {
        kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
        retval = (kmp_int64)*lb;
      }
    }
#else
    (void)taskdata;
    retval = *(kmp_int64 *)((char *)task + lower_offset);
#endif // defined(KMP_GOMP_COMPAT)
    return retval;
  }
  // Read the upper bound; in the GOMP layout it is stored right after the
  // lower bound in task->shareds.
    kmp_int64 retval;
#if defined(KMP_GOMP_COMPAT)
    // Intel task just returns the upper bound normally
    if (!taskdata->td_flags.native) {
      retval = *(kmp_int64 *)((char *)task + upper_offset);
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
        retval = (kmp_int64)*ub;
      } else {
        kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
        retval = (kmp_int64)*ub;
      }
    }
#else
    retval = *(kmp_int64 *)((char *)task + upper_offset);
#endif // defined(KMP_GOMP_COMPAT)
    return retval;
  }
  // Write the lower bound, mirroring the layout handling of the getter.
#if defined(KMP_GOMP_COMPAT)
    // Intel task just sets the lower bound normally
    if (!taskdata->td_flags.native) {
      *(kmp_uint64 *)((char *)task + lower_offset) = lb;
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
        *lower = (kmp_uint32)lb;
      } else {
        kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
        *lower = (kmp_uint64)lb;
      }
    }
#else
    *(kmp_uint64 *)((char *)task + lower_offset) = lb;
#endif // defined(KMP_GOMP_COMPAT)
  }
  // Write the upper bound, mirroring the layout handling of the getter.
#if defined(KMP_GOMP_COMPAT)
    // Intel task just sets the upper bound normally
    if (!taskdata->td_flags.native) {
      *(kmp_uint64 *)((char *)task + upper_offset) = ub;
    } else {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
        *upper = (kmp_uint32)ub;
      } else {
        kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
        *upper = (kmp_uint64)ub;
      }
    }
#else
    *(kmp_uint64 *)((char *)task + upper_offset) = ub;
#endif // defined(KMP_GOMP_COMPAT)
  }
};
4632
// __kmp_taskloop_linear: Start tasks of the taskloop linearly
//
// loc Source location information
// gtid Global thread ID
// task Pattern task, exposes the loop iteration range
// lb Pointer to loop lower bound in task structure
// ub Pointer to loop upper bound in task structure
// st Loop stride
// ub_glob Global upper bound (used for lastprivate check)
// num_tasks Number of tasks to execute
// grainsize Number of loop iterations per task
// extras Number of chunks with grainsize+1 iterations
// last_chunk Reduction of grainsize for last task
// tc Iterations count
// task_dup Tasks duplication routine
// codeptr_ra Return address for OMPT events
                          kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                          kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                          kmp_uint64 grainsize, kmp_uint64 extras,
                          kmp_int64 last_chunk, kmp_uint64 tc,
#if OMPT_SUPPORT
                          void *codeptr_ra,
#endif
                          void *task_dup) {
  KMP_COUNT_BLOCK(OMP_TASKLOOP);
  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  // compiler provides global bounds here
  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
  kmp_uint64 lower = task_bounds.get_lb();
  kmp_uint64 upper = task_bounds.get_ub();
  kmp_uint64 i;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  kmp_int32 lastpriv = 0;

  // Sanity: the chunking parameters must exactly cover the trip count.
  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
                             (last_chunk < 0 ? last_chunk : extras));
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
                "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
                gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
                ub_glob, st, task_dup));

  // Launch num_tasks tasks, assign grainsize iterations each task
  for (i = 0; i < num_tasks; ++i) {
    kmp_uint64 chunk_minus_1;
    if (extras == 0) {
      chunk_minus_1 = grainsize - 1;
    } else {
      chunk_minus_1 = grainsize;
      --extras; // first extras iterations get bigger chunk (grainsize+1)
    }
    upper = lower + st * chunk_minus_1;
    // Clamp the chunk's upper bound to the loop's upper bound.
    if (upper > *ub) {
      upper = *ub;
    }
    if (i == num_tasks - 1) {
      // schedule the last task, set lastprivate flag if needed
      if (st == 1) { // most common case
        KMP_DEBUG_ASSERT(upper == *ub);
        if (upper == ub_glob)
          lastpriv = 1;
      } else if (st > 0) { // positive loop stride
        KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
        if ((kmp_uint64)st > ub_glob - upper)
          lastpriv = 1;
      } else { // negative loop stride
        KMP_DEBUG_ASSERT(upper + st < *ub);
        if (upper - ub_glob < (kmp_uint64)(-st))
          lastpriv = 1;
      }
    }

#if OMPX_TASKGRAPH
    next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
#else
    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
#endif

    kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
    kmp_taskloop_bounds_t next_task_bounds =
        kmp_taskloop_bounds_t(next_task, task_bounds);

    // adjust task-specific bounds
    next_task_bounds.set_lb(lower);
    if (next_taskdata->td_flags.native) {
      // GOMP loops use an exclusive upper bound, hence the +/-1 adjustment.
      next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
    } else {
      next_task_bounds.set_ub(upper);
    }
    if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
                           // etc.
      ptask_dup(next_task, task, lastpriv);
    KA_TRACE(40,
             ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
              "upper %lld stride %lld, (offsets %p %p)\n",
              gtid, i, next_task, lower, upper, st,
              next_task_bounds.get_lower_offset(),
              next_task_bounds.get_upper_offset()));
#if OMPT_SUPPORT
    __kmp_omp_taskloop_task(NULL, gtid, next_task,
                            codeptr_ra); // schedule new task
#if OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_dispatch) {
      OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
                              lower, upper, st);
    }
#endif // OMPT_OPTIONAL
#else
    __kmp_omp_task(gtid, next_task, true); // schedule new task
#endif
    lower = upper + st; // adjust lower bound for the next iteration
  }
  // free the pattern task and exit
  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
  // do not execute the pattern task, just do internal bookkeeping
  __kmp_task_finish<false>(gtid, task, current_task);
}
4755
4756// Structure to keep taskloop parameters for auxiliary task
4757// kept in the shareds of the task structure.
4775
4779 kmp_uint64,
4780#if OMPT_SUPPORT
4781 void *,
4782#endif
4783 void *);
4784
// Execute part of the taskloop submitted as a task.
int __kmp_taskloop_task(int gtid, void *ptask) {
      (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
  // Unpack the taskloop parameters stored in the auxiliary task's shareds.
  kmp_task_t *task = p->task;
  kmp_uint64 *lb = p->lb;
  kmp_uint64 *ub = p->ub;
  void *task_dup = p->task_dup;
  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_int64 st = p->st;
  kmp_uint64 ub_glob = p->ub_glob;
  kmp_uint64 num_tasks = p->num_tasks;
  kmp_uint64 grainsize = p->grainsize;
  kmp_uint64 extras = p->extras;
  kmp_int64 last_chunk = p->last_chunk;
  kmp_uint64 tc = p->tc;
  kmp_uint64 num_t_min = p->num_t_min;
#if OMPT_SUPPORT
  void *codeptr_ra = p->codeptr_ra;
#endif
#if KMP_DEBUG
  KMP_DEBUG_ASSERT(task != NULL);
  KA_TRACE(20,
           ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
            " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
            gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
            st, task_dup));
#endif
  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
  // Keep splitting recursively while the task count is above the threshold;
  // otherwise spawn the remaining tasks linearly.
  if (num_tasks > num_t_min)
    __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
                         grainsize, extras, last_chunk, tc, num_t_min,
#if OMPT_SUPPORT
                         codeptr_ra,
#endif
                         task_dup);
  else
    __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, last_chunk, tc,
#if OMPT_SUPPORT
                          codeptr_ra,
#endif
                          task_dup);

  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
  return 0;
}
4833
// Schedule part of the taskloop as a task,
// execute the rest of the taskloop.
//
// loc Source location information
// gtid Global thread ID
// task Pattern task, exposes the loop iteration range
// lb Pointer to loop lower bound in task structure
// ub Pointer to loop upper bound in task structure
// st Loop stride
// ub_glob Global upper bound (used for lastprivate check)
// num_tasks Number of tasks to execute
// grainsize Number of loop iterations per task
// extras Number of chunks with grainsize+1 iterations
// last_chunk Reduction of grainsize for last task
// tc Iterations count
// num_t_min Threshold to launch tasks recursively
// task_dup Tasks duplication routine
// codeptr_ra Return address for OMPT events
                         kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                         kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                         kmp_uint64 grainsize, kmp_uint64 extras,
                         kmp_int64 last_chunk, kmp_uint64 tc,
                         kmp_uint64 num_t_min,
#if OMPT_SUPPORT
                         void *codeptr_ra,
#endif
                         void *task_dup) {
  KMP_DEBUG_ASSERT(task != NULL);
  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
  KA_TRACE(20,
           ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
            " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
            gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
            st, task_dup));
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_uint64 lower = *lb;
  kmp_info_t *thread = __kmp_threads[gtid];
  // kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  size_t lower_offset =
      (char *)lb - (char *)task; // remember offset of lb in the task structure
  size_t upper_offset =
      (char *)ub - (char *)task; // remember offset of ub in the task structure

  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
                             (last_chunk < 0 ? last_chunk : extras));
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);

  // split the loop in two halves
  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
  kmp_uint64 gr_size0 = grainsize;
  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
  if (last_chunk < 0) {
    // strict modifier in effect: the shortened chunk stays with the 2nd half
    ext0 = ext1 = 0;
    last_chunk1 = last_chunk;
    tc0 = grainsize * n_tsk0;
    tc1 = tc - tc0;
  } else if (n_tsk0 <= extras) {
    gr_size0++; // integrate extras into grainsize
    ext0 = 0; // no extra iters in 1st half
    ext1 = extras - n_tsk0; // remaining extras
    tc0 = gr_size0 * n_tsk0;
    tc1 = tc - tc0;
  } else { // n_tsk0 > extras
    ext1 = 0; // no extra iters in 2nd half
    ext0 = extras;
    tc1 = grainsize * n_tsk1;
    tc0 = tc - tc1;
  }
  ub0 = lower + st * (tc0 - 1);
  lb1 = ub0 + st;

  // create pattern task for 2nd half of the loop
#if OMPX_TASKGRAPH
  next_task = __kmp_task_dup_alloc(thread, task,
                                   /* taskloop_recur */ 1);
#else
  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
#endif
  // adjust lower bound (upper bound is not changed) for the 2nd half
  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
  if (ptask_dup != NULL) // construct firstprivates, etc.
    ptask_dup(next_task, task, 0);
  *ub = ub0; // adjust upper bound for the 1st half

  // create auxiliary task for 2nd half of the loop
  // make sure new task has same parent task as the pattern task
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  thread->th.th_current_task = taskdata->td_parent;
  kmp_task_t *new_task =
      __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
  // restore current task
  thread->th.th_current_task = current_task;
  // Pack the 2nd-half parameters into the auxiliary task's shareds.
  p->task = next_task;
  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
  p->task_dup = task_dup;
  p->st = st;
  p->ub_glob = ub_glob;
  p->num_tasks = n_tsk1;
  p->grainsize = grainsize;
  p->extras = ext1;
  p->last_chunk = last_chunk1;
  p->tc = tc1;
  p->num_t_min = num_t_min;
#if OMPT_SUPPORT
  p->codeptr_ra = codeptr_ra;
#endif

#if OMPX_TASKGRAPH
  kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
  new_task_data->tdg = taskdata->tdg;
  new_task_data->is_taskgraph = 0;
#endif

#if OMPT_SUPPORT
  // schedule new task with correct return address for OMPT events
  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
#else
  __kmp_omp_task(gtid, new_task, true); // schedule new task
#endif

  // execute the 1st half of current subrange
  if (n_tsk0 > num_t_min)
    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
                         ext0, last_chunk0, tc0, num_t_min,
#if OMPT_SUPPORT
                         codeptr_ra,
#endif
                         task_dup);
  else
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
                          gr_size0, ext0, last_chunk0, tc0,
#if OMPT_SUPPORT
                          codeptr_ra,
#endif
                          task_dup);

  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
}
4981
// Common taskloop implementation: computes the trip count, derives the
// number of tasks and grainsize from the schedule clause, then dispatches
// to the linear or recursive spawning strategy.
static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           int nogroup, int sched, kmp_uint64 grainsize,
                           int modifier, void *task_dup) {
  KMP_DEBUG_ASSERT(task != NULL);
  if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
    // Implicit taskgroup around the taskloop unless 'nogroup' was given.
    __kmpc_taskgroup(loc, gtid);
  }

#if OMPX_TASKGRAPH
  KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
#endif
  // =========================================================================
  // calculate loop parameters
  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
  kmp_uint64 tc;
  // compiler provides global bounds here
  kmp_uint64 lower = task_bounds.get_lb();
  kmp_uint64 upper = task_bounds.get_ub();
  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
  kmp_uint64 num_tasks = 0, extras = 0;
  kmp_int64 last_chunk =
      0; // reduce grainsize of last task by last_chunk in strict mode
  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;

  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
                "grain %llu(%d, %d), dup %p\n",
                gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
                task_dup));

  // compute trip count
  if (st == 1) { // most common case
    tc = upper - lower + 1;
  } else if (st < 0) {
    tc = (lower - upper) / (-st) + 1;
  } else { // st > 0
    tc = (upper - lower) / st + 1;
  }
  if (tc == 0) {
    KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
    // free the pattern task and exit
    __kmp_task_start(gtid, task, current_task);
    // do not execute anything for zero-trip loop
    __kmp_task_finish<false>(gtid, task, current_task);
    return;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (num_tasks_min == 0)
    // TODO: can we choose better default heuristic?
    num_tasks_min =
        KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);

  // compute num_tasks/grainsize based on the input provided
  switch (sched) {
  case 0: // no schedule clause specified, we can choose the default
    // let's try to schedule (team_size*10) tasks
    grainsize = thread->th.th_team_nproc * static_cast<kmp_uint64>(10);
    // treated as a num_tasks request below
  case 2: // num_tasks provided
    if (grainsize > tc) {
      num_tasks = tc; // too big num_tasks requested, adjust values
      grainsize = 1;
      extras = 0;
    } else {
      num_tasks = grainsize;
      grainsize = tc / num_tasks;
      extras = tc % num_tasks;
    }
    break;
  case 1: // grainsize provided
    if (grainsize > tc) {
      num_tasks = 1;
      grainsize = tc; // too big grainsize requested, adjust values
      extras = 0;
    } else {
      if (modifier) {
        // 'strict' modifier: keep exact grainsize, shorten only the last task
        num_tasks = (tc + grainsize - 1) / grainsize;
        last_chunk = tc - (num_tasks * grainsize);
        extras = 0;
      } else {
        num_tasks = tc / grainsize;
        // adjust grainsize for balanced distribution of iterations
        grainsize = tc / num_tasks;
        extras = tc % num_tasks;
      }
    }
    break;
  default:
    KMP_ASSERT2(0, "unknown scheduling of taskloop");
  }

  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
                             (last_chunk < 0 ? last_chunk : extras));
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  // =========================================================================

  // check if clause value first
  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
  if (if_val == 0) { // if(0) specified, mark task as serial
    taskdata->td_flags.task_serial = 1;
    taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
    // always start serial tasks linearly
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, last_chunk, tc,
#if OMPT_SUPPORT
#endif
                          task_dup);
    // !taskdata->td_flags.native => currently force linear spawning of tasks
    // for GOMP_taskloop
  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
    KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
                  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
                  last_chunk));
    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                         grainsize, extras, last_chunk, tc, num_tasks_min,
#if OMPT_SUPPORT
#endif
                         task_dup);
  } else {
    KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
                  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
                  last_chunk));
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, last_chunk, tc,
#if OMPT_SUPPORT
#endif
                          task_dup);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
  }
  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
}
5149
/*!
@ingroup TASKING
@param loc Source location information
@param gtid Global thread ID
@param task Task structure
@param if_val Value of the if clause
@param lb Pointer to loop lower bound in task structure
@param ub Pointer to loop upper bound in task structure
@param st Loop stride
@param nogroup Flag, 1 if nogroup clause specified, 0 otherwise
@param sched Schedule specified 0/1/2 for none/grainsize/num_tasks
@param grainsize Schedule value if specified
@param task_dup Tasks duplication routine

Execute the taskloop construct.
*/
void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
                     int sched, kmp_uint64 grainsize, void *task_dup) {
  KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
  // Delegate to the common implementation with modifier=0 (no 'strict').
  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
                 0, task_dup);
  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
}
5175
/*!
@ingroup TASKING
@param loc Source location information
@param gtid Global thread ID
@param task Task structure
@param if_val Value of the if clause
@param lb Pointer to loop lower bound in task structure
@param ub Pointer to loop upper bound in task structure
@param st Loop stride
@param nogroup Flag, 1 if nogroup clause specified, 0 otherwise
@param sched Schedule specified 0/1/2 for none/grainsize/num_tasks
@param grainsize Schedule value if specified
@param modifier Modifier 'strict' for sched, 1 if present, 0 otherwise
@param task_dup Tasks duplication routine

Execute the taskloop construct.
*/
void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                       kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                       int nogroup, int sched, kmp_uint64 grainsize,
                       int modifier, void *task_dup) {
  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
  // Same as __kmpc_taskloop but forwards the OpenMP 5.x 'strict' modifier.
  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
                 modifier, task_dup);
  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
}
5203
/*!
@ingroup TASKING
@param gtid Global Thread ID of current thread
@return Returns a pointer to the thread's current task async handle. If no task
is present or gtid is invalid, returns NULL.

Acquires a pointer to the target async handle from the current task.
*/
  if (gtid == KMP_GTID_DNE)
    return NULL;

  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
  kmp_taskdata_t *taskdata = thread->th.th_current_task;

  // No current task => no handle to expose.
  if (!taskdata)
    return NULL;

  return &taskdata->td_target_data.async_handle;
}
5224
/*!
@ingroup TASKING
@param gtid Global Thread ID of current thread
@return Returns TRUE if the current task being executed of the given thread has
a task team allocated to it. Otherwise, returns FALSE.

Checks if the current thread has a task team.
*/
  if (gtid == KMP_GTID_DNE)
    return FALSE;

  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
  kmp_taskdata_t *taskdata = thread->th.th_current_task;

  // No current task => trivially no task team.
  if (!taskdata)
    return FALSE;

  return taskdata->td_task_team != NULL;
}
5245
5246#if OMPX_TASKGRAPH
5247// __kmp_find_tdg: identify a TDG through its ID
5248// tdg_id: ID of the TDG
5249// returns: If a TDG corresponding to this ID is found and not
5250// its initial state, return the pointer to it, otherwise nullptr
5251static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
5252 kmp_tdg_info_t *res = nullptr;
5253 if (__kmp_max_tdgs == 0)
5254 return res;
5255
5256 if (__kmp_global_tdgs == NULL)
5257 __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
5258 sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
5259
5260 if ((__kmp_global_tdgs[tdg_id]) &&
5261 (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
5262 res = __kmp_global_tdgs[tdg_id];
5263 return res;
5264}
5265
5266// __kmp_print_tdg_dot: prints the TDG to a dot file
5267// tdg: ID of the TDG
5268// gtid: Global Thread ID
5269void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg, kmp_int32 gtid) {
5270 kmp_int32 tdg_id = tdg->tdg_id;
5271 KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id));
5272
5273 char file_name[20];
5274 sprintf(file_name, "tdg_%d.dot", tdg_id);
5275 kmp_safe_raii_file_t tdg_file(file_name, "w");
5276
5277 kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5278 fprintf(tdg_file,
5279 "digraph TDG {\n"
5280 " compound=true\n"
5281 " subgraph cluster {\n"
5282 " label=TDG_%d\n",
5283 tdg_id);
5284 for (kmp_int32 i = 0; i < num_tasks; i++) {
5285 fprintf(tdg_file, " %d[style=bold]\n", i);
5286 }
5287 fprintf(tdg_file, " }\n");
5288 for (kmp_int32 i = 0; i < num_tasks; i++) {
5289 kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
5290 kmp_int32 *successors = tdg->record_map[i].successors;
5291 if (nsuccessors > 0) {
5292 for (kmp_int32 j = 0; j < nsuccessors; j++)
5293 fprintf(tdg_file, " %d -> %d \n", i, successors[j]);
5294 }
5295 }
5296 fprintf(tdg_file, "}");
5297 KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id));
5298}
5299
// __kmp_exec_tdg: launch the execution of a previous
// recorded TDG
// gtid: Global Thread ID
// tdg: ID of the TDG
void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
  KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
                tdg->tdg_id, tdg->num_roots));
  kmp_node_info_t *this_record_map = tdg->record_map;
  kmp_int32 *this_root_tasks = tdg->root_tasks;
  kmp_int32 this_num_roots = tdg->num_roots;
  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);

  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *parent_task = thread->th.th_current_task;

  // Replay task reductions recorded with the TDG, if any.
  if (tdg->rec_taskred_data) {
    __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
  }

  // Re-parent every recorded task under the current task and reset its
  // dependence/bookkeeping counters for this execution of the graph.
  for (kmp_int32 j = 0; j < this_num_tasks; j++) {
    kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);

    td->td_parent = parent_task;
    this_record_map[j].parent_task = parent_task;

    kmp_taskgroup_t *parent_taskgroup =
        this_record_map[j].parent_task->td_taskgroup;

    KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
                      this_record_map[j].npredecessors);
    KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);

    if (parent_taskgroup) {
      KMP_ATOMIC_INC(&parent_taskgroup->count);
      // The taskgroup is different so we must update it
      td->td_taskgroup = parent_taskgroup;
    } else if (td->td_taskgroup != nullptr) {
      // If the parent doesnt have a taskgroup, remove it from the task
      td->td_taskgroup = nullptr;
    }
    if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
      KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
  }

  // Kick off the graph by scheduling only the root tasks; the rest are
  // released as their predecessor counters drain.
  for (kmp_int32 j = 0; j < this_num_roots; ++j) {
    __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
  }
  KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
                tdg->tdg_id, tdg->num_roots));
}
5351
5352// __kmp_start_record: set up a TDG structure and turn the
5353// recording flag to true
5354// gtid: Global Thread ID of the encountering thread
5355// input_flags: Flags associated with the TDG
5356// tdg_id: ID of the TDG to record
5357static inline void __kmp_start_record(kmp_int32 gtid,
5358 kmp_taskgraph_flags_t *flags,
5359 kmp_int32 tdg_id) {
5360 kmp_tdg_info_t *tdg =
5361 (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
5362 __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
5363 // Initializing the TDG structure
5364 tdg->tdg_id = tdg_id;
5365 tdg->map_size = INIT_MAPSIZE;
5366 tdg->num_roots = -1;
5367 tdg->root_tasks = nullptr;
5368 tdg->tdg_status = KMP_TDG_RECORDING;
5369 tdg->rec_num_taskred = 0;
5370 tdg->rec_taskred_data = nullptr;
5371 KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
5372
5373 // Initializing the list of nodes in this TDG
5374 kmp_node_info_t *this_record_map =
5375 (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
5376 for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
5377 kmp_int32 *successorsList =
5378 (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
5379 this_record_map[i].task = nullptr;
5380 this_record_map[i].successors = successorsList;
5381 this_record_map[i].nsuccessors = 0;
5382 this_record_map[i].npredecessors = 0;
5383 this_record_map[i].successors_size = __kmp_successors_size;
5384 KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
5385 }
5386
5387 __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
5388}
5389
// __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
// the beginning of the record process of a task region
// loc_ref: Location of TDG, not used yet
// gtid: Global Thread ID of the encountering thread
// input_flags: Flags associated with the TDG
// tdg_id: ID of the TDG to record, for now, incremental integer
// returns: 1 if we record, otherwise, 0
                         kmp_int32 input_flags, kmp_int32 tdg_id) {

  kmp_int32 res;
  kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
  KA_TRACE(10,
           ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
            gtid, loc_ref, input_flags, tdg_id));

  // TDG support disabled: report "record" so the region simply executes.
  if (__kmp_max_tdgs == 0) {
    KA_TRACE(
        10,
        ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
         "__kmp_max_tdgs = 0\n",
         gtid, loc_ref, input_flags, tdg_id));
    return 1;
  }

  __kmpc_taskgroup(loc_ref, gtid);
  if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
    // TODO: use re_record flag
    // Already recorded: replay the stored graph instead of recording again.
    __kmp_exec_tdg(gtid, tdg);
    res = 0;
  } else {
    // First encounter: start recording a new TDG under this id.
    __kmp_curr_tdg_idx = tdg_id;
    KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
    __kmp_start_record(gtid, flags, tdg_id);
    __kmp_num_tdg++;
    res = 1;
  }
  KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
                gtid, tdg_id, res ? "record" : "execute"));
  return res;
}
5431
5432// __kmp_end_record: set up a TDG after recording it
5433// gtid: Global thread ID
5434// tdg: Pointer to the TDG
5435void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5436 // Store roots
5437 kmp_node_info_t *this_record_map = tdg->record_map;
5438 kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5439 kmp_int32 *this_root_tasks =
5440 (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
5441 kmp_int32 this_map_size = tdg->map_size;
5442 kmp_int32 this_num_roots = 0;
5443 kmp_info_t *thread = __kmp_threads[gtid];
5444
5445 for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5446 if (this_record_map[i].npredecessors == 0) {
5447 this_root_tasks[this_num_roots++] = i;
5448 }
5449 }
5450
5451 // Update with roots info and mapsize
5452 tdg->map_size = this_map_size;
5453 tdg->num_roots = this_num_roots;
5454 tdg->root_tasks = this_root_tasks;
5455 KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
5456 tdg->tdg_status = KMP_TDG_READY;
5457
5458 if (thread->th.th_current_task->td_dephash) {
5459 __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
5460 thread->th.th_current_task->td_dephash = NULL;
5461 }
5462
5463 // Reset predecessor counter
5464 for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5465 KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
5466 this_record_map[i].npredecessors);
5467 }
5468 KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
5469
5470 if (__kmp_tdg_dot)
5471 __kmp_print_tdg_dot(tdg, gtid);
5472}
5473
5474// __kmpc_end_record_task: wrapper around __kmp_end_record to mark
5475// the end of recording phase
5476//
5477// loc_ref: Source location information
5478// gtid: Global thread ID
5479// input_flags: Flags attached to the graph
5480// tdg_id: ID of the TDG just finished recording
5481void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
5482 kmp_int32 input_flags, kmp_int32 tdg_id) {
5483 kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
5484
5485 KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
5486 " tdg=%d with flags=%d\n",
5487 gtid, loc_ref, tdg_id, input_flags));
5488 if (__kmp_max_tdgs) {
5489 // TODO: use input_flags->nowait
5490 __kmpc_end_taskgroup(loc_ref, gtid);
5491 if (__kmp_tdg_is_recording(tdg->tdg_status))
5492 __kmp_end_record(gtid, tdg);
5493 }
5494 KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
5495 " tdg=%d, its status is now READY\n",
5496 gtid, loc_ref, tdg_id));
5497}
5498#endif
void * target(void *task)
uint8_t kmp_uint8
int task_entry(kmp_int32 gtid, kmp_task_t *task)
struct task * ptask
int result[2]
int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained)
This class safely opens and closes a C-style FILE* object using RAII semantics.
Definition kmp.h:4723
kmp_uint64 get_ub() const
void set_ub(kmp_uint64 ub)
size_t get_lower_offset() const
void set_lb(kmp_uint64 lb)
size_t get_upper_offset() const
kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
kmp_uint64 get_lb() const
int64_t kmp_int64
Definition common.h:10
kmp_int32(* kmp_routine_entry_t)(kmp_int32, void *)
Definition kmp.h:2484
struct kmp_task kmp_task_t
struct kmp_taskred_data kmp_taskred_data_t
Internal struct for reduction data item related info saved by the library.
struct kmp_task_red_input kmp_task_red_input_t
Internal struct for reduction data item related info set up by compiler.
struct kmp_taskred_flags kmp_taskred_flags_t
Flags for special info per task reduction item.
struct kmp_taskred_input kmp_taskred_input_t
Internal struct for reduction data item related info set up by compiler.
void * __kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data)
void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int sched, kmp_uint64 grainsize, void *task_dup)
void * __kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, int num, void *data)
void * __kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num, void *data)
bool __kmpc_omp_has_task_team(kmp_int32 gtid)
void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask)
void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws)
kmp_int32 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 naffins, kmp_task_affinity_info_t *affin_list)
void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int sched, kmp_uint64 grainsize, int modifier, void *task_dup)
void * __kmpc_task_reduction_init(int gtid, int num, void *data)
void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask)
void * __kmpc_taskred_init(int gtid, int num, void *data)
void ** __kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid)
void
Definition ittnotify.h:3324
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain __itt_id ITT_FORMAT p const __itt_domain __itt_id __itt_timestamp __itt_timestamp ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain ITT_FORMAT p const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_id __itt_string_handle __itt_metadata_type size_t void * data
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t new_size
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance * instance
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t count
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain __itt_id ITT_FORMAT p const __itt_domain __itt_id __itt_timestamp __itt_timestamp ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain ITT_FORMAT p const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_id __itt_string_handle __itt_metadata_type size_t void ITT_FORMAT p const __itt_domain __itt_id __itt_string_handle const wchar_t size_t ITT_FORMAT lu const __itt_domain __itt_id __itt_relation __itt_id ITT_FORMAT p const wchar_t int ITT_FORMAT __itt_group_mark d __itt_event event
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id parent
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t size
void const char const char int ITT_FORMAT __itt_group_sync p
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain __itt_id ITT_FORMAT p const __itt_domain __itt_id __itt_timestamp __itt_timestamp ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain ITT_FORMAT p const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_id __itt_string_handle __itt_metadata_type size_t void ITT_FORMAT p const __itt_domain __itt_id __itt_string_handle const wchar_t size_t ITT_FORMAT lu const __itt_domain __itt_id __itt_relation __itt_id tail
#define TASK_UNTIED
Definition kmp.h:40
#define TASK_NOT_PUSHED
Definition kmp.h:37
#define __kmp_free(ptr)
Definition kmp.h:3762
kmp_info_t * __kmp_hidden_helper_main_thread
#define TASK_PROXY
Definition kmp.h:43
#define KMP_CPU_PAUSE()
Definition kmp.h:1598
kmp_global_t __kmp_global
#define TASK_EXPLICIT
Definition kmp.h:41
union kmp_task_team kmp_task_team_t
Definition kmp.h:256
void __kmp_hidden_helper_worker_thread_signal()
#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time)
Definition kmp.h:1674
#define KMP_MAX_BLOCKTIME
Definition kmp.h:1258
#define INITIAL_TASK_DEQUE_SIZE
Definition kmp.h:2838
struct kmp_taskgroup kmp_taskgroup_t
#define KMP_TASKDATA_TO_TASK(taskdata)
Definition kmp.h:2471
#define KMP_NOT_SAFE_TO_REAP
Definition kmp.h:2152
#define TASK_DEQUE_MASK(td)
Definition kmp.h:2841
unsigned short __kmp_get_random(kmp_info_t *thread)
#define TASK_FULL
Definition kmp.h:44
kmp_tasking_mode_t __kmp_tasking_mode
void __kmp_abort_thread(void)
int __kmp_dflt_blocktime
volatile kmp_info_t * __kmp_thread_pool
#define KMP_GEN_TASK_ID()
Definition kmp.h:3686
kmp_bootstrap_lock_t __kmp_task_team_lock
int __kmp_omp_cancellation
#define TASK_DEQUE_SIZE(td)
Definition kmp.h:2840
#define KMP_GTID_TO_SHADOW_GTID(gtid)
Definition kmp.h:4616
#define __kmp_get_thread()
Definition kmp.h:3610
union kmp_depnode kmp_depnode_t
Definition kmp.h:2525
#define TASK_CURRENT_NOT_QUEUED
Definition kmp.h:34
static int __kmp_tid_from_gtid(int gtid)
Definition kmp.h:3625
#define KMP_MIN(x, y)
Definition kmp.h:303
volatile int __kmp_init_hidden_helper
@ KMP_EVENT_UNINITIALIZED
Definition kmp.h:2622
@ KMP_EVENT_ALLOW_COMPLETION
Definition kmp.h:2623
volatile int __kmp_init_middle
#define TASK_DETACHABLE
Definition kmp.h:45
@ cancel_parallel
Definition kmp.h:984
@ cancel_taskgroup
Definition kmp.h:987
@ cancel_noreq
Definition kmp.h:983
#define KMP_CHECK_UPDATE(a, b)
Definition kmp.h:2387
#define TASK_IMPLICIT
Definition kmp.h:42
#define KMP_TASK_TO_TASKDATA(task)
Definition kmp.h:2470
union KMP_ALIGN_CACHE kmp_thread_data kmp_thread_data_t
#define TASK_SUCCESSFULLY_PUSHED
Definition kmp.h:38
struct kmp_task_affinity_info kmp_task_affinity_info_t
#define TASK_TIED
Definition kmp.h:39
#define __kmp_thread_malloc(th, size)
Definition kmp.h:3782
void __kmp_middle_initialize(void)
static void copy_icvs(kmp_internal_control_t *dst, kmp_internal_control_t *src)
Definition kmp.h:2218
#define KMP_TASKING_ENABLED(task_team)
Definition kmp.h:2475
kmp_info_t ** __kmp_threads
#define KMP_HIDDEN_HELPER_THREAD(gtid)
Definition kmp.h:4602
int __kmp_enable_task_throttling
int __kmp_task_stealing_constraint
union kmp_team kmp_team_t
Definition kmp.h:254
#define KMP_INIT_YIELD(count)
Definition kmp.h:1601
#define KMP_INIT_BACKOFF(time)
Definition kmp.h:1604
#define KMP_YIELD(cond)
Definition kmp.h:1616
volatile int __kmp_init_parallel
kmp_int32 __kmp_enable_hidden_helper
#define __kmp_allocate(size)
Definition kmp.h:3760
#define TRUE
Definition kmp.h:1354
enum library_type __kmp_library
#define FALSE
Definition kmp.h:1353
struct kmp_tasking_flags kmp_tasking_flags_t
@ tskm_extra_barrier
Definition kmp.h:2452
@ tskm_task_teams
Definition kmp.h:2453
@ tskm_immediate_exec
Definition kmp.h:2451
struct kmp_task_pri kmp_task_pri_t
#define UNLIKELY(x)
Definition kmp.h:153
kmp_uint64 __kmp_taskloop_min_tasks
std::atomic< kmp_int32 > __kmp_unexecuted_hidden_helper_tasks
kmp_info_t ** __kmp_hidden_helper_threads
bool __kmp_wpolicy_passive
@ bs_forkjoin_barrier
Definition kmp.h:2168
void __kmp_hidden_helper_initialize()
#define __kmp_get_gtid()
Definition kmp.h:3606
kmp_int32 __kmp_max_task_priority
static void __kmp_assert_valid_gtid(kmp_int32 gtid)
Definition kmp.h:3650
static kmp_info_t * __kmp_thread_from_gtid(int gtid)
Definition kmp.h:3640
static int __kmp_gtid_from_thread(const kmp_info_t *thr)
Definition kmp.h:3635
@ library_throughput
Definition kmp.h:504
struct kmp_taskdata kmp_taskdata_t
Definition kmp.h:255
#define KMP_GTID_DNE
Definition kmp.h:1012
union KMP_ALIGN_CACHE kmp_info kmp_info_t
#define __kmp_thread_free(th, ptr)
Definition kmp.h:3788
KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86<<, 2i, 1, KMP_ARCH_X86) ATOMIC_CMPXCHG(fixed2, shr, kmp_int16, 16, > KMP_ARCH_X86 KMP_ARCH_X86 kmp_uint32
#define KE_TRACE(d, x)
Definition kmp_debug.h:161
#define KA_TRACE(d, x)
Definition kmp_debug.h:157
#define KMP_DEBUG_USE_VAR(x)
Definition kmp_debug.h:63
#define KMP_ASSERT(cond)
Definition kmp_debug.h:59
#define KMP_BUILD_ASSERT(expr)
Definition kmp_debug.h:26
#define KF_TRACE(d, x)
Definition kmp_debug.h:162
#define KMP_DEBUG_ASSERT(cond)
Definition kmp_debug.h:61
#define KMP_ASSERT2(cond, msg)
Definition kmp_debug.h:60
unsigned long long kmp_uint64
static volatile kmp_i18n_cat_status_t status
Definition kmp_i18n.cpp:48
#define KMP_FSYNC_RELEASING(obj)
Definition kmp_itt.h:335
#define KMP_FSYNC_ACQUIRED(obj)
Definition kmp_itt.h:334
#define KMP_FSYNC_SPIN_ACQUIRED(obj)
Definition kmp_itt.h:339
#define KMP_FSYNC_CANCEL(obj)
Definition kmp_itt.h:333
#define KMP_FSYNC_SPIN_PREPARE(obj)
Definition kmp_itt.h:338
#define USE_ITT_BUILD_ARG(x)
Definition kmp_itt.h:346
#define KMP_FSYNC_SPIN_INIT(obj, spin)
Definition kmp_itt.h:337
int __kmp_acquire_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid)
Definition kmp_lock.cpp:118
void __kmp_init_tas_lock(kmp_tas_lock_t *lck)
Definition kmp_lock.cpp:186
int __kmp_release_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid)
Definition kmp_lock.cpp:157
static void __kmp_release_bootstrap_lock(kmp_bootstrap_lock_t *lck)
Definition kmp_lock.h:535
kmp_ticket_lock_t kmp_bootstrap_lock_t
Definition kmp_lock.h:521
static int __kmp_test_lock(kmp_lock_t *lck, kmp_int32 gtid)
Definition kmp_lock.h:563
static int __kmp_acquire_bootstrap_lock(kmp_bootstrap_lock_t *lck)
Definition kmp_lock.h:527
static void __kmp_release_lock(kmp_lock_t *lck, kmp_int32 gtid)
Definition kmp_lock.h:567
static void __kmp_init_bootstrap_lock(kmp_bootstrap_lock_t *lck)
Definition kmp_lock.h:539
#define KMP_BOOTSTRAP_LOCK_INITIALIZER(lock)
Definition kmp_lock.h:523
#define TCW_PTR(a, b)
Definition kmp_os.h:1169
#define KMP_ATOMIC_AND(p, v)
Definition kmp_os.h:1269
#define KMP_SIZE_T_MAX
Definition kmp_os.h:195
kmp_uint32 kmp_uint
Definition kmp_os.h:215
#define KMP_ATOMIC_ST_REL(p, v)
Definition kmp_os.h:1263
bool __kmp_atomic_compare_store(std::atomic< T > *p, T expected, T desired)
Definition kmp_os.h:1278
#define TCR_PTR(a)
Definition kmp_os.h:1168
#define RCAST(type, var)
Definition kmp_os.h:292
#define CACHE_LINE
Definition kmp_os.h:340
#define KMP_ATOMIC_LD_ACQ(p)
Definition kmp_os.h:1261
#define TCW_SYNC_4(a, b)
Definition kmp_os.h:1148
#define KMP_ATOMIC_ST_RLX(p, v)
Definition kmp_os.h:1264
#define CCAST(type, var)
Definition kmp_os.h:291
#define KMP_MB()
Definition kmp_os.h:1068
kmp_int32 kmp_int
Definition kmp_os.h:214
#define TCR_4(a)
Definition kmp_os.h:1139
#define KMP_FALLTHROUGH()
Definition kmp_os.h:364
#define KMP_ATOMIC_DEC(p)
Definition kmp_os.h:1272
#define KMP_ATOMIC_LD_RLX(p)
Definition kmp_os.h:1262
#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv)
Definition kmp_os.h:816
#define TCW_4(a, b)
Definition kmp_os.h:1140
unsigned long kmp_uintptr_t
Definition kmp_os.h:205
#define KMP_DLSYM(name)
Definition kmp_os.h:1304
#define KMP_ATOMIC_OR(p, v)
Definition kmp_os.h:1270
#define KMP_ATOMIC_INC(p)
Definition kmp_os.h:1271
#define KMP_MEMCPY
#define KMP_MEMCPY_S(dst, bsz, src, cnt)
Functions for collecting statistics.
#define KMP_PUSH_PARTITIONED_TIMER(name)
Definition kmp_stats.h:1014
#define KMP_GET_THREAD_STATE()
Definition kmp_stats.h:1017
#define KMP_POP_PARTITIONED_TIMER()
Definition kmp_stats.h:1015
#define KMP_SET_THREAD_STATE_BLOCK(state_name)
Definition kmp_stats.h:1018
#define KMP_TIME_PARTITIONED_BLOCK(name)
Definition kmp_stats.h:1013
#define KMP_COUNT_BLOCK(n)
Definition kmp_stats.h:1001
#define i
Definition kmp_stub.cpp:87
static void __kmp_dephash_free(kmp_info_t *thread, kmp_dephash_t *h)
static void __kmp_dephash_free_entries(kmp_info_t *thread, kmp_dephash_t *h)
static void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task)
void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team)
template int __kmp_atomic_execute_tasks_64< true, false >(kmp_info_t *, kmp_int32, kmp_atomic_flag_64< true, false > *, int, int *USE_ITT_BUILD_ARG(void *), kmp_int32)
void __kmp_call_init(kmp_taskred_data_t &item, size_t j)
void __kmp_call_init< kmp_task_red_input_t >(kmp_taskred_data_t &item, size_t offset)
#define PROXY_TASK_FLAG
void(* p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32)
void * __kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, int num, T *data)
static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg)
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task)
kmp_task_t * __kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds, kmp_routine_entry_t task_entry)
void __kmp_reap_task_teams(void)
kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part)
kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task)
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *current_task)
void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team)
int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64< C, S > *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained)
void __kmp_wait_to_unref_task_teams(void)
static kmp_task_team_t * __kmp_allocate_task_team(kmp_info_t *thread, kmp_team_t *team)
void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team)
void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data, kmp_taskgroup_t *tg, void *reduce_data)
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task)
static void __kmp_realloc_task_deque(kmp_info_t *thread, kmp_thread_data_t *thread_data)
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask)
void __kmpc_end_taskgroup(ident_t *loc, int gtid)
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task)
static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata)
static int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained)
static void __kmp_free_task_and_ancestors(kmp_int32 gtid, kmp_taskdata_t *taskdata, kmp_info_t *thread)
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *resumed_task)
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *current_task)
static kmp_thread_data_t * __kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri)
template int __kmp_execute_tasks_64< false, true >(kmp_info_t *, kmp_int32, kmp_flag_64< false, true > *, int, int *USE_ITT_BUILD_ARG(void *), kmp_int32)
void __kmp_finish_implicit_task(kmp_info_t *thread)
template int __kmp_execute_tasks_32< false, false >(kmp_info_t *, kmp_int32, kmp_flag_32< false, false > *, int, int *USE_ITT_BUILD_ARG(void *), kmp_int32)
struct __taskloop_params __taskloop_params_t
kmp_event_t * __kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid, kmp_task_t *task)
static void __kmp_free_task_threads_data(kmp_task_team_t *task_team)
void __kmp_assign_orig(kmp_taskred_data_t &item, T &src)
static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, void *frame_address, void *return_address)
static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg)
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata, kmp_info_t *thread)
template int __kmp_execute_tasks_64< true, false >(kmp_info_t *, kmp_int32, kmp_flag_64< true, false > *, int, int *USE_ITT_BUILD_ARG(void *), kmp_int32)
int __kmp_atomic_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64< C, S > *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained)
void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait)
kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task)
static kmp_task_pri_t * __kmp_alloc_task_pri_list()
static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int sched, kmp_uint64 grainsize, int modifier, void *task_dup)
kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, void *codeptr_ra)
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr, kmp_team_t *team, int tid, int set_curr_task)
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task)
static kmp_task_t * __kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid, kmp_task_team_t *task_team, std::atomic< kmp_int32 > *unfinished_threads, int *thread_finished, kmp_int32 is_constrained)
void * __kmp_task_reduction_init(int gtid, int num, T *data)
static void __kmp_enable_tasking(kmp_task_team_t *task_team, kmp_info_t *this_thr)
static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task, kmp_int32 pass)
int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32< C, S > *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained)
static kmp_task_t * __kmp_get_priority_task(kmp_int32 gtid, kmp_task_team_t *task_team, kmp_int32 is_constrained)
void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid)
void __kmp_free_implicit_task(kmp_info_t *thread)
int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained)
static size_t __kmp_round_up_to_val(size_t size, size_t val)
kmp_task_t * __kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds, kmp_routine_entry_t task_entry, kmp_int64 device_id)
void __kmp_fulfill_event(kmp_event_t *event)
void __kmp_assign_orig< kmp_taskred_input_t >(kmp_taskred_data_t &item, kmp_taskred_input_t &src)
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task, void *frame_address, void *return_address)
void __kmp_assign_orig< kmp_task_red_input_t >(kmp_taskred_data_t &item, kmp_task_red_input_t &src)
static int __kmp_realloc_task_threads_data(kmp_info_t *thread, kmp_task_team_t *task_team)
template int __kmp_atomic_execute_tasks_64< false, true >(kmp_info_t *, kmp_int32, kmp_atomic_flag_64< false, true > *, int, int *USE_ITT_BUILD_ARG(void *), kmp_int32)
static kmp_task_team_t * __kmp_free_task_teams
kmp_task_t * __kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src)
kmp_task_t * __kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds, kmp_routine_entry_t task_entry)
kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid)
int __kmp_taskloop_task(int gtid, void *ptask)
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team, int tid)
void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team)
static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata)
static void __kmp_free_task_pri_list(kmp_task_team_t *task_team)
static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained, const kmp_taskdata_t *tasknew, const kmp_taskdata_t *taskcurr)
static kmp_task_t * __kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, kmp_task_team_t *task_team, kmp_int32 is_constrained)
void __kmpc_taskgroup(ident_t *loc, int gtid)
kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, bool serialize_immediate)
static void __kmp_task_team_init(kmp_task_team_t *task_team, kmp_team_t *team)
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr)
void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team)
static void __kmp_alloc_task_deque(kmp_info_t *thread, kmp_thread_data_t *thread_data)
static void __kmp_free_task_deque(kmp_thread_data_t *thread_data)
static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread, kmp_taskdata_t *taskdata, kmp_task_team_t *task_team, kmp_int32 pri)
void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start=0)
void __kmp_call_init< kmp_taskred_input_t >(kmp_taskred_data_t &item, size_t offset)
void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, kmp_uint64 ub_glob, kmp_uint64 num_tasks, kmp_uint64 grainsize, kmp_uint64 extras, kmp_int64 last_chunk, kmp_uint64 tc, void *task_dup)
void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *, kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64, kmp_uint64, void *)
static bool __kmp_track_children_task(kmp_taskdata_t *taskdata)
int counter
static void __kmp_null_resume_wrapper(kmp_info_t *thr)
int32_t kmp_int32
int arr[N][N][N]
#define C
int __kmpc_start_record_task(ident_t *, int, int, int)
void __kmpc_end_record_task(ident_t *, int, int, int)
#define res
ompt_callbacks_active_t ompt_enabled
return ret
ompt_callbacks_internal_t ompt_callbacks
#define OMPT_NOINLINE
#define OMPT_GET_RETURN_ADDRESS(level)
#define TASK_TYPE_DETAILS_FORMAT(info)
#define OMPT_FRAME_FLAGS_APP
#define OMPT_GET_FRAME_ADDRESS(level)
#define OMPT_FRAME_FLAGS_RUNTIME
ompt_team_info_t * __ompt_get_teaminfo(int depth, int *size)
int __ompt_get_task_info_internal(int ancestor_level, int *type, ompt_data_t **task_data, ompt_frame_t **task_frame, ompt_data_t **parallel_data, int *thread_num)
ompt_task_info_t * __ompt_get_task_info_object(int depth)
static id loc
volatile int flag
__attribute__((noinline))
kmp_lock_t * mtx_locks[MAX_MTX_DEPS]
Definition kmp.h:2576
kmp_int32 mtx_num_locks
Definition kmp.h:2577
kmp_int32 tt_found_proxy_tasks
Definition kmp.h:2874
KMP_ALIGN_CACHE std::atomic< kmp_int32 > tt_unfinished_threads
Definition kmp.h:2882
kmp_int32 tt_max_threads
Definition kmp.h:2873
kmp_int32 tt_nproc
Definition kmp.h:2872
kmp_bootstrap_lock_t tt_task_pri_lock
Definition kmp.h:2862
std::atomic< kmp_int32 > tt_num_task_pri
Definition kmp.h:2876
kmp_bootstrap_lock_t tt_threads_lock
Definition kmp.h:2858
kmp_int32 tt_untied_task_encountered
Definition kmp.h:2875
kmp_task_pri_t * tt_task_pri_list
Definition kmp.h:2863
kmp_int32 tt_hidden_helper_task_encountered
Definition kmp.h:2879
kmp_thread_data_t * tt_threads_data
Definition kmp.h:2867
KMP_ALIGN_CACHE volatile kmp_uint32 tt_active
Definition kmp.h:2886
kmp_task_team_t * tt_next
Definition kmp.h:2865
kmp_int32 tt_found_tasks
Definition kmp.h:2869
kmp_tas_lock_t lock
Definition kmp.h:2628
union kmp_event_t::@225247164214002037241165027111053205022334261172 ed
kmp_task_t * task
Definition kmp.h:2630
kmp_event_type_t type
Definition kmp.h:2627
void * async_handle
Definition kmp.h:2765
kmp_int32 reserved
Definition kmp.h:2617
struct kmp_task_affinity_info::@350356156072337073036041067341377344020243343142 flags
kmp_int32 priority
Definition kmp.h:2851
kmp_task_pri * next
Definition kmp.h:2852
kmp_thread_data_t td
Definition kmp.h:2850
Internal struct for reduction data item related info set up by compiler.
void * reduce_shar
shared between tasks item to reduce into
void * reduce_fini
data finalization routine
kmp_taskred_flags_t flags
flags for additional info from compiler
size_t reduce_size
size of data item in bytes
void * reduce_init
data initialization routine (single parameter)
void * reduce_comb
data combiner routine
kmp_task_team_list_t * next
Definition kmp.h:2897
kmp_task_team_t * task_team
Definition kmp.h:2896
void * shareds
pointer to block of pointers to shared vars
Definition kmp.h:2498
kmp_uint32 td_taskwait_counter
Definition kmp.h:2780
ident_t * td_taskwait_ident
Definition kmp.h:2779
kmp_int32 td_level
Definition kmp.h:2775
kmp_team_t * td_team
Definition kmp.h:2771
kmp_task_team_t * td_task_team
Definition kmp.h:2795
kmp_dephash_t * td_dephash
Definition kmp.h:2792
kmp_taskdata_t * td_parent
Definition kmp.h:2774
std::atomic< kmp_int32 > td_incomplete_child_tasks
Definition kmp.h:2788
std::atomic< kmp_int32 > td_untied_count
Definition kmp.h:2776
kmp_taskgroup_t * td_taskgroup
Definition kmp.h:2790
kmp_int32 td_task_id
Definition kmp.h:2769
kmp_info_p * td_alloc_thread
Definition kmp.h:2772
ident_t * td_ident
Definition kmp.h:2777
kmp_depnode_t * td_depnode
Definition kmp.h:2794
kmp_int32 td_taskwait_thread
Definition kmp.h:2781
kmp_tasking_flags_t td_flags
Definition kmp.h:2770
kmp_taskdata_t * td_last_tied
Definition kmp.h:2801
KMP_ALIGN_CACHE kmp_internal_control_t td_icvs
Definition kmp.h:2783
kmp_event_t td_allow_completion_event
Definition kmp.h:2806
size_t td_size_alloc
Definition kmp.h:2796
kmp_target_data_t td_target_data
Definition kmp.h:2815
KMP_ALIGN_CACHE std::atomic< kmp_int32 > td_allocated_child_tasks
Definition kmp.h:2785
std::atomic< kmp_int32 > cancel_request
Definition kmp.h:2516
uintptr_t * gomp_data
Definition kmp.h:2521
std::atomic< kmp_int32 > count
Definition kmp.h:2514
void * reduce_data
Definition kmp.h:2519
struct kmp_taskgroup * parent
Definition kmp.h:2517
kmp_int32 reduce_num_data
Definition kmp.h:2520
unsigned target
Definition kmp.h:2753
unsigned priority_specified
Definition kmp.h:2731
unsigned detachable
Definition kmp.h:2733
unsigned task_serial
Definition kmp.h:2741
unsigned merged_if0
Definition kmp.h:2725
unsigned complete
Definition kmp.h:2750
unsigned freed
Definition kmp.h:2751
unsigned executing
Definition kmp.h:2749
unsigned tasking_ser
Definition kmp.h:2742
unsigned team_serial
Definition kmp.h:2744
unsigned native
Definition kmp.h:2752
unsigned tiedness
Definition kmp.h:2723
unsigned started
Definition kmp.h:2748
unsigned destructors_thunk
Definition kmp.h:2727
unsigned proxy
Definition kmp.h:2729
unsigned tasktype
Definition kmp.h:2740
unsigned final
Definition kmp.h:2724
unsigned hidden_helper
Definition kmp.h:2754
Internal struct for reduction data item related info saved by the library.
void * reduce_init
data initialization routine (two parameters)
void * reduce_priv
array of thread specific items
void * reduce_pend
end of private data for faster comparison op
void * reduce_comb
data combiner routine
kmp_taskred_flags_t flags
flags for additional info from compiler
void * reduce_fini
data finalization routine
size_t reduce_size
size of data item
void * reduce_shar
shared between tasks item to reduce into
void * reduce_orig
original item (can be used in UDR initializer)
Flags for special info per task reduction item.
unsigned lazy_priv
1 - use lazy alloc/init (e.g.
Internal struct for reduction data item related info set up by compiler.
void * reduce_shar
shared between tasks item to reduce into
void * reduce_fini
data finalization routine
void * reduce_init
data initialization routine (two parameters)
void * reduce_comb
data combiner routine
size_t reduce_size
size of data item
void * reduce_orig
original reduction item used for initialization
kmp_taskred_flags_t flags
flags for additional info from compiler
ompt_data_t task_data
ompt_frame_t frame
ompt_data_t parallel_data
ompt_wait_id_t wait_id
int(* routine)(int, struct task *)
int th
void ** shareds
kmp_base_depnode_t dn
Definition kmp.h:2589
kmp_base_task_team_t tt
Definition kmp.h:2890
kmp_base_team_t t
Definition kmp.h:3240
int __kmp_is_thread_alive(kmp_info_t *th, DWORD *exit_val)