1/*
2 * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_i18n.h"
15#include "kmp_itt.h"
16#include "kmp_stats.h"
17#include "kmp_wait_release.h"
18#include "kmp_taskdeps.h"
19
20#if OMPT_SUPPORT
21#include "ompt-specific.h"
22#endif
23
24#if ENABLE_LIBOMPTARGET
25static void (*tgt_target_nowait_query)(void **);
26
27void __kmp_init_target_task() {
28 *(void **)(&tgt_target_nowait_query) = KMP_DLSYM("__tgt_target_nowait_query");
29}
30#endif
31
32/* forward declaration */
33static void __kmp_enable_tasking(kmp_task_team_t *task_team,
34 kmp_info_t *this_thr);
35static void __kmp_alloc_task_deque(kmp_info_t *thread,
36 kmp_thread_data_t *thread_data);
37static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
38 kmp_task_team_t *task_team);
39static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
40#if OMPX_TASKGRAPH
41static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
42int __kmp_taskloop_task(int gtid, void *ptask);
43#endif
44
45// returns 1 if new task is allowed to execute, 0 otherwise
46// checks Task Scheduling constraint (if requested) and
47// mutexinoutset dependencies if any
48static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
49 const kmp_taskdata_t *tasknew,
50 const kmp_taskdata_t *taskcurr) {
51 if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
52 // Check if the candidate obeys the Task Scheduling Constraints (TSC)
53 // only descendant of all deferred tied tasks can be scheduled, checking
54 // the last one is enough, as it in turn is the descendant of all others
55 kmp_taskdata_t *current = taskcurr->td_last_tied;
56 KMP_DEBUG_ASSERT(current != NULL);
57 // check if the task is not suspended on barrier
58 if (current->td_flags.tasktype == TASK_EXPLICIT ||
59 current->td_taskwait_thread > 0) { // <= 0 on barrier
60 kmp_int32 level = current->td_level;
61 kmp_taskdata_t *parent = tasknew->td_parent;
62 while (parent != current && parent->td_level > level) {
63 // check generation up to the level of the current task
64 parent = parent->td_parent;
65 KMP_DEBUG_ASSERT(parent != NULL);
66 }
67 if (parent != current)
68 return false;
69 }
70 }
71 // Check mutexinoutset dependencies, acquire locks
72 kmp_depnode_t *node = tasknew->td_depnode;
73#if OMPX_TASKGRAPH
74 if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
75#else
76 if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
77#endif
78 for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
79 KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
80 if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
81 continue;
82 // could not get the lock, release previous locks
83 for (int j = i - 1; j >= 0; --j)
84 __kmp_release_lock(node->dn.mtx_locks[j], gtid);
85 return false;
86 }
87 // negative num_locks means all locks acquired successfully
88 node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
89 }
90 return true;
91}
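// Illustration of the Task Scheduling Constraint check above (tasks and
// levels are hypothetical): with a tied-task chain
//   implicit task (td_level 0) -> A (td_level 1) -> B (td_level 2)
// where B is taskcurr->td_last_tied, a new constrained tied task is allowed
// only if walking tasknew->td_parent upward reaches B before dropping to B's
// level, i.e. the new task is a descendant of every deferred tied task.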
92
93// __kmp_realloc_task_deque:
94// Re-allocates a task deque for a particular thread, copies the content from
95// the old deque and adjusts the necessary data structures relating to the
96// deque. This operation must be done with the deque_lock being held
97static void __kmp_realloc_task_deque(kmp_info_t *thread,
98 kmp_thread_data_t *thread_data) {
99 kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
100 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
101 kmp_int32 new_size = 2 * size;
102
103 KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
104 "%d] for thread_data %p\n",
105 __kmp_gtid_from_thread(thread), size, new_size, thread_data));
106
107 kmp_taskdata_t **new_deque =
108 (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
109
110 int i, j;
111 for (i = thread_data->td.td_deque_head, j = 0; j < size;
112 i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
113 new_deque[j] = thread_data->td.td_deque[i];
114
115 __kmp_free(thread_data->td.td_deque);
116
117 thread_data->td.td_deque_head = 0;
118 thread_data->td.td_deque_tail = size;
119 thread_data->td.td_deque = new_deque;
120 thread_data->td.td_deque_size = new_size;
121}
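// Worked example for the reallocation above (sizes are hypothetical): with
// size = 4, head = 2, tail = 2 and a full deque stored at indices
// {2, 3, 0, 1}, the copy loop rewrites the tasks in deque order into
// new_deque[0..3]; afterwards head = 0, tail = 4 (the old size) and the
// capacity is doubled to 8. The caller must hold td_deque_lock across the
// whole operation, as noted in the function header.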
122
123static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
124 kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
125 kmp_thread_data_t *thread_data = &l->td;
126 __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
127 thread_data->td.td_deque_last_stolen = -1;
128 KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
129 "for thread_data %p\n",
130 __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
131 thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
132 INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
133 thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
134 return l;
135}
136
137// The function finds the deque of priority tasks with given priority, or
138// allocates a new deque and put it into sorted (high -> low) list of deques.
139// Deques of non-default priority tasks are shared between all threads in team,
140// as opposed to per-thread deques of tasks with default priority.
141// The function is called under the lock task_team->tt.tt_task_pri_lock.
142static kmp_thread_data_t *
143__kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
144 kmp_thread_data_t *thread_data;
145 kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
146 if (lst->priority == pri) {
147 // Found queue of tasks with given priority.
148 thread_data = &lst->td;
149 } else if (lst->priority < pri) {
150 // All current priority queues contain tasks with lower priority.
151 // Allocate new one for given priority tasks.
152 kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
153 thread_data = &list->td;
154 list->priority = pri;
155 list->next = lst;
156 task_team->tt.tt_task_pri_list = list;
157 } else { // task_team->tt.tt_task_pri_list->priority > pri
158 kmp_task_pri_t *next_queue = lst->next;
159 while (next_queue && next_queue->priority > pri) {
160 lst = next_queue;
161 next_queue = lst->next;
162 }
163 // lst->priority > pri && (next == NULL || pri >= next->priority)
164 if (next_queue == NULL) {
165 // No queue with pri priority, need to allocate new one.
166 kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
167 thread_data = &list->td;
168 list->priority = pri;
169 list->next = NULL;
170 lst->next = list;
171 } else if (next_queue->priority == pri) {
172 // Found queue of tasks with given priority.
173 thread_data = &next_queue->td;
174 } else { // lst->priority > pri > next->priority
175 // insert newly allocated between existing queues
176 kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
177 thread_data = &list->td;
178 list->priority = pri;
179 list->next = next_queue;
180 lst->next = list;
181 }
182 }
183 return thread_data;
184}
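// Example of the resulting list shape (priorities are hypothetical): pushing
// tasks with priorities 5, 9 and 7 in that order yields
//   tt_task_pri_list: 9 -> 7 -> 5
// so a later lookup for priority 7 reuses the middle deque instead of
// allocating a new one, preserving the high -> low ordering described above.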
185
186// __kmp_push_priority_task: Add a task to the team's priority task deque
187static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
188 kmp_taskdata_t *taskdata,
189 kmp_task_team_t *task_team,
190 kmp_int32 pri) {
191 kmp_thread_data_t *thread_data = NULL;
192 KA_TRACE(20,
193 ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
194 gtid, taskdata, pri));
195
196 // Find task queue specific to priority value
197 kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
198 if (UNLIKELY(lst == NULL)) {
199 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
200 if (task_team->tt.tt_task_pri_list == NULL) {
201 // List of queues is still empty, allocate one.
202 kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
203 thread_data = &list->td;
204 list->priority = pri;
205 list->next = NULL;
206 task_team->tt.tt_task_pri_list = list;
207 } else {
208 // Other thread initialized a queue. Check if it fits and get thread_data.
209 thread_data = __kmp_get_priority_deque_data(task_team, pri);
210 }
211 __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
212 } else {
213 if (lst->priority == pri) {
214 // Found queue of tasks with given priority.
215 thread_data = &lst->td;
216 } else {
217 __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
218 thread_data = __kmp_get_priority_deque_data(task_team, pri);
219 __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
220 }
221 }
222 KMP_DEBUG_ASSERT(thread_data);
223
224 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
225 // Check if deque is full
226 if (TCR_4(thread_data->td.td_deque_ntasks) >=
227 TASK_DEQUE_SIZE(thread_data->td)) {
228 if (__kmp_enable_task_throttling &&
229 __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
230 thread->th.th_current_task)) {
231 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
232 KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
233 "TASK_NOT_PUSHED for task %p\n",
234 gtid, taskdata));
235 return TASK_NOT_PUSHED;
236 } else {
237 // expand deque to push the task which is not allowed to execute
238 __kmp_realloc_task_deque(thread, thread_data);
239 }
240 }
241 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
242 TASK_DEQUE_SIZE(thread_data->td));
243 // Push taskdata.
244 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
245 // Wrap index.
246 thread_data->td.td_deque_tail =
247 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
248 TCW_4(thread_data->td.td_deque_ntasks,
249 TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
250 KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
251 KMP_FSYNC_RELEASING(taskdata); // releasing child
252 KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
253 "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
254 gtid, taskdata, thread_data->td.td_deque_ntasks,
255 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
256 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
257 task_team->tt.tt_num_task_pri++; // atomic inc
258 return TASK_SUCCESSFULLY_PUSHED;
259}
260
261// __kmp_push_task: Add a task to the thread's deque
262static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
263 kmp_info_t *thread = __kmp_threads[gtid];
264 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
265
266 // If we encounter a hidden helper task, and the current thread is not a
267 // hidden helper thread, we have to give the task to any hidden helper thread
268 // starting from its shadow one.
269 if (UNLIKELY(taskdata->td_flags.hidden_helper &&
270 !KMP_HIDDEN_HELPER_THREAD(gtid))) {
271 kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
272 __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
273 // Signal the hidden helper threads.
274 __kmp_hidden_helper_worker_thread_signal();
275 return TASK_SUCCESSFULLY_PUSHED;
276 }
277
278 kmp_task_team_t *task_team = thread->th.th_task_team;
279 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
280 kmp_thread_data_t *thread_data;
281
282 KA_TRACE(20,
283 ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
284
285 if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
286 // untied task needs to increment counter so that the task structure is not
287 // freed prematurely
288 kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
289 KMP_DEBUG_USE_VAR(counter);
290 KA_TRACE(
291 20,
292 ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
293 gtid, counter, taskdata));
294 }
295
296 // The first check avoids building task_team thread data if serialized
297 if (UNLIKELY(taskdata->td_flags.task_serial)) {
298 KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
299 "TASK_NOT_PUSHED for task %p\n",
300 gtid, taskdata));
301 return TASK_NOT_PUSHED;
302 }
303
304 // Now that serialized tasks have returned, we can assume that we are not in
305 // immediate exec mode
306 KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
307 if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
308 __kmp_enable_tasking(task_team, thread);
309 }
310 KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
311 KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
312
313 if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
314 __kmp_max_task_priority > 0) {
315 int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
316 return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
317 }
318
319 // Find tasking deque specific to encountering thread
320 thread_data = &task_team->tt.tt_threads_data[tid];
321
322 // No lock needed since only owner can allocate. If the task is hidden_helper,
323 // we don't need it either because we have initialized the deque for hidden
324 // helper thread data.
325 if (UNLIKELY(thread_data->td.td_deque == NULL)) {
326 __kmp_alloc_task_deque(thread, thread_data);
327 }
328
329 int locked = 0;
330 // Check if deque is full
331 if (TCR_4(thread_data->td.td_deque_ntasks) >=
332 TASK_DEQUE_SIZE(thread_data->td)) {
333 if (__kmp_enable_task_throttling &&
334 __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
335 thread->th.th_current_task)) {
336 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
337 "TASK_NOT_PUSHED for task %p\n",
338 gtid, taskdata));
339 return TASK_NOT_PUSHED;
340 } else {
341 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
342 locked = 1;
343 if (TCR_4(thread_data->td.td_deque_ntasks) >=
344 TASK_DEQUE_SIZE(thread_data->td)) {
345 // expand deque to push the task which is not allowed to execute
346 __kmp_realloc_task_deque(thread, thread_data);
347 }
348 }
349 }
350 // Lock the deque for the task push operation
351 if (!locked) {
352 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
353 // Need to recheck as we can get a proxy task from thread outside of OpenMP
354 if (TCR_4(thread_data->td.td_deque_ntasks) >=
355 TASK_DEQUE_SIZE(thread_data->td)) {
356 if (__kmp_enable_task_throttling &&
357 __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
358 thread->th.th_current_task)) {
359 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
360 KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
361 "returning TASK_NOT_PUSHED for task %p\n",
362 gtid, taskdata));
363 return TASK_NOT_PUSHED;
364 } else {
365 // expand deque to push the task which is not allowed to execute
366 __kmp_realloc_task_deque(thread, thread_data);
367 }
368 }
369 }
370 // Must have room since no thread can add tasks but calling thread
371 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
372 TASK_DEQUE_SIZE(thread_data->td));
373
374 thread_data->td.td_deque[thread_data->td.td_deque_tail] =
375 taskdata; // Push taskdata
376 // Wrap index.
377 thread_data->td.td_deque_tail =
378 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
379 TCW_4(thread_data->td.td_deque_ntasks,
380 TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
381 KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
382 KMP_FSYNC_RELEASING(taskdata); // releasing child
383 KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
384 "task=%p ntasks=%d head=%u tail=%u\n",
385 gtid, taskdata, thread_data->td.td_deque_ntasks,
386 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
387
388 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
389
390 return TASK_SUCCESSFULLY_PUSHED;
391}
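// Note on the deque-full handling above (informal summary): the first
// occupancy check is done without the lock and may throttle the push
// (TASK_NOT_PUSHED) so the encountering thread executes the task itself;
// once the lock is held the occupancy is re-read, because proxy tasks
// completed outside the team may have been queued meanwhile, and only then
// is the deque grown with __kmp_realloc_task_deque.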
392
393// __kmp_pop_current_task_from_thread: set up current task from called thread
394// when team ends
395//
396// this_thr: thread structure to set current_task in.
397void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
398 KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
399 "this_thread=%p, curtask=%p, "
400 "curtask_parent=%p\n",
401 0, this_thr, this_thr->th.th_current_task,
402 this_thr->th.th_current_task->td_parent));
403
404 this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
405
406 KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
407 "this_thread=%p, curtask=%p, "
408 "curtask_parent=%p\n",
409 0, this_thr, this_thr->th.th_current_task,
410 this_thr->th.th_current_task->td_parent));
411}
412
413// __kmp_push_current_task_to_thread: set up current task in called thread for a
414// new team
415//
416// this_thr: thread structure to set up
417// team: team for implicit task data
418// tid: thread within team to set up
419void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
420 int tid) {
421 // current task of the thread is a parent of the new just created implicit
422 // tasks of new team
423 KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
424 "curtask=%p "
425 "parent_task=%p\n",
426 tid, this_thr, this_thr->th.th_current_task,
427 team->t.t_implicit_task_taskdata[tid].td_parent));
428
429 KMP_DEBUG_ASSERT(this_thr != NULL);
430
431 if (tid == 0) {
432 if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
433 team->t.t_implicit_task_taskdata[0].td_parent =
434 this_thr->th.th_current_task;
435 this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
436 }
437 } else {
438 team->t.t_implicit_task_taskdata[tid].td_parent =
439 team->t.t_implicit_task_taskdata[0].td_parent;
440 this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
441 }
442
443 KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
444 "curtask=%p "
445 "parent_task=%p\n",
446 tid, this_thr, this_thr->th.th_current_task,
447 team->t.t_implicit_task_taskdata[tid].td_parent));
448}
449
450// __kmp_task_start: bookkeeping for a task starting execution
451//
452// GTID: global thread id of calling thread
453// task: task starting execution
454// current_task: task suspending
455static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
456 kmp_taskdata_t *current_task) {
457 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
458 kmp_info_t *thread = __kmp_threads[gtid];
459
460 KA_TRACE(10,
461 ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
462 gtid, taskdata, current_task));
463
463
464 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
465
466 // mark currently executing task as suspended
467 // TODO: GEH - make sure root team implicit task is initialized properly.
468 // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
469 current_task->td_flags.executing = 0;
470
471 // mark starting task as executing and as current task
472 thread->th.th_current_task = taskdata;
473
474 KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
475 taskdata->td_flags.tiedness == TASK_UNTIED);
476 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
477 taskdata->td_flags.tiedness == TASK_UNTIED);
478 taskdata->td_flags.started = 1;
479 taskdata->td_flags.executing = 1;
480 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
481 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
482
483 // GEH TODO: shouldn't we pass some sort of location identifier here?
484 // APT: yes, we will pass location here.
485 // need to store current thread state (in a thread or taskdata structure)
486 // before setting work_state, otherwise wrong state is set after end of task
487
488 KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
489
490 return;
491}
492
493#if OMPT_SUPPORT
494//------------------------------------------------------------------------------
495
496// __ompt_task_start:
497// Build and trigger task-begin event
498static inline void __ompt_task_start(kmp_task_t *task,
499 kmp_taskdata_t *current_task,
500 kmp_int32 gtid) {
501 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
502 ompt_task_status_t status = ompt_task_switch;
503 if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
504 status = ompt_task_yield;
505 __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
506 }
507 /* let OMPT know that we're about to run this task */
508 if (ompt_enabled.ompt_callback_task_schedule) {
509 ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
510 &(current_task->ompt_task_info.task_data), status,
511 &(taskdata->ompt_task_info.task_data));
512 }
513 taskdata->ompt_task_info.scheduling_parent = current_task;
514}
515
516// __ompt_task_finish:
517// Build and trigger final task-schedule event
518static inline void __ompt_task_finish(kmp_task_t *task,
519 kmp_taskdata_t *resumed_task,
520 ompt_task_status_t status) {
521 if (ompt_enabled.ompt_callback_task_schedule) {
522 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
523 if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
524 taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
525 status = ompt_task_cancel;
526 }
527
528 /* let OMPT know that we're returning to the callee task */
529 ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
530 &(taskdata->ompt_task_info.task_data), status,
531 (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
532 }
533}
534#endif
535
536template <bool ompt>
537static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
538 kmp_task_t *task,
539 void *frame_address,
540 void *return_address) {
541 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
542 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
543
544 KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
545 "current_task=%p\n",
546 gtid, loc_ref, taskdata, current_task));
547
548 if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
549 // untied task needs to increment counter so that the task structure is not
550 // freed prematurely
551 kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
552 KMP_DEBUG_USE_VAR(counter);
553 KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
554 "incremented for task %p\n",
555 gtid, counter, taskdata));
556 }
557
558 taskdata->td_flags.task_serial =
559 1; // Execute this task immediately, not deferred.
560 __kmp_task_start(gtid, task, current_task);
561
562#if OMPT_SUPPORT
563 if (ompt) {
564 if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
565 current_task->ompt_task_info.frame.enter_frame.ptr =
566 taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
567 current_task->ompt_task_info.frame.enter_frame_flags =
568 taskdata->ompt_task_info.frame.exit_frame_flags =
569 OMPT_FRAME_FLAGS_APP;
570 }
571 if (ompt_enabled.ompt_callback_task_create) {
572 ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
573 ompt_callbacks.ompt_callback(ompt_callback_task_create)(
574 &(parent_info->task_data), &(parent_info->frame),
575 &(taskdata->ompt_task_info.task_data),
576 TASK_TYPE_DETAILS_FORMAT(taskdata), 0, return_address);
577 }
578 __ompt_task_start(task, current_task, gtid);
579 }
580#endif // OMPT_SUPPORT
581
582 KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
583 loc_ref, taskdata));
584}
585
586#if OMPT_SUPPORT
587OMPT_NOINLINE
588static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
589 kmp_task_t *task,
590 void *frame_address,
591 void *return_address) {
592 __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
593 return_address);
594}
595#endif // OMPT_SUPPORT
596
597// __kmpc_omp_task_begin_if0: report that a given serialized task has started
598// execution
599//
600// loc_ref: source location information; points to beginning of task block.
601// gtid: global thread number.
602// task: task thunk for the started task.
603#ifdef __s390x__
604// This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x.
605// In order for it to work correctly, the caller also needs to be compiled with
606// backchain. If a caller is compiled without backchain,
607// OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not
608// crash.
609__attribute__((target("backchain")))
610#endif
611void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
612 kmp_task_t *task) {
613#if OMPT_SUPPORT
614 if (UNLIKELY(ompt_enabled.enabled)) {
615 OMPT_STORE_RETURN_ADDRESS(gtid);
616 __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
617 OMPT_GET_FRAME_ADDRESS(1),
618 OMPT_LOAD_RETURN_ADDRESS(gtid));
619 return;
620 }
621#endif
622 __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
623}
624
625#ifdef TASK_UNUSED
626// __kmpc_omp_task_begin: report that a given task has started execution
627// NEVER GENERATED BY COMPILER, DEPRECATED!!!
628void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
629 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
630
631 KA_TRACE(
632 10,
633 ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
634 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
635
636 __kmp_task_start(gtid, task, current_task);
637
638 KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
639 loc_ref, KMP_TASK_TO_TASKDATA(task)));
640 return;
641}
642#endif // TASK_UNUSED
643
644// __kmp_free_task: free the current task space and the space for shareds
645//
646// gtid: Global thread ID of calling thread
647// taskdata: task to free
648// thread: thread data structure of caller
649static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
650 kmp_info_t *thread) {
651 KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
652 taskdata));
653
654 // Check to make sure all flags and counters have the correct values
655 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
656 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
657 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
658 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
659 KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
660 taskdata->td_flags.task_serial == 1);
661 KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
662 kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
663 // Clear data to not be re-used later by mistake.
664 task->data1.destructors = NULL;
665 task->data2.priority = 0;
666
667 taskdata->td_flags.freed = 1;
668#if OMPX_TASKGRAPH
669 // do not free tasks in taskgraph
670 if (!taskdata->is_taskgraph) {
671#endif
672// deallocate the taskdata and shared variable blocks associated with this task
673#if USE_FAST_MEMORY
674 __kmp_fast_free(thread, taskdata);
675#else /* ! USE_FAST_MEMORY */
676 __kmp_thread_free(thread, taskdata);
677#endif
678#if OMPX_TASKGRAPH
679 } else {
680 taskdata->td_flags.complete = 0;
681 taskdata->td_flags.started = 0;
682 taskdata->td_flags.freed = 0;
683 taskdata->td_flags.executing = 0;
684 taskdata->td_flags.task_serial =
685 (taskdata->td_parent->td_flags.final ||
686 taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);
687
688 // taskdata->td_allow_completion_event.pending_events_count = 1;
689 KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
690 KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
691 // start at one because counts current task and children
692 KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
693 }
694#endif
695
696 KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
697}
698
699// __kmp_free_task_and_ancestors: free the current task and ancestors without
700// children
701//
702// gtid: Global thread ID of calling thread
703// taskdata: task to free
704// thread: thread data structure of caller
705static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
706 kmp_taskdata_t *taskdata,
707 kmp_info_t *thread) {
708 // Proxy tasks must always be allowed to free their parents
709 // because they can be run in background even in serial mode.
710 kmp_int32 team_serial =
711 (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
712 !taskdata->td_flags.proxy;
713 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
714
715 kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
716 KMP_DEBUG_ASSERT(children >= 0);
717
718 // Now, go up the ancestor tree to see if any ancestors can now be freed.
719 while (children == 0) {
720 kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
721
722 KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
723 "and freeing itself\n",
724 gtid, taskdata));
725
726 // --- Deallocate my ancestor task ---
727 __kmp_free_task(gtid, taskdata, thread);
728
729 taskdata = parent_taskdata;
730
731 if (team_serial)
732 return;
733 // Stop checking ancestors at implicit task instead of walking up ancestor
734 // tree to avoid premature deallocation of ancestors.
735 if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
736 if (taskdata->td_dephash) { // do we need to cleanup dephash?
737 int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
738 kmp_tasking_flags_t flags_old = taskdata->td_flags;
739 if (children == 0 && flags_old.complete == 1) {
740 kmp_tasking_flags_t flags_new = flags_old;
741 flags_new.complete = 0;
742 if (KMP_COMPARE_AND_STORE_ACQ32(
743 RCAST(kmp_int32 *, &taskdata->td_flags),
744 *RCAST(kmp_int32 *, &flags_old),
745 *RCAST(kmp_int32 *, &flags_new))) {
746 KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
747 "dephash of implicit task %p\n",
748 gtid, taskdata));
749 // cleanup dephash of finished implicit task
750 __kmp_dephash_free_entries(thread, taskdata->td_dephash);
751 }
752 }
753 }
754 return;
755 }
756 // Predecrement simulated by "- 1" calculation
757 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
758 KMP_DEBUG_ASSERT(children >= 0);
759 }
760
761 KA_TRACE(
762 20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
763 "not freeing it yet\n",
764 gtid, taskdata, children));
765}
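// Counting example for the loop above (values are hypothetical): a finishing
// task whose td_allocated_child_tasks is 1 holds only the self reference, so
// the fetch-and-decrement minus one yields children == 0, the task is freed
// and the walk continues with its parent; a task whose count is still 3
// (self plus two live children) yields children == 2 and is left for the
// last completing child to free.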
766
767// Only need to keep track of child task counts if any of the following:
768// 1. team parallel and tasking not serialized;
769// 2. it is a proxy or detachable or hidden helper task
770// 3. the children counter of its parent task is greater than 0.
771// The reason for the 3rd one is a serialized team that encountered a detached
772// or hidden helper task T. In this case, the execution of T is still deferred,
773// and it is also possible that a regular task depends on T. If we don't track
774// the children here, task synchronization will be broken.
775static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
776 kmp_tasking_flags_t flags = taskdata->td_flags;
777 bool ret = !(flags.team_serial || flags.tasking_ser);
778 ret = ret || flags.proxy == TASK_PROXY ||
779 flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
780 ret = ret ||
781 KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
782#if OMPX_TASKGRAPH
783 if (taskdata->td_taskgroup && taskdata->is_taskgraph)
784 ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
785#endif
786 return ret;
787}
788
789// __kmp_task_finish: bookkeeping to do when a task finishes execution
790//
791// gtid: global thread ID for calling thread
792// task: task to be finished
793// resumed_task: task to be resumed. (may be NULL if task is serialized)
794//
795// template<ompt>: effectively ompt_enabled.enabled!=0
796// the version with ompt=false is inlined, allowing to optimize away all ompt
797// code in this case
798template <bool ompt>
799static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
800 kmp_taskdata_t *resumed_task) {
801 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
802 kmp_info_t *thread = __kmp_threads[gtid];
803 kmp_task_team_t *task_team =
804 thread->th.th_task_team; // might be NULL for serial teams...
805#if OMPX_TASKGRAPH
806 // to avoid seg fault when we need to access taskdata->td_flags after free when using vanilla taskloop
807 bool is_taskgraph;
808#endif
809#if KMP_DEBUG
810 kmp_int32 children = 0;
811#endif
812 KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
813 "task %p\n",
814 gtid, taskdata, resumed_task));
815
816 KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
817
818#if OMPX_TASKGRAPH
819 is_taskgraph = taskdata->is_taskgraph;
820#endif
821
822 if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
823 // untied task needs to check the counter so that the task structure is not
824 // freed prematurely
825 kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
826 KA_TRACE(
827 20,
828 ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
829 gtid, counter, taskdata));
830 if (counter > 0) {
831 // untied task is not done, to be continued possibly by other thread, do
832 // not free it now
833 if (resumed_task == NULL) {
834 KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
835 resumed_task = taskdata->td_parent; // In a serialized task, the resumed
836 // task is the parent
837 }
838 thread->th.th_current_task = resumed_task; // restore current_task
839 resumed_task->td_flags.executing = 1; // resume previous task
840 KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
841 "resuming task %p\n",
842 gtid, taskdata, resumed_task));
843 return;
844 }
845 }
846
847 // bookkeeping for resuming task:
848 // GEH - note tasking_ser => task_serial
849 KMP_DEBUG_ASSERT(
850 (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
851 taskdata->td_flags.task_serial);
852 if (taskdata->td_flags.task_serial) {
853 if (resumed_task == NULL) {
854 resumed_task = taskdata->td_parent; // In a serialized task, the resumed
855 // task is the parent
856 }
857 } else {
858 KMP_DEBUG_ASSERT(resumed_task !=
859 NULL); // verify that resumed task is passed as argument
860 }
861
862 /* If the tasks' destructor thunk flag has been set, we need to invoke the
863 destructor thunk that has been generated by the compiler. The code is
864 placed here, since at this point other tasks might have been released
865 hence overlapping the destructor invocations with some other work in the
866 released tasks. The OpenMP spec is not specific on when the destructors
867 are invoked, so we should be free to choose. */
868 if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
869 kmp_routine_entry_t destr_thunk = task->data1.destructors;
870 KMP_ASSERT(destr_thunk);
871 destr_thunk(gtid, task);
872 }
873
874 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
875 KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
876 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
877
878 bool completed = true;
879 if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
880 if (taskdata->td_allow_completion_event.type ==
881 KMP_EVENT_ALLOW_COMPLETION) {
882 // event hasn't been fulfilled yet. Try to detach task.
883 __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
884 if (taskdata->td_allow_completion_event.type ==
886 // task finished execution
887 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
888 taskdata->td_flags.executing = 0; // suspend the finishing task
889
890#if OMPT_SUPPORT
891 // For a detached task, which is not completed, we only report the switch
892 // back to the resumed task here; omp_fulfill_event signals completion later.
893 // Locking is necessary to avoid a race with ompt_task_late_fulfill.
894 if (ompt)
895 __ompt_task_finish(task, resumed_task, ompt_task_detach);
896#endif
897
898 // no access to taskdata after this point!
899 // __kmp_fulfill_event might free taskdata at any time from now
900
901 taskdata->td_flags.proxy = TASK_PROXY; // proxify!
902 completed = false;
903 }
904 __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
905 }
906 }
907
908 // Tasks with valid target async handles must be re-enqueued.
909 if (taskdata->td_target_data.async_handle != NULL) {
910 // Note: no need to translate gtid to its shadow. If the current thread is a
911 // hidden helper one, then the gtid is already correct. Otherwise, hidden
912 // helper threads are disabled, and gtid refers to an OpenMP thread.
913#if OMPT_SUPPORT
914 if (ompt) {
915 __ompt_task_finish(task, resumed_task, ompt_task_switch);
916 }
917#endif
918 __kmpc_give_task(task, __kmp_tid_from_gtid(gtid));
919 if (KMP_HIDDEN_HELPER_THREAD(gtid))
920 __kmp_hidden_helper_worker_thread_signal();
921 completed = false;
922 }
923
924 if (completed) {
925 taskdata->td_flags.complete = 1; // mark the task as completed
926#if OMPX_TASKGRAPH
927 taskdata->td_flags.onced = 1; // mark the task as ran once already
928#endif
929
930#if OMPT_SUPPORT
931 // This is not a detached task, we are done here
932 if (ompt)
933 __ompt_task_finish(task, resumed_task, ompt_task_complete);
934#endif
935 // TODO: What would be the balance between the conditions in the function
936 // and an atomic operation?
937 if (__kmp_track_children_task(taskdata)) {
938 __kmp_release_deps(gtid, taskdata);
939 // Predecrement simulated by "- 1" calculation
940#if KMP_DEBUG
941 children = -1 +
942#endif
943 KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
944 KMP_DEBUG_ASSERT(children >= 0);
945#if OMPX_TASKGRAPH
946 if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
947#else
948 if (taskdata->td_taskgroup)
949#endif
950 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
951 } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
952 task_team->tt.tt_hidden_helper_task_encountered)) {
953 // if we found proxy or hidden helper tasks there could exist a dependency
954 // chain with the proxy task as origin
955 __kmp_release_deps(gtid, taskdata);
956 }
957 // td_flags.executing must be marked as 0 after __kmp_release_deps has been
958 // called. Otherwise, if a task is executed immediately from the
959 // release_deps code, the flag will be reset to 1 again by this same
960 // function
961 KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
962 taskdata->td_flags.executing = 0; // suspend the finishing task
963
964 // Decrement the counter of hidden helper tasks to be executed.
965 if (taskdata->td_flags.hidden_helper) {
966 // Hidden helper tasks can only be executed by hidden helper threads.
967 KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
968 KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
969 }
970 }
971
972 KA_TRACE(
973 20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
974 gtid, taskdata, children));
975
976 // Free this task and then ancestor tasks if they have no children.
977 // Restore th_current_task first as suggested by John:
978 // johnmc: if an asynchronous inquiry peers into the runtime system
979 // it doesn't see the freed task as the current task.
980 thread->th.th_current_task = resumed_task;
981 if (completed)
982 __kmp_free_task_and_ancestors(gtid, taskdata, thread);
983
984 // TODO: GEH - make sure root team implicit task is initialized properly.
985 // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
986 resumed_task->td_flags.executing = 1; // resume previous task
987
988#if OMPX_TASKGRAPH
989 if (is_taskgraph && __kmp_track_children_task(taskdata) &&
990 taskdata->td_taskgroup) {
991 // TDG: we only release taskgroup barrier here because
992 // free_task_and_ancestors will call
993 // __kmp_free_task, which resets all task parameters such as
994 // taskdata->started, etc. If we release the barrier earlier, these
995 // parameters could be read before being reset. This is not an issue for
996 // non-TDG implementation because we never reuse a task(data) structure
997 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
998 }
999#endif
1000
1001 KA_TRACE(
1002 10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
1003 gtid, taskdata, resumed_task));
1004
1005 return;
1006}
1007
1008template <bool ompt>
1009static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
1010 kmp_int32 gtid,
1011 kmp_task_t *task) {
1012 KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
1013 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1014 KMP_DEBUG_ASSERT(gtid >= 0);
1015 // this routine will provide task to resume
1016 __kmp_task_finish<ompt>(gtid, task, NULL);
1017
1018 KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
1019 gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1020
1021#if OMPT_SUPPORT
1022 if (ompt) {
1023 ompt_frame_t *ompt_frame;
1024 __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1025 ompt_frame->enter_frame = ompt_data_none;
1026 ompt_frame->enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
1027 }
1028#endif
1029
1030 return;
1031}
1032
1033#if OMPT_SUPPORT
1034OMPT_NOINLINE
1035void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
1036 kmp_task_t *task) {
1037 __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
1038}
1039#endif // OMPT_SUPPORT
1040
1041// __kmpc_omp_task_complete_if0: report that a task has completed execution
1042//
1043// loc_ref: source location information; points to end of task block.
1044// gtid: global thread number.
1045// task: task thunk for the completed task.
1046void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1047 kmp_task_t *task) {
1048#if OMPT_SUPPORT
1049 if (UNLIKELY(ompt_enabled.enabled)) {
1050 __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1051 return;
1052 }
1053#endif
1054 __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1055}
1056
1057#ifdef TASK_UNUSED
1058// __kmpc_omp_task_complete: report that a task has completed execution
1059// NEVER GENERATED BY COMPILER, DEPRECATED!!!
1060void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1061 kmp_task_t *task) {
1062 KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1063 loc_ref, KMP_TASK_TO_TASKDATA(task)));
1064
1065 __kmp_task_finish<false>(gtid, task,
1066 NULL); // Not sure how to find task to resume
1067
1068 KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1069 loc_ref, KMP_TASK_TO_TASKDATA(task)));
1070 return;
1071}
1072#endif // TASK_UNUSED
1073
1074// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1075// task for a given thread
1076//
1077// loc_ref: reference to source location of parallel region
1078// this_thr: thread data structure corresponding to implicit task
1079// team: team for this_thr
1080// tid: thread id of given thread within team
1081// set_curr_task: TRUE if need to push current task to thread
1082// NOTE: Routine does not set up the implicit task ICVS. This is assumed to
1083// have already been done elsewhere.
1084// TODO: Get better loc_ref. Value passed in may be NULL
1085void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1086 kmp_team_t *team, int tid, int set_curr_task) {
1087 kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1088
1089 KF_TRACE(
1090 10,
1091 ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1092 tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1093
1094 task->td_task_id = KMP_GEN_TASK_ID();
1095 task->td_team = team;
1096 // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
1097 // in debugger)
1098 task->td_ident = loc_ref;
1099 task->td_taskwait_ident = NULL;
1100 task->td_taskwait_counter = 0;
1101 task->td_taskwait_thread = 0;
1102
1103 task->td_flags.tiedness = TASK_TIED;
1104 task->td_flags.tasktype = TASK_IMPLICIT;
1105 task->td_flags.proxy = TASK_FULL;
1106
1107 // All implicit tasks are executed immediately, not deferred
1108 task->td_flags.task_serial = 1;
1109 task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1110 task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1111
1112 task->td_flags.started = 1;
1113 task->td_flags.executing = 1;
1114 task->td_flags.complete = 0;
1115 task->td_flags.freed = 0;
1116#if OMPX_TASKGRAPH
1117 task->td_flags.onced = 0;
1118#endif
1119
1120 task->td_depnode = NULL;
1121 task->td_last_tied = task;
1122 task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1123
1124 if (set_curr_task) { // only do this init first time thread is created
1125 KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1126 // Not used: don't need to deallocate implicit task
1127 KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1128 task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1129 task->td_dephash = NULL;
1130 __kmp_push_current_task_to_thread(this_thr, team, tid);
1131 } else {
1132 KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1133 KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1134 }
1135
1136#if OMPT_SUPPORT
1137 if (UNLIKELY(ompt_enabled.enabled))
1138 __ompt_task_init(task, tid);
1139#endif
1140
1141 KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1142 team, task));
1143}
1144
1145// __kmp_finish_implicit_task: Release resources associated with implicit tasks
1146// at the end of parallel regions. Some resources are kept for reuse in the next
1147// parallel region.
1148//
1149// thread: thread data structure corresponding to implicit task
1150void __kmp_finish_implicit_task(kmp_info_t *thread) {
1151 kmp_taskdata_t *task = thread->th.th_current_task;
1152#if ENABLE_LIBOMPTARGET
1153 // Give an opportunity to the offload runtime to synchronize any unfinished
1154 // target async regions before finishing the implicit task
1155 if (UNLIKELY(kmp_target_sync_cb != NULL))
1156 (*kmp_target_sync_cb)(NULL, thread->th.th_info.ds.ds_gtid,
1157 KMP_TASKDATA_TO_TASK(task), NULL);
1158#endif // ENABLE_LIBOMPTARGET
1159 if (task->td_dephash) {
1160 int children;
1161 task->td_flags.complete = 1;
1162#if OMPX_TASKGRAPH
1163 task->td_flags.onced = 1;
1164#endif
1165 children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1166 kmp_tasking_flags_t flags_old = task->td_flags;
1167 if (children == 0 && flags_old.complete == 1) {
1168 kmp_tasking_flags_t flags_new = flags_old;
1169 flags_new.complete = 0;
1170 if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1171 *RCAST(kmp_int32 *, &flags_old),
1172 *RCAST(kmp_int32 *, &flags_new))) {
1173 KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1174 "dephash of implicit task %p\n",
1175 thread->th.th_info.ds.ds_gtid, task));
1176 __kmp_dephash_free_entries(thread, task->td_dephash);
1177 }
1178 }
1179 }
1180}
1181
1182// __kmp_free_implicit_task: Release resources associated with implicit tasks
1183// when these regions are destroyed
1184//
1185// thread: thread data structure corresponding to implicit task
1186void __kmp_free_implicit_task(kmp_info_t *thread) {
1187 kmp_taskdata_t *task = thread->th.th_current_task;
1188 if (task && task->td_dephash) {
1189 __kmp_dephash_free(thread, task->td_dephash);
1190 task->td_dephash = NULL;
1191 }
1192}
1193
1194// Round up a size to a power of two specified by val: Used to insert padding
1195// between structures co-allocated using a single malloc() call
1196static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1197 if (size & (val - 1)) {
1198 size &= ~(val - 1);
1199 if (size <= KMP_SIZE_T_MAX - val) {
1200 size += val; // Round up if there is no overflow.
1201 }
1202 }
1203 return size;
1204} // __kmp_round_up_to_val
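// Example of the rounding above (values are illustrative): with val == 8,
// size == 42 becomes (42 & ~7) + 8 == 48, while size == 48 is already a
// multiple of 8 and is returned unchanged; the overflow guard only skips the
// "+ val" step when the addition would wrap past KMP_SIZE_T_MAX.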
1205
1206// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1207//
1208// loc_ref: source location information
1209// gtid: global thread number.
1210// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1211// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1212// sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1213// private vars accessed in task.
1214// sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1215// in task.
1216// task_entry: Pointer to task code entry point generated by compiler.
1217// returns: a pointer to the allocated kmp_task_t structure (task).
1218kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1219 kmp_tasking_flags_t *flags,
1220 size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1221 kmp_routine_entry_t task_entry) {
1222 kmp_task_t *task;
1223 kmp_taskdata_t *taskdata;
1224 kmp_info_t *thread = __kmp_threads[gtid];
1225 kmp_team_t *team = thread->th.th_team;
1226 kmp_taskdata_t *parent_task = thread->th.th_current_task;
1227 size_t shareds_offset;
1228
1229 if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1230 __kmp_middle_initialize();
1231
1232 if (flags->hidden_helper) {
1233 if (__kmp_enable_hidden_helper) {
1234 if (!TCR_4(__kmp_init_hidden_helper))
1235 __kmp_hidden_helper_initialize();
1236 } else {
1237 // If the hidden helper task is not enabled, reset the flag to FALSE.
1238 flags->hidden_helper = FALSE;
1239 }
1240 }
1241
1242 KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1243 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1244 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1245 sizeof_shareds, task_entry));
1246
1247 KMP_DEBUG_ASSERT(parent_task);
1248 if (parent_task->td_flags.final) {
1249 if (flags->merged_if0) {
1250 }
1251 flags->final = 1;
1252 }
1253
1254 if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1255 // Untied task encountered causes the TSC algorithm to check entire deque of
1256 // the victim thread. If no untied task encountered, then checking the head
1257 // of the deque should be enough.
1258 KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1259 }
1260
1261 // Detachable tasks are not proxy tasks yet but could be in the future. Doing
1262 // the tasking setup
1263 // when that happens is too late.
1264 if (UNLIKELY(flags->proxy == TASK_PROXY ||
1265 flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
1266 if (flags->proxy == TASK_PROXY) {
1267 flags->tiedness = TASK_UNTIED;
1268 flags->merged_if0 = 1;
1269 }
1270 /* are we running in a sequential parallel or tskm_immediate_exec... we need
1271 tasking support enabled */
1272 if ((thread->th.th_task_team) == NULL) {
1273 /* This should only happen if the team is serialized
1274 setup a task team and propagate it to the thread */
1275 KMP_DEBUG_ASSERT(team->t.t_serialized);
1276 KA_TRACE(30,
1277 ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1278 gtid));
1279 __kmp_task_team_setup(thread, team);
1280 thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1281 }
1282 kmp_task_team_t *task_team = thread->th.th_task_team;
1283
1284 /* tasking must be enabled now as the task might not be pushed */
1285 if (!KMP_TASKING_ENABLED(task_team)) {
1286 KA_TRACE(
1287 30,
1288 ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1289 __kmp_enable_tasking(task_team, thread);
1290 kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1291 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1292 // No lock needed since only owner can allocate
1293 if (thread_data->td.td_deque == NULL) {
1294 __kmp_alloc_task_deque(thread, thread_data);
1295 }
1296 }
1297
1298 if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
1299 task_team->tt.tt_found_proxy_tasks == FALSE)
1300 TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1301 if (flags->hidden_helper &&
1302 task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1303 TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1304 }
1305
1306 // Calculate shared structure offset including padding after kmp_task_t struct
1307 // to align pointers in shared struct
1308 shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1309 shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(kmp_uint64));
1310
1311 // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1312 KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1313 shareds_offset));
1314 KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1315 sizeof_shareds));
1316
1317 // Avoid double allocation here by combining shareds with taskdata
1318#if USE_FAST_MEMORY
1319 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1320 sizeof_shareds);
1321#else /* ! USE_FAST_MEMORY */
1322 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1323 sizeof_shareds);
1324#endif /* USE_FAST_MEMORY */
1325
1326 task = KMP_TASKDATA_TO_TASK(taskdata);
1327
1328// Make sure task & taskdata are aligned appropriately
1329#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD
1330 KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1331 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1332#else
1333 KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1334 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1335#endif
1336 if (sizeof_shareds > 0) {
1337 // Avoid double allocation here by combining shareds with taskdata
1338 task->shareds = &((char *)taskdata)[shareds_offset];
1339 // Make sure shareds struct is aligned to pointer size
1340 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1341 0);
1342 } else {
1343 task->shareds = NULL;
1344 }
1345 task->routine = task_entry;
1346 task->part_id = 0; // AC: Always start with 0 part id
1347
1348 taskdata->td_task_id = KMP_GEN_TASK_ID();
1349 taskdata->td_team = thread->th.th_team;
1350 taskdata->td_alloc_thread = thread;
1351 taskdata->td_parent = parent_task;
1352 taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1353 KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1354 taskdata->td_ident = loc_ref;
1355 taskdata->td_taskwait_ident = NULL;
1356 taskdata->td_taskwait_counter = 0;
1357 taskdata->td_taskwait_thread = 0;
1358 KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1359 // avoid copying icvs for proxy tasks
1360 if (flags->proxy == TASK_FULL)
1361 copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1362
1363 taskdata->td_flags = *flags;
1364 taskdata->td_task_team = thread->th.th_task_team;
1365 taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1366 taskdata->td_flags.tasktype = TASK_EXPLICIT;
1367 // If it is hidden helper task, we need to set the team and task team
1368 // correspondingly.
1369 if (flags->hidden_helper) {
1370 kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1371 taskdata->td_team = shadow_thread->th.th_team;
1372 taskdata->td_task_team = shadow_thread->th.th_task_team;
1373 }
1374
1375 // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1376 taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1377
1378 // GEH - TODO: fix this to copy parent task's value of team_serial flag
1379 taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1380
1381 // GEH - Note we serialize the task if the team is serialized to make sure
1382 // implicit parallel region tasks are not left until program termination to
1383 // execute. Also, it helps locality to execute immediately.
1384
1385 taskdata->td_flags.task_serial =
1386 (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1387 taskdata->td_flags.tasking_ser || flags->merged_if0);
1388
1389 taskdata->td_flags.started = 0;
1390 taskdata->td_flags.executing = 0;
1391 taskdata->td_flags.complete = 0;
1392 taskdata->td_flags.freed = 0;
1393#if OMPX_TASKGRAPH
1394 taskdata->td_flags.onced = 0;
1395 taskdata->is_taskgraph = 0;
1396 taskdata->tdg = nullptr;
1397#endif
1398 KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1399 // start at one because counts current task and children
1400 KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1401 taskdata->td_taskgroup =
1402 parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1403 taskdata->td_dephash = NULL;
1404 taskdata->td_depnode = NULL;
1405 taskdata->td_target_data.async_handle = NULL;
1406 if (flags->tiedness == TASK_UNTIED)
1407 taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1408 else
1409 taskdata->td_last_tied = taskdata;
1411#if OMPT_SUPPORT
1412 if (UNLIKELY(ompt_enabled.enabled))
1413 __ompt_task_init(taskdata, gtid);
1414#endif
1415 // TODO: What would be the balance between the conditions in the function and
1416 // an atomic operation?
1417 if (__kmp_track_children_task(taskdata)) {
1418 KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1419 if (parent_task->td_taskgroup)
1420 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1421 // Only need to keep track of allocated child tasks for explicit tasks since
1422 // implicit not deallocated
1423 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1424 KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1425 }
1426 if (flags->hidden_helper) {
1427 taskdata->td_flags.task_serial = FALSE;
1428 // Increment the number of hidden helper tasks to be executed
1429 KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1430 }
1431 }
1432
1433#if OMPX_TASKGRAPH
1434 kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
1435 if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
1436 (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
1437 taskdata->is_taskgraph = 1;
1438 taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
1439 taskdata->td_task_id = KMP_GEN_TASK_ID();
1440 taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
1441 }
1442#endif
1443 KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1444 gtid, taskdata, taskdata->td_parent));
1445
1446 return task;
1447}
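// Resulting allocation layout (sketch): a single block holds
//   [ kmp_taskdata_t | kmp_task_t + private data (sizeof_kmp_task_t) |
//     padding to an 8-byte boundary | shareds (sizeof_shareds) ]
// with task->shareds pointing at shareds_offset inside the same block, which
// is why __kmp_free_task releases everything with a single deallocation.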
1448
1449kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1450 kmp_int32 flags, size_t sizeof_kmp_task_t,
1451 size_t sizeof_shareds,
1452 kmp_routine_entry_t task_entry) {
1453 kmp_task_t *retval;
1454 kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1455 __kmp_assert_valid_gtid(gtid);
1456 input_flags->native = FALSE;
1457 // __kmp_task_alloc() sets up all other runtime flags
1458 KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1459 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1460 gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1461 input_flags->proxy ? "proxy" : "",
1462 input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1463 sizeof_shareds, task_entry));
1464
1465 retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1466 sizeof_shareds, task_entry);
1467
1468 KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1469
1470 return retval;
1471}
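// Usage sketch (illustrative only; the outlined entry point, flag value and
// sizes below are assumptions, not emitted by this file): for a plain tied
// "#pragma omp task" a compiler typically generates something like
//
//   kmp_int32 my_task_entry(kmp_int32 gtid, kmp_task_t *t) { /* body */ return 0; }
//   ...
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /*flags=*/1 /* tied */,
//                                         sizeof(kmp_task_t) + private_size,
//                                         shareds_size, &my_task_entry);
//   __kmpc_omp_task(&loc, gtid, t); // defer or execute the new task
//
// where __kmpc_omp_task is the deferral entry point defined later in this
// file.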
1472
1473kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1474 kmp_int32 flags,
1475 size_t sizeof_kmp_task_t,
1476 size_t sizeof_shareds,
1477 kmp_routine_entry_t task_entry,
1478 kmp_int64 device_id) {
1479 auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1480 // target task is untied defined in the specification
1481 input_flags.tiedness = TASK_UNTIED;
1482 input_flags.target = 1;
1483
1484 if (__kmp_enable_hidden_helper)
1485 input_flags.hidden_helper = TRUE;
1486
1487 return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1488 sizeof_shareds, task_entry);
1489}
1490
1491/*!
1492@ingroup TASKING
1493@param loc_ref location of the original task directive
1494@param gtid Global Thread ID of encountering thread
1495@param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
1496task''
1497@param naffins Number of affinity items
1498@param affin_list List of affinity items
1499@return Returns non-zero if registering affinity information was not successful.
1500 Returns 0 if registration was successful
1501This entry registers the affinity information attached to a task with the task
1502thunk structure kmp_taskdata_t.
1503*/
1504
1505kmp_int32 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1506 kmp_task_t *new_task, kmp_int32 naffins,
1507 kmp_task_affinity_info_t *affin_list) {
1508 return 0;
1509}
1510
1511// __kmp_invoke_task: invoke the specified task
1512//
1513// gtid: global thread ID of caller
1514// task: the task to invoke
1515// current_task: the task to resume after task invocation
1516#ifdef __s390x__
1517__attribute__((target("backchain")))
1518#endif
1519static void
1520__kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1521 kmp_taskdata_t *current_task) {
1522 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1523 kmp_info_t *thread;
1524 int discard = 0 /* false */;
1525 KA_TRACE(
1526 30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1527 gtid, taskdata, current_task));
1529 if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1530 taskdata->td_flags.complete == 1)) {
1531 // This is a proxy task that was already completed but it needs to run
1532 // its bottom-half finish
1533 KA_TRACE(
1534 30,
1535 ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1536 gtid, taskdata));
1537
1538 __kmp_bottom_half_finish_proxy(gtid, task);
1539
1540 KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1541 "proxy task %p, resuming task %p\n",
1542 gtid, taskdata, current_task));
1543
1544 return;
1545 }
1546
1547#if OMPT_SUPPORT
1548 // For untied tasks, the first task executed only calls __kmpc_omp_task and
1549 // does not execute code.
1550 ompt_thread_info_t oldInfo;
1551 if (UNLIKELY(ompt_enabled.enabled)) {
1552 // Store the threads states and restore them after the task
1553 thread = __kmp_threads[gtid];
1554 oldInfo = thread->th.ompt_thread_info;
1555 thread->th.ompt_thread_info.wait_id = 0;
1556 thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1557 ? ompt_state_work_serial
1558 : ompt_state_work_parallel;
1559 taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1560 }
1561#endif
1562
1563 // Proxy tasks are not handled by the runtime
1564 if (taskdata->td_flags.proxy != TASK_PROXY) {
1565 __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1566 }
1567
1568 // TODO: cancel tasks if the parallel region has also been cancelled
1569 // TODO: check if this sequence can be hoisted above __kmp_task_start
1570 // if cancellation has been enabled for this run ...
1571 if (UNLIKELY(__kmp_omp_cancellation)) {
1572 thread = __kmp_threads[gtid];
1573 kmp_team_t *this_team = thread->th.th_team;
1574 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1575 if ((taskgroup && taskgroup->cancel_request) ||
1576 (this_team->t.t_cancel_request == cancel_parallel)) {
1577#if OMPT_SUPPORT && OMPT_OPTIONAL
1578 ompt_data_t *task_data;
1579 if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1580 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1581 ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1582 task_data,
1583 ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1584 : ompt_cancel_parallel) |
1585 ompt_cancel_discarded_task,
1586 NULL);
1587 }
1588#endif
1589 KMP_COUNT_BLOCK(TASK_cancelled);
1590 // this task belongs to a task group and we need to cancel it
1591 discard = 1 /* true */;
1592 }
1593 }
1594
1595 // Invoke the task routine and pass in relevant data.
1596 // Thunks generated by gcc take a different argument list.
1597 if (!discard) {
1598 if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1599 taskdata->td_last_tied = current_task->td_last_tied;
1600 KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1601 }
1602#if KMP_STATS_ENABLED
1603 KMP_COUNT_BLOCK(TASK_executed);
1604 switch (KMP_GET_THREAD_STATE()) {
1605 case FORK_JOIN_BARRIER:
1606 KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1607 break;
1608 case PLAIN_BARRIER:
1609 KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1610 break;
1611 case TASKYIELD:
1612 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1613 break;
1614 case TASKWAIT:
1615 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1616 break;
1617 case TASKGROUP:
1618 KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1619 break;
1620 default:
1621 KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1622 break;
1623 }
1624#endif // KMP_STATS_ENABLED
1625
1626// OMPT task begin
1627#if OMPT_SUPPORT
1629 __ompt_task_start(task, current_task, gtid);
1630#endif
1631#if OMPT_SUPPORT && OMPT_OPTIONAL
1632 if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
1633 taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) {
1634 ompt_data_t instance = ompt_data_none;
1635 instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
1636 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1637 ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
1638 &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
1639 ompt_dispatch_taskloop_chunk, instance);
1640 taskdata->ompt_task_info.dispatch_chunk = {0, 0};
1641 }
1642#endif // OMPT_SUPPORT && OMPT_OPTIONAL
1643
1644#if OMPD_SUPPORT
1645 if (ompd_state & OMPD_ENABLE_BP)
1646 ompd_bp_task_begin();
1647#endif
1648
1649#if USE_ITT_BUILD && USE_ITT_NOTIFY
1650 kmp_uint64 cur_time;
1651 kmp_int32 kmp_itt_count_task =
1652 __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1653 current_task->td_flags.tasktype == TASK_IMPLICIT;
1654 if (kmp_itt_count_task) {
1655 thread = __kmp_threads[gtid];
1656 // Time outer level explicit task on barrier for adjusting imbalance time
1657 if (thread->th.th_bar_arrive_time)
1658 cur_time = __itt_get_timestamp();
1659 else
1660 kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1661 }
1662 KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1663#endif
1664
1665#if ENABLE_LIBOMPTARGET
1666 if (taskdata->td_target_data.async_handle != NULL) {
1667 // If we have a valid target async handle, that means that we have already
1668 // executed the task routine once. We must query for the handle completion
1669 // instead of re-executing the routine.
1670 KMP_ASSERT(tgt_target_nowait_query);
1671 tgt_target_nowait_query(&taskdata->td_target_data.async_handle);
1672 } else
1673#endif
1674 if (task->routine != NULL) {
1675#ifdef KMP_GOMP_COMPAT
1676 if (taskdata->td_flags.native) {
1677 ((void (*)(void *))(*(task->routine)))(task->shareds);
1678 } else
1679#endif /* KMP_GOMP_COMPAT */
1680 {
1681 (*(task->routine))(gtid, task);
1682 }
1683 }
1685
1686#if USE_ITT_BUILD && USE_ITT_NOTIFY
1687 if (kmp_itt_count_task) {
1688 // Barrier imbalance - adjust arrive time with the task duration
1689 thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1690 }
1691 KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1692 KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1693#endif
1694 }
1695
1696#if OMPD_SUPPORT
1697 if (ompd_state & OMPD_ENABLE_BP)
1698 ompd_bp_task_end();
1699#endif
1700
1701 // Proxy tasks are not handled by the runtime
1702 if (taskdata->td_flags.proxy != TASK_PROXY) {
1703#if OMPT_SUPPORT
1705 thread->th.ompt_thread_info = oldInfo;
1706 if (taskdata->td_flags.tiedness == TASK_TIED) {
1707 taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1708 }
1709 __kmp_task_finish<true>(gtid, task, current_task);
1710 } else
1711#endif
1712 __kmp_task_finish<false>(gtid, task, current_task);
1713 }
1714#if OMPT_SUPPORT
1715 else if (UNLIKELY(ompt_enabled.enabled && taskdata->td_flags.target)) {
1716 __ompt_task_finish(task, current_task, ompt_task_switch);
1717 }
1718#endif
1719
1720 KA_TRACE(
1721 30,
1722 ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1723 gtid, taskdata, current_task));
1724 return;
1725}
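// Illustrative sketch (not part of the runtime): the two task-entry calling
// conventions dispatched above. A GOMP-compatible ("native") thunk receives only
// the shareds pointer, while a KMP thunk receives the caller's gtid and the task
// thunk itself. Names below are hypothetical.
#if 0
static void example_gomp_style_entry(void *shareds) {
  (void)shareds; // task body operates on the shared/firstprivate data block
}
static kmp_int32 example_kmp_style_entry(kmp_int32 gtid, void *tdata) {
  kmp_task_t *task = (kmp_task_t *)tdata;
  (void)gtid;
  (void)task->shareds; // task body reads its shared data through the thunk
  return 0;
}
#endif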
1726
1727// __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1728//
1729// loc_ref: location of original task pragma (ignored)
1730// gtid: Global Thread ID of encountering thread
1731// new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1732// Returns:
1733// TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and
1734// queued for later resumption.
1735// TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to
1736// be resumed later.
1738 kmp_task_t *new_task) {
1739 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1740
1741 KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1742 loc_ref, new_taskdata));
1743
1744#if OMPT_SUPPORT
1747 parent = new_taskdata->td_parent;
1748 if (ompt_enabled.ompt_callback_task_create) {
1749 ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1750 &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1751 &(new_taskdata->ompt_task_info.task_data),
1752 TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1754 }
1755 }
1756#endif
1757
1758 /* Should we execute the new task or queue it? For now, let's just always try
1759 to queue it. If the queue fills up, then we'll execute it. */
1760
1761 if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1762 { // Execute this task immediately
1763 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1764 new_taskdata->td_flags.task_serial = 1;
1765 __kmp_invoke_task(gtid, new_task, current_task);
1766 }
1767
1768 KA_TRACE(
1769 10,
1770 ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1771 "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1772 gtid, loc_ref, new_taskdata));
1773
1774#if OMPT_SUPPORT
1776 parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1777 parent->ompt_task_info.frame.enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
1778 }
1779#endif
1781}
1782
1783// __kmp_omp_task: Schedule a non-thread-switchable task for execution
1784//
1785// gtid: Global Thread ID of encountering thread
1786// new_task: non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1787// serialize_immediate: if TRUE then if the task is executed immediately its
1788// execution will be serialized
1789// Returns:
1790// TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and
1791// queued for later resumption.
1792// TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to
1793// be resumed later.
1795 bool serialize_immediate) {
1796 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1797
1798#if OMPX_TASKGRAPH
1799 if (new_taskdata->is_taskgraph &&
1800 __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
1801 kmp_tdg_info_t *tdg = new_taskdata->tdg;
1802 // extend the record_map if needed
1803 if (new_taskdata->td_tdg_task_id >= new_taskdata->tdg->map_size) {
1804 __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
1805 // map_size could have been updated by another thread if recursive
1806 // taskloop
1807 if (new_taskdata->td_tdg_task_id >= tdg->map_size) {
1808 kmp_uint old_size = tdg->map_size;
1809 kmp_uint new_size = old_size * 2;
1810 kmp_node_info_t *old_record = tdg->record_map;
1811 kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
1812 new_size * sizeof(kmp_node_info_t));
1813
1814 KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
1815 tdg->record_map = new_record;
1816
1817 __kmp_free(old_record);
1818
1819 for (kmp_int i = old_size; i < new_size; i++) {
1820 kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
1821 __kmp_successors_size * sizeof(kmp_int32));
1822 new_record[i].task = nullptr;
1823 new_record[i].successors = successorsList;
1824 new_record[i].nsuccessors = 0;
1825 new_record[i].npredecessors = 0;
1826 new_record[i].successors_size = __kmp_successors_size;
1827 KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
1828 }
1829        // update the size at the end, so that other threads do not use
1830        // old_record while map_size has already been updated
1831 tdg->map_size = new_size;
1832 }
1833 __kmp_release_bootstrap_lock(&tdg->graph_lock);
1834 }
1835 // record a task
1836 if (tdg->record_map[new_taskdata->td_tdg_task_id].task == nullptr) {
1837 tdg->record_map[new_taskdata->td_tdg_task_id].task = new_task;
1838 tdg->record_map[new_taskdata->td_tdg_task_id].parent_task =
1839 new_taskdata->td_parent;
1840 KMP_ATOMIC_INC(&tdg->num_tasks);
1841 }
1842 }
1843#endif
1844
1845 /* Should we execute the new task or queue it? For now, let's just always try
1846 to queue it. If the queue fills up, then we'll execute it. */
1847 if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1848 __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1849 { // Execute this task immediately
1850 kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1851 if (serialize_immediate)
1852 new_taskdata->td_flags.task_serial = 1;
1853 __kmp_invoke_task(gtid, new_task, current_task);
1856 kmp_info_t *this_thr = __kmp_threads[gtid];
1857 kmp_team_t *team = this_thr->th.th_team;
1858 kmp_int32 nthreads = this_thr->th.th_team_nproc;
1859 for (int i = 0; i < nthreads; ++i) {
1860 kmp_info_t *thread = team->t.t_threads[i];
1861 if (thread == this_thr)
1862 continue;
1863 if (thread->th.th_sleep_loc != NULL) {
1865 break; // awake one thread at a time
1866 }
1867 }
1868 }
1870}
1871
1872// __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1873// non-thread-switchable task from the parent thread only!
1874//
1875// loc_ref: location of original task pragma (ignored)
1876// gtid: Global Thread ID of encountering thread
1877// new_task: non-thread-switchable task thunk allocated by
1878// __kmp_omp_task_alloc()
1879// Returns:
1880// TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and
1881// queued for later resumption.
1882// TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to
1883// be resumed later.
1885 kmp_task_t *new_task) {
1886 kmp_int32 res;
1887 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1888
1889#if KMP_DEBUG || OMPT_SUPPORT
1890 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1891#endif
1892 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1893 new_taskdata));
1895
1896#if OMPT_SUPPORT
1897 kmp_taskdata_t *parent = NULL;
1899 if (!new_taskdata->td_flags.started) {
1900 OMPT_STORE_RETURN_ADDRESS(gtid);
1901 parent = new_taskdata->td_parent;
1902 if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1903 parent->ompt_task_info.frame.enter_frame.ptr =
1905 }
1906 if (ompt_enabled.ompt_callback_task_create) {
1907 ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1908 &(parent->ompt_task_info.task_data),
1909 &(parent->ompt_task_info.frame),
1910 &(new_taskdata->ompt_task_info.task_data),
1911 TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1912 OMPT_LOAD_RETURN_ADDRESS(gtid));
1913 }
1914 } else {
1915 // We are scheduling the continuation of an UNTIED task.
1916 // Scheduling back to the parent task.
1917 __ompt_task_finish(new_task,
1918 new_taskdata->ompt_task_info.scheduling_parent,
1919 ompt_task_switch);
1920 new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1921 }
1922 }
1923#endif
1924
1925 res = __kmp_omp_task(gtid, new_task, true);
1926
1927 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1928 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1929 gtid, loc_ref, new_taskdata));
1930#if OMPT_SUPPORT
1931 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1932 parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1933 }
1934#endif
1935 return res;
1936}
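// Illustrative sketch (not part of the runtime): the approximate code a compiler
// emits for "#pragma omp task" -- allocate the task thunk, fill in shared data,
// then hand it to __kmpc_omp_task(), which either queues it or, if the deque is
// full, runs it immediately via __kmp_invoke_task(). The flag value and shareds
// layout below are simplified assumptions, not the exact clang lowering.
#if 0
static kmp_int32 example_task_entry(kmp_int32 gtid, void *tdata) {
  kmp_task_t *task = (kmp_task_t *)tdata;
  int *shared_counter = *(int **)task->shareds; // the single shared pointer
  (*shared_counter)++;                          // the task body
  (void)gtid;
  return 0;
}

static void example_spawn_task(ident_t *loc, int *shared_counter) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  // flags = 1 requests a tied task; shareds hold one pointer.
  kmp_task_t *t = __kmpc_omp_task_alloc(loc, gtid, /*flags=*/1, sizeof(kmp_task_t),
                                        sizeof(int *), &example_task_entry);
  *(int **)t->shareds = shared_counter;
  __kmpc_omp_task(loc, gtid, t); // schedule; TASK_CURRENT_NOT_QUEUED expected
}
#endif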
1937
1938// __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1939// a taskloop task with the correct OMPT return address
1940//
1941// loc_ref: location of original task pragma (ignored)
1942// gtid: Global Thread ID of encountering thread
1943// new_task: non-thread-switchable task thunk allocated by
1944// __kmp_omp_task_alloc()
1945// codeptr_ra: return address for OMPT callback
1946// Returns:
1947// TASK_CURRENT_NOT_QUEUED (0) if the current task was not suspended and
1948// queued for later resumption.
1949// TASK_CURRENT_QUEUED (1) if the current task was suspended and queued to
1950// be resumed later.
1952 kmp_task_t *new_task, void *codeptr_ra) {
1953 kmp_int32 res;
1954 KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1955
1956#if KMP_DEBUG || OMPT_SUPPORT
1957 kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1958#endif
1959 KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1960 new_taskdata));
1961
1962#if OMPT_SUPPORT
1963 kmp_taskdata_t *parent = NULL;
1964 if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1965 parent = new_taskdata->td_parent;
1966 if (!parent->ompt_task_info.frame.enter_frame.ptr)
1967 parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1968 if (ompt_enabled.ompt_callback_task_create) {
1969 ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1970 &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1971 &(new_taskdata->ompt_task_info.task_data),
1972 TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, codeptr_ra);
1973 }
1974 }
1975#endif
1976
1977 res = __kmp_omp_task(gtid, new_task, true);
1978
1979 KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1980 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1981 gtid, loc_ref, new_taskdata));
1982#if OMPT_SUPPORT
1983 if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1984 parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1985 }
1986#endif
1987 return res;
1988}
1989
1990template <bool ompt>
1992 void *frame_address,
1993 void *return_address) {
1994 kmp_taskdata_t *taskdata = nullptr;
1995 kmp_info_t *thread;
1996 int thread_finished = FALSE;
1998
1999 KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
2000 KMP_DEBUG_ASSERT(gtid >= 0);
2001
2003 thread = __kmp_threads[gtid];
2004 taskdata = thread->th.th_current_task;
2005
2006#if OMPT_SUPPORT && OMPT_OPTIONAL
2007 ompt_data_t *my_task_data;
2008 ompt_data_t *my_parallel_data;
2009
2010 if (ompt) {
2011 my_task_data = &(taskdata->ompt_task_info.task_data);
2012 my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2013
2014 taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2015
2016 if (ompt_enabled.ompt_callback_sync_region) {
2017 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2018 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2019 my_task_data, return_address);
2020 }
2021
2022 if (ompt_enabled.ompt_callback_sync_region_wait) {
2023 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2024 ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2025 my_task_data, return_address);
2026 }
2027 }
2028#endif // OMPT_SUPPORT && OMPT_OPTIONAL
2029
2030#if ENABLE_LIBOMPTARGET
2031 // Give an opportunity to the offload runtime to make progress and create
2032 // any necessary proxy tasks
2033 if (UNLIKELY(kmp_target_sync_cb))
2034 (*kmp_target_sync_cb)(loc_ref, gtid, KMP_TASKDATA_TO_TASK(taskdata),
2035 NULL);
2036#endif // ENABLE_LIBOMPTARGET
2037
2038// Debugger: The taskwait is active. Store the location and the thread that
2039// encountered the taskwait.
2040#if USE_ITT_BUILD
2041// Note: These values are used by ITT events as well.
2042#endif /* USE_ITT_BUILD */
2043 taskdata->td_taskwait_counter += 1;
2044 taskdata->td_taskwait_ident = loc_ref;
2045 taskdata->td_taskwait_thread = gtid + 1;
2046
2047#if USE_ITT_BUILD
2048 void *itt_sync_obj = NULL;
2049#if USE_ITT_NOTIFY
2050 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2051#endif /* USE_ITT_NOTIFY */
2052#endif /* USE_ITT_BUILD */
2053
2054 bool must_wait =
2055 !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2056
2057 must_wait = must_wait || (thread->th.th_task_team != NULL &&
2058 thread->th.th_task_team->tt.tt_found_proxy_tasks);
2059 // If hidden helper thread is encountered, we must enable wait here.
2060 must_wait =
2061 must_wait ||
2062 (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2063 thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2064
2065 if (must_wait) {
2067 RCAST(std::atomic<kmp_uint32> *,
2068 &(taskdata->td_incomplete_child_tasks)),
2069 0U);
2070 while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
2071 flag.execute_tasks(thread, gtid, FALSE,
2072 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2074 }
2075 }
2076#if USE_ITT_BUILD
2077 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2078 KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2079#endif /* USE_ITT_BUILD */
2080
2081 // Debugger: The taskwait is completed. Location remains, but thread is
2082 // negated.
2083 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2084
2085#if OMPT_SUPPORT && OMPT_OPTIONAL
2086 if (ompt) {
2087 if (ompt_enabled.ompt_callback_sync_region_wait) {
2088 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2089 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2090 my_task_data, return_address);
2091 }
2092 if (ompt_enabled.ompt_callback_sync_region) {
2093 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2094 ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2095 my_task_data, return_address);
2096 }
2097 taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2098 }
2099#endif // OMPT_SUPPORT && OMPT_OPTIONAL
2100 }
2101
2102 KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2103 "returning TASK_CURRENT_NOT_QUEUED\n",
2104 gtid, taskdata));
2105
2107}
2108
2109#if OMPT_SUPPORT && OMPT_OPTIONAL
2111static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
2112 void *frame_address,
2113 void *return_address) {
2114 return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
2115 return_address);
2116}
2117#endif // OMPT_SUPPORT && OMPT_OPTIONAL
2118
2119// __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2120// complete
2122#if OMPT_SUPPORT && OMPT_OPTIONAL
2124 OMPT_STORE_RETURN_ADDRESS(gtid);
2125 return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2126 OMPT_LOAD_RETURN_ADDRESS(gtid));
2127 }
2128#endif
2129 return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2130}
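// Illustrative sketch (not part of the runtime): "#pragma omp taskwait" lowers,
// roughly, to a single call of the entry above; the encountering task then
// executes queued child tasks until td_incomplete_child_tasks drops to zero.
#if 0
static void example_taskwait(ident_t *loc) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  // ... child tasks were spawned here with __kmpc_omp_task() ...
  __kmpc_omp_taskwait(loc, gtid); // returns TASK_CURRENT_NOT_QUEUED
}
#endif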
2131
2132// __kmpc_omp_taskyield: switch to a different task
2133kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2134 kmp_taskdata_t *taskdata = NULL;
2135 kmp_info_t *thread;
2136 int thread_finished = FALSE;
2137
2138 KMP_COUNT_BLOCK(OMP_TASKYIELD);
2139 KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2140
2141 KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2142 gtid, loc_ref, end_part));
2144
2146 thread = __kmp_threads[gtid];
2147 taskdata = thread->th.th_current_task;
2148// Should we model this as a task wait or not?
2149// Debugger: The taskwait is active. Store the location and the thread that
2150// encountered the taskwait.
2151#if USE_ITT_BUILD
2152// Note: These values are used by ITT events as well.
2153#endif /* USE_ITT_BUILD */
2154 taskdata->td_taskwait_counter += 1;
2155 taskdata->td_taskwait_ident = loc_ref;
2156 taskdata->td_taskwait_thread = gtid + 1;
2157
2158#if USE_ITT_BUILD
2159 void *itt_sync_obj = NULL;
2160#if USE_ITT_NOTIFY
2161 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2162#endif /* USE_ITT_NOTIFY */
2163#endif /* USE_ITT_BUILD */
2164 if (!taskdata->td_flags.team_serial) {
2165 kmp_task_team_t *task_team = thread->th.th_task_team;
2166 if (task_team != NULL) {
2167 if (KMP_TASKING_ENABLED(task_team)) {
2168#if OMPT_SUPPORT
2170 thread->th.ompt_thread_info.ompt_task_yielded = 1;
2171#endif
2173 thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2174 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2176#if OMPT_SUPPORT
2178 thread->th.ompt_thread_info.ompt_task_yielded = 0;
2179#endif
2180 }
2181 }
2182 }
2183#if USE_ITT_BUILD
2184 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2185#endif /* USE_ITT_BUILD */
2186
2187 // Debugger: The taskwait is completed. Location remains, but thread is
2188 // negated.
2189 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2190 }
2191
2192 KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2193 "returning TASK_CURRENT_NOT_QUEUED\n",
2194 gtid, taskdata));
2195
2197}
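// Illustrative sketch (not part of the runtime): "#pragma omp taskyield" maps to
// the entry above; end_part is typically 0, and the call is effectively a no-op
// when the team is serialized or tasking has not been enabled yet.
#if 0
static void example_taskyield(ident_t *loc) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  __kmpc_omp_taskyield(loc, gtid, /*end_part=*/0);
}
#endif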
2198
2199// Task Reduction implementation
2200//
2201// Note: the initial implementation did not take into account the possibility
2202// of specifying omp_orig for the initializer of a UDR (user-defined reduction).
2203// The corrected implementation takes the omp_orig object into account.
2204// The compiler may use the old implementation if omp_orig is not specified.
2205
2206/*!
2207@ingroup BASIC_TYPES
2208@{
2209*/
2210
2211/*!
2212Flags for special info per task reduction item.
2213*/
2214typedef struct kmp_taskred_flags {
2215 /*! 1 - use lazy alloc/init (e.g. big objects, num tasks < num threads) */
2216 unsigned lazy_priv : 1;
2217 unsigned reserved31 : 31;
2219
2220/*!
2221Internal struct for reduction data item related info set up by compiler.
2222*/
2223typedef struct kmp_task_red_input {
2224 void *reduce_shar; /**< shared between tasks item to reduce into */
2225 size_t reduce_size; /**< size of data item in bytes */
2226 // three compiler-generated routines (init, fini are optional):
2227 void *reduce_init; /**< data initialization routine (single parameter) */
2228 void *reduce_fini; /**< data finalization routine */
2229 void *reduce_comb; /**< data combiner routine */
2230 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2232
2233/*!
2234Internal struct for reduction data item related info saved by the library.
2235*/
2236typedef struct kmp_taskred_data {
2237 void *reduce_shar; /**< shared between tasks item to reduce into */
2238 size_t reduce_size; /**< size of data item */
2239 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2240 void *reduce_priv; /**< array of thread specific items */
2241 void *reduce_pend; /**< end of private data for faster comparison op */
2242 // three compiler-generated routines (init, fini are optional):
2243 void *reduce_comb; /**< data combiner routine */
2244 void *reduce_init; /**< data initialization routine (two parameters) */
2245 void *reduce_fini; /**< data finalization routine */
2246 void *reduce_orig; /**< original item (can be used in UDR initializer) */
2248
2249/*!
2250Internal struct for reduction data item related info set up by compiler.
2251
2252New interface: added reduce_orig field to provide omp_orig for UDR initializer.
2253*/
2254typedef struct kmp_taskred_input {
2255 void *reduce_shar; /**< shared between tasks item to reduce into */
2256 void *reduce_orig; /**< original reduction item used for initialization */
2257 size_t reduce_size; /**< size of data item */
2258 // three compiler-generated routines (init, fini are optional):
2259 void *reduce_init; /**< data initialization routine (two parameters) */
2260 void *reduce_fini; /**< data finalization routine */
2261 void *reduce_comb; /**< data combiner routine */
2262 kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2264/*!
2265@}
2266*/
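// Illustrative sketch (not part of the runtime): how a compiler might fill one
// kmp_taskred_input_t entry for "task_reduction(+: sum)" using the new interface
// (reduce_orig supplied). The helper routine names are hypothetical; the field
// layout comes directly from the struct definitions above.
#if 0
static void example_red_init(void *priv, void *orig) { *(int *)priv = 0; (void)orig; }
static void example_red_comb(void *shar, void *priv) { *(int *)shar += *(int *)priv; }

static void example_fill_taskred_input(kmp_taskred_input_t *in, int *sum) {
  in->reduce_shar = sum;         // shared item to reduce into
  in->reduce_orig = sum;         // original item for UDR-style initializers
  in->reduce_size = sizeof(int); // size of one data item
  in->reduce_init = (void *)example_red_init; // two-parameter initializer
  in->reduce_fini = NULL;                     // no finalizer needed for int
  in->reduce_comb = (void *)example_red_comb; // combiner is mandatory
  in->flags = kmp_taskred_flags_t();          // no lazy allocation for a small item
}
#endif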
2267
2268template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2269template <>
2271 kmp_task_red_input_t &src) {
2272 item.reduce_orig = NULL;
2273}
2274template <>
2276 kmp_taskred_input_t &src) {
2277 if (src.reduce_orig != NULL) {
2278 item.reduce_orig = src.reduce_orig;
2279 } else {
2280 item.reduce_orig = src.reduce_shar;
2281 } // non-NULL reduce_orig means new interface used
2282}
2283
2284template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2285template <>
2287 size_t offset) {
2288 ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2289}
2290template <>
2292 size_t offset) {
2293 ((void (*)(void *, void *))item.reduce_init)(
2294 (char *)(item.reduce_priv) + offset, item.reduce_orig);
2295}
2296
2297template <typename T>
2298void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2300 kmp_info_t *thread = __kmp_threads[gtid];
2301 kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2302 kmp_uint32 nth = thread->th.th_team_nproc;
2304
2305 // check input data just in case
2306 KMP_ASSERT(tg != NULL);
2307 KMP_ASSERT(data != NULL);
2308 KMP_ASSERT(num > 0);
2309 if (nth == 1 && !__kmp_enable_hidden_helper) {
2310 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2311 gtid, tg));
2312 return (void *)tg;
2313 }
2314 KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2315 gtid, tg, num));
2317 thread, num * sizeof(kmp_taskred_data_t));
2318 for (int i = 0; i < num; ++i) {
2319 size_t size = data[i].reduce_size - 1;
2320 // round the size up to cache line per thread-specific item
2322 KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2323 arr[i].reduce_shar = data[i].reduce_shar;
2324 arr[i].reduce_size = size;
2325 arr[i].flags = data[i].flags;
2326 arr[i].reduce_comb = data[i].reduce_comb;
2327 arr[i].reduce_init = data[i].reduce_init;
2328 arr[i].reduce_fini = data[i].reduce_fini;
2329 __kmp_assign_orig<T>(arr[i], data[i]);
2330 if (!arr[i].flags.lazy_priv) {
2331 // allocate cache-line aligned block and fill it with zeros
2332 arr[i].reduce_priv = __kmp_allocate(nth * size);
2333 arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2334 if (arr[i].reduce_init != NULL) {
2335 // initialize all thread-specific items
2336 for (size_t j = 0; j < nth; ++j) {
2337 __kmp_call_init<T>(arr[i], j * size);
2338 }
2339 }
2340 } else {
2341 // only allocate space for pointers now,
2342 // objects will be lazily allocated/initialized if/when requested
2343 // note that __kmp_allocate zeroes the allocated memory
2344 arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2345 }
2346 }
2347 tg->reduce_data = (void *)arr;
2348 tg->reduce_num_data = num;
2349 return (void *)tg;
2350}
2351
2352/*!
2353@ingroup TASKING
2354@param gtid Global thread ID
2355@param num Number of data items to reduce
2356@param data Array of data for reduction
2357@return The taskgroup identifier
2358
2359Initialize task reduction for the taskgroup.
2360
2361Note: this entry assumes the optional compiler-generated initializer routine
2362has a single parameter, a pointer to the object to be initialized. That means
2363the reduction either does not use the omp_orig object, or omp_orig is
2364accessible without help of the runtime library.
2365*/
2366void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2367#if OMPX_TASKGRAPH
2368 kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2369 if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2370 kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2371 this_tdg->rec_taskred_data =
2372 __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2373 this_tdg->rec_num_taskred = num;
2374 KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2375 sizeof(kmp_task_red_input_t) * num);
2376 }
2377#endif
2379}
2380
2381/*!
2382@ingroup TASKING
2383@param gtid Global thread ID
2384@param num Number of data items to reduce
2385@param data Array of data for reduction
2386@return The taskgroup identifier
2387
2388Initialize task reduction for the taskgroup.
2389
2390Note: this entry assumes the optional compiler-generated initializer routine
2391has two parameters: a pointer to the object to be initialized and a pointer to omp_orig.
2392*/
2393void *__kmpc_taskred_init(int gtid, int num, void *data) {
2394#if OMPX_TASKGRAPH
2395 kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2396 if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2397 kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2398 this_tdg->rec_taskred_data =
2399 __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2400 this_tdg->rec_num_taskred = num;
2401 KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2402 sizeof(kmp_task_red_input_t) * num);
2403 }
2404#endif
2406}
2407
2408// Copy task reduction data (except for shared pointers).
2409template <typename T>
2411 kmp_taskgroup_t *tg, void *reduce_data) {
2413 KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2414 " from data %p\n",
2415 thr, tg, reduce_data));
2417 thr, num * sizeof(kmp_taskred_data_t));
2418 // threads will share private copies, thunk routines, sizes, flags, etc.:
2419 KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2420 for (int i = 0; i < num; ++i) {
2421 arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2422 }
2423 tg->reduce_data = (void *)arr;
2424 tg->reduce_num_data = num;
2425}
2426
2427/*!
2428@ingroup TASKING
2429@param gtid Global thread ID
2430@param tskgrp The taskgroup ID (optional)
2431@param data Shared location of the item
2432@return The pointer to per-thread data
2433
2434Get thread-specific location of data item
2435*/
2436void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2438 kmp_info_t *thread = __kmp_threads[gtid];
2439 kmp_int32 nth = thread->th.th_team_nproc;
2440 if (nth == 1)
2441 return data; // nothing to do
2442
2443 kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2444 if (tg == NULL)
2445 tg = thread->th.th_current_task->td_taskgroup;
2446 KMP_ASSERT(tg != NULL);
2448 kmp_int32 num;
2449 kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2450
2451#if OMPX_TASKGRAPH
2452 if ((thread->th.th_current_task->is_taskgraph) &&
2453 (!__kmp_tdg_is_recording(
2454 __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
2455 tg = thread->th.th_current_task->td_taskgroup;
2456 KMP_ASSERT(tg != NULL);
2457 KMP_ASSERT(tg->reduce_data != NULL);
2459 num = tg->reduce_num_data;
2460 }
2461#endif
2462
2463 KMP_ASSERT(data != NULL);
2464 while (tg != NULL) {
2466 num = tg->reduce_num_data;
2467 for (int i = 0; i < num; ++i) {
2468 if (!arr[i].flags.lazy_priv) {
2469 if (data == arr[i].reduce_shar ||
2470 (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2471 return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2472 } else {
2473 // check shared location first
2474 void **p_priv = (void **)(arr[i].reduce_priv);
2475 if (data == arr[i].reduce_shar)
2476 goto found;
2477 // check if we get some thread specific location as parameter
2478 for (int j = 0; j < nth; ++j)
2479 if (data == p_priv[j])
2480 goto found;
2481 continue; // not found, continue search
2482 found:
2483 if (p_priv[tid] == NULL) {
2484 // allocate thread specific object lazily
2485 p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2486 if (arr[i].reduce_init != NULL) {
2487 if (arr[i].reduce_orig != NULL) { // new interface
2488 ((void (*)(void *, void *))arr[i].reduce_init)(
2489 p_priv[tid], arr[i].reduce_orig);
2490 } else { // old interface (single parameter)
2491 ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2492 }
2493 }
2494 }
2495 return p_priv[tid];
2496 }
2497 }
2498 KMP_ASSERT(tg->parent);
2499 tg = tg->parent;
2500 }
2501 KMP_ASSERT2(0, "Unknown task reduction item");
2502 return NULL; // ERROR, this line never executed
2503}
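// Illustrative sketch (not part of the runtime): inside a participating task,
// compiler-generated code obtains the thread-specific copy of a reduction item
// through the lookup above and accumulates into it; tskgrp may be NULL to use
// the innermost taskgroup. Names below are hypothetical.
#if 0
static void example_task_body(int gtid, void *tskgrp, int *sum, int contribution) {
  int *priv = (int *)__kmpc_task_reduction_get_th_data(gtid, tskgrp, sum);
  *priv += contribution; // combined into *sum when the taskgroup finishes
}
#endif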
2504
2505// Finalize task reduction.
2506// Called from __kmpc_end_taskgroup()
2508 kmp_int32 nth = th->th.th_team_nproc;
2510 nth > 1 ||
2511 __kmp_enable_hidden_helper); // should not be called if nth == 1 unless we
2512 // are using hidden helper threads
2514 kmp_int32 num = tg->reduce_num_data;
2515 for (int i = 0; i < num; ++i) {
2516 void *sh_data = arr[i].reduce_shar;
2517 void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2518 void (*f_comb)(void *, void *) =
2519 (void (*)(void *, void *))(arr[i].reduce_comb);
2520 if (!arr[i].flags.lazy_priv) {
2521 void *pr_data = arr[i].reduce_priv;
2522 size_t size = arr[i].reduce_size;
2523 for (int j = 0; j < nth; ++j) {
2524 void *priv_data = (char *)pr_data + j * size;
2525 f_comb(sh_data, priv_data); // combine results
2526 if (f_fini)
2527 f_fini(priv_data); // finalize if needed
2528 }
2529 } else {
2530 void **pr_data = (void **)(arr[i].reduce_priv);
2531 for (int j = 0; j < nth; ++j) {
2532 if (pr_data[j] != NULL) {
2533 f_comb(sh_data, pr_data[j]); // combine results
2534 if (f_fini)
2535 f_fini(pr_data[j]); // finalize if needed
2536 __kmp_free(pr_data[j]);
2537 }
2538 }
2539 }
2540 __kmp_free(arr[i].reduce_priv);
2541 }
2543 tg->reduce_data = NULL;
2544 tg->reduce_num_data = 0;
2545}
2546
2547// Cleanup task reduction data for parallel or worksharing;
2548// do not touch task-private data that other threads are still working with.
2549// Called from __kmpc_end_taskgroup()
2552 tg->reduce_data = NULL;
2553 tg->reduce_num_data = 0;
2554}
2555
2556template <typename T>
2558 int num, T *data) {
2560 kmp_info_t *thr = __kmp_threads[gtid];
2561 kmp_int32 nth = thr->th.th_team_nproc;
2562 __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2563 if (nth == 1) {
2564 KA_TRACE(10,
2565 ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2566 gtid, thr->th.th_current_task->td_taskgroup));
2567 return (void *)thr->th.th_current_task->td_taskgroup;
2568 }
2569 kmp_team_t *team = thr->th.th_team;
2570 void *reduce_data;
2571 kmp_taskgroup_t *tg;
2572 reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2573 if (reduce_data == NULL &&
2574 __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2575 (void *)1)) {
2576 // single thread enters this block to initialize common reduction data
2577 KMP_DEBUG_ASSERT(reduce_data == NULL);
2578 // first initialize own data, then make a copy other threads can use
2579 tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2580 reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2581 KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2582 // fini counters should be 0 at this point
2583 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2584 KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2585 KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2586 } else {
2587 while (
2588 (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2589 (void *)1) { // wait for task reduction initialization
2590 KMP_CPU_PAUSE();
2591 }
2592 KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2593 tg = thr->th.th_current_task->td_taskgroup;
2594 __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2595 }
2596 return tg;
2597}
2598
2599/*!
2600@ingroup TASKING
2601@param loc Source location info
2602@param gtid Global thread ID
2603@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise
2604@param num Number of data items to reduce
2605@param data Array of data for reduction
2606@return The taskgroup identifier
2607
2608Initialize task reduction for a parallel or worksharing.
2609
2610Note: this entry assumes the optional compiler-generated initializer routine
2611has a single parameter, a pointer to the object to be initialized. That means
2612the reduction either does not use the omp_orig object, or omp_orig is
2613accessible without help of the runtime library.
2614*/
2616 int num, void *data) {
2617 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2619}
2620
2621/*!
2622@ingroup TASKING
2623@param loc Source location info
2624@param gtid Global thread ID
2625@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise
2626@param num Number of data items to reduce
2627@param data Array of data for reduction
2628@return The taskgroup identifier
2629
2630Initialize task reduction for a parallel or worksharing.
2631
2632Note: this entry assumes the optional compiler-generated initializer routine
2633has two parameters: a pointer to the object to be initialized and a pointer to omp_orig.
2634*/
2635void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2636 void *data) {
2637 return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2639}
2640
2641/*!
2642@ingroup TASKING
2643@param loc Source location info
2644@param gtid Global thread ID
2645@param is_ws Is 1 if the reduction is for worksharing, 0 otherwise
2646
2647Finalize task reduction for a parallel or worksharing.
2648*/
2651}
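// Illustrative sketch (not part of the runtime): approximate lowering of
// "#pragma omp parallel reduction(task, +: sum)". Each thread calls the modifier
// init (is_ws = 0 for a parallel region, 1 for a worksharing construct) and the
// matching fini before the region's barrier; the input setup reuses the
// hypothetical helper from the kmp_taskred_input_t sketch above.
#if 0
static void example_parallel_task_reduction(ident_t *loc, int *sum) {
  int gtid = __kmpc_global_thread_num(loc);
  kmp_taskred_input_t in;
  example_fill_taskred_input(&in, sum); // hypothetical helper sketched earlier
  void *tg = __kmpc_taskred_modifier_init(loc, gtid, /*is_ws=*/0, 1, &in);
  (void)tg;
  // ... spawn tasks that call __kmpc_task_reduction_get_th_data(gtid, tg, sum) ...
  __kmpc_reduction_modifier_fini(loc, gtid, /*is_ws=*/0);
}
#endif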
2652
2653// __kmpc_taskgroup: Start a new taskgroup
2654void __kmpc_taskgroup(ident_t *loc, int gtid) {
2656 kmp_info_t *thread = __kmp_threads[gtid];
2657 kmp_taskdata_t *taskdata = thread->th.th_current_task;
2658 kmp_taskgroup_t *tg_new =
2660 KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2661 KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2663 tg_new->parent = taskdata->td_taskgroup;
2664 tg_new->reduce_data = NULL;
2665 tg_new->reduce_num_data = 0;
2666 tg_new->gomp_data = NULL;
2667 taskdata->td_taskgroup = tg_new;
2668
2669#if OMPT_SUPPORT && OMPT_OPTIONAL
2670 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2671 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2672 if (!codeptr)
2673 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2674 kmp_team_t *team = thread->th.th_team;
2675 ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2676 // FIXME: I think this is wrong for lwt!
2677 ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2678
2679 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2680 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2681 &(my_task_data), codeptr);
2682 }
2683#endif
2684}
2685
2686// __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2687// and its descendants are complete
2690 kmp_info_t *thread = __kmp_threads[gtid];
2691 kmp_taskdata_t *taskdata = thread->th.th_current_task;
2692 kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2693 int thread_finished = FALSE;
2694
2695#if OMPT_SUPPORT && OMPT_OPTIONAL
2696 kmp_team_t *team;
2697 ompt_data_t my_task_data;
2698 ompt_data_t my_parallel_data;
2699 void *codeptr = nullptr;
2701 team = thread->th.th_team;
2702 my_task_data = taskdata->ompt_task_info.task_data;
2703 // FIXME: I think this is wrong for lwt!
2704 my_parallel_data = team->t.ompt_team_info.parallel_data;
2705 codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2706 if (!codeptr)
2707 codeptr = OMPT_GET_RETURN_ADDRESS(0);
2708 }
2709#endif
2710
2711 KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2712 KMP_DEBUG_ASSERT(taskgroup != NULL);
2713 KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2714
2716  // mark the task as waiting (not suspended on a barrier)
2717 taskdata->td_taskwait_counter += 1;
2718 taskdata->td_taskwait_ident = loc;
2719 taskdata->td_taskwait_thread = gtid + 1;
2720#if USE_ITT_BUILD
2721 // For ITT the taskgroup wait is similar to taskwait until we need to
2722 // distinguish them
2723 void *itt_sync_obj = NULL;
2724#if USE_ITT_NOTIFY
2725 KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2726#endif /* USE_ITT_NOTIFY */
2727#endif /* USE_ITT_BUILD */
2728
2729#if OMPT_SUPPORT && OMPT_OPTIONAL
2730 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2731 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2732 ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2733 &(my_task_data), codeptr);
2734 }
2735#endif
2736
2737#if ENABLE_LIBOMPTARGET
2738 // Give an opportunity to the offload runtime to make progress and create
2739 // any necessary proxy tasks
2740 if (UNLIKELY(kmp_target_sync_cb))
2741 (*kmp_target_sync_cb)(loc, gtid, KMP_TASKDATA_TO_TASK(taskdata), NULL);
2742#endif // ENABLE_LIBOMPTARGET
2743
2744 if (!taskdata->td_flags.team_serial ||
2745 (thread->th.th_task_team != NULL &&
2746 (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
2747 thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2749 RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2750 while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2751 flag.execute_tasks(thread, gtid, FALSE,
2752 &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2754 }
2755 }
2756 taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2757
2758#if OMPT_SUPPORT && OMPT_OPTIONAL
2759 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2760 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2761 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2762 &(my_task_data), codeptr);
2763 }
2764#endif
2765
2766#if USE_ITT_BUILD
2767 KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2768 KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2769#endif /* USE_ITT_BUILD */
2770 }
2771 KMP_DEBUG_ASSERT(taskgroup->count == 0);
2772
2773 if (taskgroup->reduce_data != NULL &&
2774 !taskgroup->gomp_data) { // need to reduce?
2775 int cnt;
2776 void *reduce_data;
2777 kmp_team_t *t = thread->th.th_team;
2779    // check if the <priv> data of the first reduction variable is shared for the team
2780 void *priv0 = arr[0].reduce_priv;
2781 if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2782 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2783 // finishing task reduction on parallel
2784 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2785 if (cnt == thread->th.th_team_nproc - 1) {
2786 // we are the last thread passing __kmpc_reduction_modifier_fini()
2787 // finalize task reduction:
2788 __kmp_task_reduction_fini(thread, taskgroup);
2789 // cleanup fields in the team structure:
2790 // TODO: is relaxed store enough here (whole barrier should follow)?
2791 __kmp_thread_free(thread, reduce_data);
2792 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2793 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2794 } else {
2795 // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2796 // so do not finalize reduction, just clean own copy of the data
2797 __kmp_task_reduction_clean(thread, taskgroup);
2798 }
2799 } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2800 NULL &&
2801 ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2802 // finishing task reduction on worksharing
2803 cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2804 if (cnt == thread->th.th_team_nproc - 1) {
2805 // we are the last thread passing __kmpc_reduction_modifier_fini()
2806 __kmp_task_reduction_fini(thread, taskgroup);
2807 // cleanup fields in team structure:
2808 // TODO: is relaxed store enough here (whole barrier should follow)?
2809 __kmp_thread_free(thread, reduce_data);
2810 KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2811 KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2812 } else {
2813 // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2814 // so do not finalize reduction, just clean own copy of the data
2815 __kmp_task_reduction_clean(thread, taskgroup);
2816 }
2817 } else {
2818 // finishing task reduction on taskgroup
2819 __kmp_task_reduction_fini(thread, taskgroup);
2820 }
2821 }
2822 // Restore parent taskgroup for the current task
2823 taskdata->td_taskgroup = taskgroup->parent;
2824 __kmp_thread_free(thread, taskgroup);
2825
2826 KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2827 gtid, taskdata));
2828
2829#if OMPT_SUPPORT && OMPT_OPTIONAL
2830 if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2831 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2832 ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2833 &(my_task_data), codeptr);
2834 }
2835#endif
2836}
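// Illustrative sketch (not part of the runtime): approximate lowering of
// "#pragma omp taskgroup task_reduction(+: sum)". The taskgroup is opened first,
// the reduction is registered with __kmpc_taskred_init, and __kmpc_end_taskgroup
// above finalizes the reduction once all descendant tasks have completed.
#if 0
static void example_taskgroup_reduction(ident_t *loc, int *sum) {
  int gtid = __kmpc_global_thread_num(loc);
  __kmpc_taskgroup(loc, gtid);
  kmp_taskred_input_t in;
  example_fill_taskred_input(&in, sum); // hypothetical helper sketched earlier
  void *tg = __kmpc_taskred_init(gtid, 1, &in);
  (void)tg;
  // ... spawn tasks; each obtains its private copy via
  //     __kmpc_task_reduction_get_th_data(gtid, tg, sum) ...
  __kmpc_end_taskgroup(loc, gtid); // waits, then combines private copies into *sum
}
#endif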
2837
2839 kmp_task_team_t *task_team,
2840 kmp_int32 is_constrained) {
2841 kmp_task_t *task = NULL;
2842 kmp_taskdata_t *taskdata;
2843 kmp_taskdata_t *current;
2844 kmp_thread_data_t *thread_data;
2845 int ntasks = task_team->tt.tt_num_task_pri;
2846 if (ntasks == 0) {
2847 KA_TRACE(
2848 20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
2849 return NULL;
2850 }
2851 do {
2852    // decrement num_tasks to "reserve" one task for this thread to execute
2853 if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
2854 ntasks - 1))
2855 break;
2856 ntasks = task_team->tt.tt_num_task_pri;
2857 } while (ntasks > 0);
2858 if (ntasks == 0) {
2859 KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
2860 __kmp_get_gtid()));
2861 return NULL;
2862 }
2863 // We got a "ticket" to get a "reserved" priority task
2864 int deque_ntasks;
2865 kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
2866 do {
2867 KMP_ASSERT(list != NULL);
2868 thread_data = &list->td;
2869 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2870 deque_ntasks = thread_data->td.td_deque_ntasks;
2871 if (deque_ntasks == 0) {
2872 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2873 KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
2874 __kmp_get_gtid(), thread_data));
2875 list = list->next;
2876 }
2877 } while (deque_ntasks == 0);
2878 KMP_DEBUG_ASSERT(deque_ntasks);
2879 int target = thread_data->td.td_deque_head;
2880 current = __kmp_threads[gtid]->th.th_current_task;
2881 taskdata = thread_data->td.td_deque[target];
2882 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2883 // Bump head pointer and Wrap.
2884 thread_data->td.td_deque_head =
2885 (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2886 } else {
2887 if (!task_team->tt.tt_untied_task_encountered) {
2888      // The TSC does not allow stealing the victim task
2889 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2890 KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
2891 "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
2892 gtid, thread_data, task_team, deque_ntasks, target,
2893 thread_data->td.td_deque_tail));
2894 task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2895 return NULL;
2896 }
2897 int i;
2898 // walk through the deque trying to steal any task
2899 taskdata = NULL;
2900 for (i = 1; i < deque_ntasks; ++i) {
2901 target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2902 taskdata = thread_data->td.td_deque[target];
2903 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2904 break; // found task to execute
2905 } else {
2906 taskdata = NULL;
2907 }
2908 }
2909 if (taskdata == NULL) {
2910 // No appropriate candidate found to execute
2911 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2912 KA_TRACE(
2913 10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
2914 "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
2915 gtid, thread_data, task_team, deque_ntasks,
2916 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2917 task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2918 return NULL;
2919 }
2920 int prev = target;
2921 for (i = i + 1; i < deque_ntasks; ++i) {
2922 // shift remaining tasks in the deque left by 1
2923 target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2924 thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
2925 prev = target;
2926 }
2928 thread_data->td.td_deque_tail ==
2929 (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
2930    thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)
2931 }
2932 thread_data->td.td_deque_ntasks = deque_ntasks - 1;
2933 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2934 task = KMP_TASKDATA_TO_TASK(taskdata);
2935 return task;
2936}
2937
2938// __kmp_remove_my_task: remove a task from my own deque
2940 kmp_task_team_t *task_team,
2941 kmp_int32 is_constrained) {
2943 kmp_taskdata_t *taskdata;
2944 kmp_thread_data_t *thread_data;
2946
2948 KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2949 NULL); // Caller should check this condition
2950
2951 thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2952
2953 KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2954 gtid, thread_data->td.td_deque_ntasks,
2955 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2956
2957 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2958 KA_TRACE(10,
2959 ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2960 "ntasks=%d head=%u tail=%u\n",
2961 gtid, thread_data->td.td_deque_ntasks,
2962 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2963 return NULL;
2964 }
2965
2966 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2967
2968 if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2969 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2970 KA_TRACE(10,
2971 ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2972 "ntasks=%d head=%u tail=%u\n",
2973 gtid, thread_data->td.td_deque_ntasks,
2974 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2975 return NULL;
2976 }
2977
2978 tail = (thread_data->td.td_deque_tail - 1) &
2979 TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2980 taskdata = thread_data->td.td_deque[tail];
2981
2982 if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2983 thread->th.th_current_task)) {
2984    // The TSC does not allow stealing the victim task
2985 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2986 KA_TRACE(10,
2987 ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2988 "ntasks=%d head=%u tail=%u\n",
2989 gtid, thread_data->td.td_deque_ntasks,
2990 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2991 return NULL;
2992 }
2993
2994 thread_data->td.td_deque_tail = tail;
2995 TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2996
2997 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2998
2999 KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
3000 "ntasks=%d head=%u tail=%u\n",
3001 gtid, taskdata, thread_data->td.td_deque_ntasks,
3002 thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3003
3004 task = KMP_TASKDATA_TO_TASK(taskdata);
3005 return task;
3006}
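// Worked example (illustrative): the head/tail updates above wrap with a bitwise
// AND because the deque size is kept a power of two, so TASK_DEQUE_MASK(td) is
// size - 1. For example, with size 256 the mask is 0xFF: decrementing tail 0
// yields (0 - 1) & 0xFF = 255, and bumping head 255 yields (255 + 1) & 0xFF = 0.
#if 0
static kmp_uint32 example_wrap_decrement(kmp_uint32 tail, kmp_uint32 size_pow2) {
  return (tail - 1) & (size_pow2 - 1); // same arithmetic as the tail update above
}
#endif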
3007
3008// __kmp_steal_task: remove a task from another thread's deque
3009// Assume that the calling thread has already checked the existence of
3010// task_team thread_data before calling this routine.
3012 kmp_task_team_t *task_team,
3013 std::atomic<kmp_int32> *unfinished_threads,
3014 int *thread_finished,
3015 kmp_int32 is_constrained) {
3017 kmp_taskdata_t *taskdata;
3018 kmp_taskdata_t *current;
3019 kmp_thread_data_t *victim_td, *threads_data;
3021 kmp_info_t *victim_thr;
3022
3024
3025 threads_data = task_team->tt.tt_threads_data;
3026 KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3027 KMP_DEBUG_ASSERT(victim_tid >= 0);
3028 KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_max_threads);
3029
3030 victim_td = &threads_data[victim_tid];
3031 victim_thr = victim_td->td.td_thr;
3032 (void)victim_thr; // Use in TRACE messages which aren't always enabled.
3033
3034 KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3035 "task_team=%p ntasks=%d head=%u tail=%u\n",
3036 gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3037 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3038 victim_td->td.td_deque_tail));
3039
3040 if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
3041 KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3042 "task_team=%p ntasks=%d head=%u tail=%u\n",
3043 gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3044 victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3045 victim_td->td.td_deque_tail));
3046 return NULL;
3047 }
3048
3049 __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
3050
3051 int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3052 // Check again after we acquire the lock
3053 if (ntasks == 0) {
3054 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3055 KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3056 "task_team=%p ntasks=%d head=%u tail=%u\n",
3057 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3058 victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3059 return NULL;
3060 }
3061
3062 KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3063 current = __kmp_threads[gtid]->th.th_current_task;
3064 taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3065 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3066 // Bump head pointer and Wrap.
3067 victim_td->td.td_deque_head =
3068 (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
3069 } else {
3070 if (!task_team->tt.tt_untied_task_encountered) {
3071      // The TSC does not allow stealing the victim task
3072 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3073 KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3074 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3075 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3076 victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3077 return NULL;
3078 }
3079 int i;
3080 // walk through victim's deque trying to steal any task
3081 target = victim_td->td.td_deque_head;
3082 taskdata = NULL;
3083 for (i = 1; i < ntasks; ++i) {
3084 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3085 taskdata = victim_td->td.td_deque[target];
3086 if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3087 break; // found victim task
3088 } else {
3089 taskdata = NULL;
3090 }
3091 }
3092 if (taskdata == NULL) {
3093 // No appropriate candidate to steal found
3094 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3095 KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3096 "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3097 gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3098 victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3099 return NULL;
3100 }
3101 int prev = target;
3102 for (i = i + 1; i < ntasks; ++i) {
3103 // shift remaining tasks in the deque left by 1
3104 target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3105 victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3106 prev = target;
3107 }
3109 victim_td->td.td_deque_tail ==
3110 (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
3111    victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
3112 }
3113 if (*thread_finished) {
3114 // We need to un-mark this victim as a finished victim. This must be done
3115 // before releasing the lock, or else other threads (starting with the
3116 // primary thread victim) might be prematurely released from the barrier!!!
3117#if KMP_DEBUG
3119#endif
3120 KMP_ATOMIC_INC(unfinished_threads);
3121 KA_TRACE(
3122 20,
3123 ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3124 gtid, count + 1, task_team));
3125 *thread_finished = FALSE;
3126 }
3127 TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
3128
3129 __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3130
3131 KMP_COUNT_BLOCK(TASK_stolen);
3132 KA_TRACE(10,
3133 ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3134 "task_team=%p ntasks=%d head=%u tail=%u\n",
3135 gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3136 ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3137
3138 task = KMP_TASKDATA_TO_TASK(taskdata);
3139 return task;
3140}
3141
3142// __kmp_execute_tasks_template: Choose and execute tasks until either the
3143// condition is satisfied (return true) or there are none left (return false).
3144//
3145// final_spin is TRUE if this is the spin at the release barrier.
3146// thread_finished indicates whether the thread is finished executing all
3147// the tasks it has on its deque, and is at the release barrier.
3148// spinner is the location on which to spin.
3149// spinner == NULL means only execute a single task and return.
3150// checker is the value to check to terminate the spin.
3151template <class C>
3153 kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
3154 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3155 kmp_int32 is_constrained) {
3156 kmp_task_team_t *task_team = thread->th.th_task_team;
3157 kmp_thread_data_t *threads_data;
3159 kmp_info_t *other_thread;
3160 kmp_taskdata_t *current_task = thread->th.th_current_task;
3161 std::atomic<kmp_int32> *unfinished_threads;
3162 kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
3163 tid = thread->th.th_info.ds.ds_tid;
3164
3166 KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3167
3168 if (task_team == NULL || current_task == NULL)
3169 return FALSE;
3170
3171 KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3172 "*thread_finished=%d\n",
3173 gtid, final_spin, *thread_finished));
3174
3175 thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3176 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3177
3178 KMP_DEBUG_ASSERT(threads_data != NULL);
3179
3180 nthreads = task_team->tt.tt_nproc;
3181 unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3182 KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
3183
3184 while (1) { // Outer loop keeps trying to find tasks in case of single thread
3185 // getting tasks from target constructs
3186 while (1) { // Inner loop to find a task and execute it
3187#if ENABLE_LIBOMPTARGET
3188 // Give an opportunity to the offload runtime to make progress
3189 if (UNLIKELY(kmp_target_sync_cb))
3190 (*kmp_target_sync_cb)(NULL, gtid, KMP_TASKDATA_TO_TASK(current_task),
3191 NULL);
3192#endif // ENABLE_LIBOMPTARGET
3193
3194 task = NULL;
3195 if (task_team->tt.tt_num_task_pri) { // get priority task first
3196 task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3197 }
3198 if (task == NULL && use_own_tasks) { // check own queue next
3199 task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3200 }
3201 if ((task == NULL) && (nthreads > 1)) { // Steal a task finally
3202 int asleep = 1;
3203 use_own_tasks = 0;
3204 // Try to steal from the last place I stole from successfully.
3205 if (victim_tid == -2) { // haven't stolen anything yet
3206 victim_tid = threads_data[tid].td.td_deque_last_stolen;
3207 if (victim_tid !=
3208 -1) // if we have a last stolen from victim, get the thread
3209 other_thread = threads_data[victim_tid].td.td_thr;
3210 }
3211 if (victim_tid != -1) { // found last victim
3212 asleep = 0;
3213 } else if (!new_victim) { // no recent steals and we haven't already
3214 // used a new victim; select a random thread
3215 do { // Find a different thread to steal work from.
3216 // Pick a random thread. Initial plan was to cycle through all the
3217 // threads, and only return if we tried to steal from every thread,
3218 // and failed. Arch says that's not such a great idea.
3219 victim_tid = __kmp_get_random(thread) % (nthreads - 1);
3220 if (victim_tid >= tid) {
3221 ++victim_tid; // Adjusts random distribution to exclude self
3222 }
3223 // Found a potential victim
3224 other_thread = threads_data[victim_tid].td.td_thr;
3225 // There is a slight chance that __kmp_enable_tasking() did not wake
3226 // up all threads waiting at the barrier. If victim is sleeping,
3227 // then wake it up. Since we were going to pay the cache miss
3228 // penalty for referencing another thread's kmp_info_t struct
3229 // anyway,
3230 // the check shouldn't cost too much performance at this point. In
3231 // extra barrier mode, tasks do not sleep at the separate tasking
3232 // barrier, so this isn't a problem.
3233 asleep = 0;
3236 (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3237 NULL)) {
3238 asleep = 1;
3239 __kmp_null_resume_wrapper(other_thread);
3240 // A sleeping thread should not have any tasks on its queue.
3241 // There is a slight possibility that it resumes, steals a task
3242 // from another thread, which spawns more tasks, all in the time
3243 // that it takes this thread to check => don't write an assertion
3244 // that the victim's queue is empty. Try stealing from a
3245 // different thread.
3246 }
3247 } while (asleep);
3248 }
3249
3250 if (!asleep) {
3251 // We have a victim to try to steal from
3252 task =
3253 __kmp_steal_task(victim_tid, gtid, task_team, unfinished_threads,
3254 thread_finished, is_constrained);
3255 }
3256 if (task != NULL) { // set last stolen to victim
3257 if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3258 threads_data[tid].td.td_deque_last_stolen = victim_tid;
3259 // The pre-refactored code did not try more than 1 successful new
3260 // victim, unless the last one generated more local tasks;
3261 // new_victim keeps track of this
3262 new_victim = 1;
3263 }
3264 } else { // No tasks found; unset last_stolen
3265 KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3266 victim_tid = -2; // no successful victim found
3267 }
3268 }
3269
3270 if (task == NULL)
3271 break; // break out of tasking loop
3272
3273// Found a task; execute it
3274#if USE_ITT_BUILD && USE_ITT_NOTIFY
3275 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3276 if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3277 // get the object reliably
3278 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3279 }
3280 __kmp_itt_task_starting(itt_sync_obj);
3281 }
3282#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3283 __kmp_invoke_task(gtid, task, current_task);
3284#if USE_ITT_BUILD
3285 if (itt_sync_obj != NULL)
3286 __kmp_itt_task_finished(itt_sync_obj);
3287#endif /* USE_ITT_BUILD */
3288 // If this thread is only partway through the barrier and the condition is
3289 // met, then return now, so that the barrier gather/release pattern can
3290 // proceed. If this thread is in the last spin loop in the barrier,
3291 // waiting to be released, we know that the termination condition will not
3292 // be satisfied, so don't waste any cycles checking it.
3293 if (flag == NULL || (!final_spin && flag->done_check())) {
3294 KA_TRACE(
3295 15,
3296 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3297 gtid));
3298 return TRUE;
3299 }
3300 if (thread->th.th_task_team == NULL) {
3301 break;
3302 }
3303 KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3304 // If execution of a stolen task results in more tasks being placed on our
3305 // run queue, reset use_own_tasks
3306 if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3307 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3308 "other tasks, restart\n",
3309 gtid));
3310 use_own_tasks = 1;
3311 new_victim = 0;
3312 }
3313 }
3314
3315 // The task source has been exhausted. If in final spin loop of barrier,
3316 // check if termination condition is satisfied. The work queue may be empty
3317 // but there might be proxy tasks still executing.
3318 if (final_spin &&
3319 KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3320 // First, decrement the #unfinished threads, if that has not already been
3321 // done. This decrement might be to the spin location, and result in the
3322 // termination condition being satisfied.
3323 if (!*thread_finished) {
3324#if KMP_DEBUG
3325 kmp_int32 count = -1 +
3326#endif
3327 KMP_ATOMIC_DEC(unfinished_threads);
3328 KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3329 "unfinished_threads to %d task_team=%p\n",
3330 gtid, count, task_team));
3331 *thread_finished = TRUE;
3332 }
3333
3334 // It is now unsafe to reference thread->th.th_team !!!
3335 // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3336 // thread to pass through the barrier, where it might reset each thread's
3337 // th.th_team field for the next parallel region. If we can steal more
3338 // work, we know that this has not happened yet.
3339 if (flag != NULL && flag->done_check()) {
3340 KA_TRACE(
3341 15,
3342 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3343 gtid));
3344 return TRUE;
3345 }
3346 }
3347
3348 // If this thread's task team is NULL, primary thread has recognized that
3349 // there are no more tasks; bail out
3350 if (thread->th.th_task_team == NULL) {
3351 KA_TRACE(15,
3352 ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3353 return FALSE;
3354 }
3355
3356 // Check the flag again in case it has already been satisfied, to avoid being
3357 // trapped in an infinite loop when an if0 task depends on a hidden helper task
3358 // outside any parallel region. Detached tasks are not impacted in this case
3359 // because the only thread executing this function has to execute the proxy
3360 // task so it is in another code path that has the same check.
3361 if (flag == NULL || (!final_spin && flag->done_check())) {
3362 KA_TRACE(15,
3363 ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3364 gtid));
3365 return TRUE;
3366 }
3367
3368 // We could be getting tasks from target constructs; if this is the only
3369 // thread, keep trying to execute tasks from own queue
3370 if (nthreads == 1 &&
3372 use_own_tasks = 1;
3373 else {
3374 KA_TRACE(15,
3375 ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3376 return FALSE;
3377 }
3378 }
3379}
3380
3381template <bool C, bool S>
3383 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3384 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3385 kmp_int32 is_constrained) {
3387 thread, gtid, flag, final_spin,
3388 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3389}
3390
3391template <bool C, bool S>
3393 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3394 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3395 kmp_int32 is_constrained) {
3397 thread, gtid, flag, final_spin,
3398 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3399}
3400
3401template <bool C, bool S>
3404 int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3405 kmp_int32 is_constrained) {
3407 thread, gtid, flag, final_spin,
3408 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3409}
3410
3412 kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3413 int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3414 kmp_int32 is_constrained) {
3416 thread, gtid, flag, final_spin,
3417 thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3418}
3419
3420template int
3423 int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3424
3427 int,
3428 int *USE_ITT_BUILD_ARG(void *),
3429 kmp_int32);
3430
3433 int,
3434 int *USE_ITT_BUILD_ARG(void *),
3435 kmp_int32);
3436
3439 int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3440
3443 int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3444
3445// __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3446// next barrier so they can assist in executing enqueued tasks.
3447// First thread in allocates the task team atomically.
3448static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3449 kmp_info_t *this_thr) {
3450 kmp_thread_data_t *threads_data;
3451 int nthreads, i, is_init_thread;
3452
3453 KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3454 __kmp_gtid_from_thread(this_thr)));
3455
3456 KMP_DEBUG_ASSERT(task_team != NULL);
3457 KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3458
3459 nthreads = task_team->tt.tt_nproc;
3460 KMP_DEBUG_ASSERT(nthreads > 0);
3461 KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3462
3463 // Allocate or increase the size of threads_data if necessary
3464 is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3465
3466 if (!is_init_thread) {
3467 // Some other thread already set up the array.
3468 KA_TRACE(
3469 20,
3470 ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3471 __kmp_gtid_from_thread(this_thr)));
3472 return;
3473 }
3474 threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3475 KMP_DEBUG_ASSERT(threads_data != NULL);
3476
3479 // Release any threads sleeping at the barrier, so that they can steal
3480 // tasks and execute them. In extra barrier mode, tasks do not sleep
3481 // at the separate tasking barrier, so this isn't a problem.
3482 for (i = 0; i < nthreads; i++) {
3483 void *sleep_loc;
3484 kmp_info_t *thread = threads_data[i].td.td_thr;
3485
3486 if (i == this_thr->th.th_info.ds.ds_tid) {
3487 continue;
3488 }
3489 // Since we haven't locked the thread's suspend mutex lock at this
3490 // point, there is a small window where a thread might be putting
3491 // itself to sleep, but hasn't set the th_sleep_loc field yet.
3492 // To work around this, __kmp_execute_tasks_template() periodically checks
3493 // see if other threads are sleeping (using the same random mechanism that
3494 // is used for task stealing) and awakens them if they are.
3495 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3496 NULL) {
3497 KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3498 __kmp_gtid_from_thread(this_thr),
3499 __kmp_gtid_from_thread(thread)));
3501 } else {
3502 KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3503 __kmp_gtid_from_thread(this_thr),
3504 __kmp_gtid_from_thread(thread)));
3505 }
3506 }
3507 }
3508
3509 KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3510 __kmp_gtid_from_thread(this_thr)));
3511}
3512
3513/* // TODO: Check the comment consistency
3514 * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
3515 * like a shadow of the kmp_team_t data struct, with a different lifetime.
3516 * After a child thread checks into a barrier and calls __kmp_release() from
3517 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3518 * longer assume that the kmp_team_t structure is intact (at any moment, the
3519 * primary thread may exit the barrier code and free the team data structure,
3520 * and return the threads to the thread pool).
3521 *
3522 * This does not work with the tasking code, as the thread is still
3523 * expected to participate in the execution of any tasks that may have been
3524 * spawned by a member of the team, and the thread still needs access to
3525 * each thread in the team, so that it can steal work from it.
3526 *
3527 * Enter the existence of the kmp_task_team_t struct. It employs a reference
3528 * counting mechanism, and is allocated by the primary thread before calling
3529 * __kmp_<barrier_kind>_release, and then is released by the last thread to
3530 * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3531 * of the kmp_task_team_t structs for consecutive barriers can overlap
3532 * (and will, unless the primary thread is the last thread to exit the barrier
3533 * release phase, which is not typical). The existence of such a struct is
3534 * useful outside the context of tasking.
3535 *
3536 * We currently use the existence of the threads array as an indicator that
3537 * tasks were spawned since the last barrier. If the structure is to be
3538 * useful outside the context of tasking, then this will have to change, but
3539 * not setting the field minimizes the performance impact of tasking on
3540 * barriers, when no explicit tasks were spawned (pushed, actually).
3541 */
3542
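/* Illustrative sketch only (not part of the runtime): the declarations below
   form a lock-protected intrusive free list that recycles task-team
   structures (see __kmp_allocate_task_team, __kmp_free_task_team and
   __kmp_reap_task_teams further down). The same pattern in miniature, with
   invented names:

   #include <mutex>
   struct toy_node { toy_node *next = nullptr; }; // stands in for kmp_task_team_t
   static toy_node *toy_free_list = nullptr;      // stands in for __kmp_free_task_teams
   static std::mutex toy_free_list_lock;

   static toy_node *toy_take_or_allocate() {
     {
       std::lock_guard<std::mutex> g(toy_free_list_lock);
       if (toy_node *n = toy_free_list) {          // reuse a recycled structure
         toy_free_list = n->next;
         n->next = nullptr;
         return n;
       }
     }
     return new toy_node();                        // otherwise allocate a fresh one
   }
   static void toy_recycle(toy_node *n) {          // push back for later reuse
     std::lock_guard<std::mutex> g(toy_free_list_lock);
     n->next = toy_free_list;
     toy_free_list = n;
   }
*/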
3543static kmp_task_team_t *__kmp_free_task_teams =
3544 NULL; // Free list for task_team data structures
3545// Lock for task team data structures
3548
3549// __kmp_alloc_task_deque:
3550// Allocates a task deque for a particular thread, and initializes the necessary
3551// data structures relating to the deque. This only happens once per thread
3552// per task team since task teams are recycled. No lock is needed during
3553// allocation since each thread allocates its own deque.
3554static void __kmp_alloc_task_deque(kmp_info_t *thread,
3555 kmp_thread_data_t *thread_data) {
3556 __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3557 KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3558
3559 // Initialize last stolen task field to "none"
3560 thread_data->td.td_deque_last_stolen = -1;
3561
3562 KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3563 KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3564 KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3565
3566 KE_TRACE(
3567 10,
3568 ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3569 __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3570 // Allocate space for task deque, and zero the deque
3571 // Cannot use __kmp_thread_calloc() because threads not around for
3572 // kmp_reap_task_team( ).
3573 thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3575 thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3576}
3577
3578// __kmp_free_task_deque:
3579// Deallocates a task deque for a particular thread. Happens at library
3580// deallocation, so there is no need to reset all thread data fields.
3581static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3582 if (thread_data->td.td_deque != NULL) {
3583 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3584 TCW_4(thread_data->td.td_deque_ntasks, 0);
3585 __kmp_free(thread_data->td.td_deque);
3586 thread_data->td.td_deque = NULL;
3587 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3588 }
3589}
3590
3591// __kmp_realloc_task_threads_data:
3592// Allocates a threads_data array for a task team, either by allocating an
3593// initial array or enlarging an existing array. Only the first thread to get
3594// the lock allocs or enlarges the array and re-initializes the array elements.
3595// That thread returns "TRUE", the rest return "FALSE".
3596// Assumes that the new array size is given by task_team -> tt.tt_nproc.
3597// The current size is given by task_team -> tt.tt_max_threads.
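/* Illustrative sketch only (not part of the runtime): the routine below uses
   the classic check / lock / re-check pattern, so only the first thread to
   take the lock while tt_found_tasks is still clear performs the
   (re)allocation and reports TRUE. The same pattern in miniature, with
   invented names:

   #include <atomic>
   #include <mutex>
   static std::atomic<bool> toy_initialized{false};
   static std::mutex toy_init_lock;

   static bool toy_init_once() {      // returns true only for the initializer
     if (toy_initialized.load(std::memory_order_acquire))
       return false;                  // fast path: someone already did it
     std::lock_guard<std::mutex> g(toy_init_lock);
     if (toy_initialized.load(std::memory_order_relaxed))
       return false;                  // lost the race while waiting for the lock
     // ... allocate or enlarge the per-thread array here ...
     toy_initialized.store(true, std::memory_order_release);
     return true;
   }
*/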
3598static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3599 kmp_task_team_t *task_team) {
3600 kmp_thread_data_t **threads_data_p;
3601 kmp_int32 nthreads, maxthreads;
3602 int is_init_thread = FALSE;
3603
3604 if (TCR_4(task_team->tt.tt_found_tasks)) {
3605 // Already reallocated and initialized.
3606 return FALSE;
3607 }
3608
3609 threads_data_p = &task_team->tt.tt_threads_data;
3610 nthreads = task_team->tt.tt_nproc;
3611 maxthreads = task_team->tt.tt_max_threads;
3612
3613 // All threads must lock when they encounter the first task of the implicit
3614 // task region to make sure threads_data fields are (re)initialized before
3615 // used.
3617
3618 if (!TCR_4(task_team->tt.tt_found_tasks)) {
3619 // first thread to enable tasking
3620 kmp_team_t *team = thread->th.th_team;
3621 int i;
3622
3623 is_init_thread = TRUE;
3624 if (maxthreads < nthreads) {
3625
3626 if (*threads_data_p != NULL) {
3627 kmp_thread_data_t *old_data = *threads_data_p;
3628 kmp_thread_data_t *new_data = NULL;
3629
3630 KE_TRACE(
3631 10,
3632 ("__kmp_realloc_task_threads_data: T#%d reallocating "
3633 "threads data for task_team %p, new_size = %d, old_size = %d\n",
3634 __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3635 // Reallocate threads_data to have more elements than current array
3636 // Cannot use __kmp_thread_realloc() because threads not around for
3637 // kmp_reap_task_team( ). Note all new array entries are initialized
3638 // to zero by __kmp_allocate().
3639 new_data = (kmp_thread_data_t *)__kmp_allocate(
3640 nthreads * sizeof(kmp_thread_data_t));
3641 // copy old data to new data
3642 KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3643 (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3644
3645 // Install the new data and free the old data
3646 (*threads_data_p) = new_data;
3647 __kmp_free(old_data);
3648 } else {
3649 KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3650 "threads data for task_team %p, size = %d\n",
3651 __kmp_gtid_from_thread(thread), task_team, nthreads));
3652 // Make the initial allocate for threads_data array, and zero entries
3653 // Cannot use __kmp_thread_calloc() because threads not around for
3654 // kmp_reap_task_team( ).
3655 *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3656 nthreads * sizeof(kmp_thread_data_t));
3657 }
3658 task_team->tt.tt_max_threads = nthreads;
3659 } else {
3660 // If array has (more than) enough elements, go ahead and use it
3661 KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3662 }
3663
3664 // initialize threads_data pointers back to thread_info structures
3665 for (i = 0; i < nthreads; i++) {
3666 kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3667 thread_data->td.td_thr = team->t.t_threads[i];
3668
3669 if (thread_data->td.td_deque_last_stolen >= nthreads) {
3670 // The last stolen field survives across teams / barrier, and the number
3671 // of threads may have changed. It's possible (likely?) that a new
3672 // parallel region will exhibit the same behavior as the previous region.
3673 thread_data->td.td_deque_last_stolen = -1;
3674 }
3675 }
3676
3677 KMP_MB();
3678 TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3679 }
3680
3682 return is_init_thread;
3683}
3684
3685// __kmp_free_task_threads_data:
3686// Deallocates a threads_data array for a task team, including any attached
3687// tasking deques. Only occurs at library shutdown.
3690 if (task_team->tt.tt_threads_data != NULL) {
3691 int i;
3692 for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3694 }
3695 __kmp_free(task_team->tt.tt_threads_data);
3696 task_team->tt.tt_threads_data = NULL;
3697 }
3699}
3700
3701// __kmp_free_task_pri_list:
3702// Deallocates tasking deques used for priority tasks.
3703// Only occurs at library shutdown.
3706 if (task_team->tt.tt_task_pri_list != NULL) {
3707 kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3708 while (list != NULL) {
3709 kmp_task_pri_t *next = list->next;
3710 __kmp_free_task_deque(&list->td);
3711 __kmp_free(list);
3712 list = next;
3713 }
3714 task_team->tt.tt_task_pri_list = NULL;
3715 }
3717}
3718
3719static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
3720 kmp_team_t *team) {
3721 int team_nth = team->t.t_nproc;
3722 // Only need to init if task team isn't active or team size changed
3723 if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) {
3724 TCW_4(task_team->tt.tt_found_tasks, FALSE);
3725 TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3727 TCW_4(task_team->tt.tt_nproc, team_nth);
3728 KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
3729 TCW_4(task_team->tt.tt_active, TRUE);
3730 }
3731}
3732
3733// __kmp_allocate_task_team:
3734// Allocates a task team associated with a specific team, taking it from
3735// the global task team free list if possible. Also initializes data
3736// structures.
3738 kmp_team_t *team) {
3739 kmp_task_team_t *task_team = NULL;
3740
3741 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3742 (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3743
3744 if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3745 // Take a task team from the task team pool
3747 if (__kmp_free_task_teams != NULL) {
3748 task_team = __kmp_free_task_teams;
3750 task_team->tt.tt_next = NULL;
3751 }
3753 }
3754
3755 if (task_team == NULL) {
3756 KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3757 "task team for team %p\n",
3758 __kmp_gtid_from_thread(thread), team));
3759 // Allocate a new task team if one is not available. Cannot use
3760 // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3761 task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3764#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3765 // suppress race conditions detection on synchronization flags in debug mode
3766 // this helps to analyze library internals eliminating false positives
3767 __itt_suppress_mark_range(
3768 __itt_suppress_range, __itt_suppress_threading_errors,
3769 &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3770 __itt_suppress_mark_range(__itt_suppress_range,
3771 __itt_suppress_threading_errors,
3772 CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3773 sizeof(task_team->tt.tt_active));
3774#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3775 // Note: __kmp_allocate zeroes returned memory, otherwise we would need:
3776 // task_team->tt.tt_threads_data = NULL;
3777 // task_team->tt.tt_max_threads = 0;
3778 // task_team->tt.tt_next = NULL;
3779 }
3780
3781 __kmp_task_team_init(task_team, team);
3782
3783 KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3784 "unfinished_threads init'd to %d\n",
3785 (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3787 return task_team;
3788}
3789
3790// __kmp_free_task_team:
3791// Frees the task team associated with a specific thread, and adds it
3792// to the global task team free list.
3794 KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3795 thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3796
3797 // Put task team back on free list
3799
3800 KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3801 task_team->tt.tt_next = __kmp_free_task_teams;
3802 TCW_PTR(__kmp_free_task_teams, task_team);
3803
3805}
3806
3807// __kmp_reap_task_teams:
3808// Free all the task teams on the task team free list.
3809// Should only be done during library shutdown.
3810// Cannot do anything that needs a thread structure or gtid since they are
3811// already gone.
3813 kmp_task_team_t *task_team;
3814
3815 if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3816 // Free all task_teams on the free list
3818 while ((task_team = __kmp_free_task_teams) != NULL) {
3819 __kmp_free_task_teams = task_team->tt.tt_next;
3820 task_team->tt.tt_next = NULL;
3821
3822 // Free threads_data if necessary
3823 if (task_team->tt.tt_threads_data != NULL) {
3825 }
3826 if (task_team->tt.tt_task_pri_list != NULL) {
3827 __kmp_free_task_pri_list(task_team);
3828 }
3829 __kmp_free(task_team);
3830 }
3832 }
3833}
3834
3835// View the array of two task team pointers as a pair of pointers:
3836// 1) a single task_team pointer
3837// 2) next pointer for stack
3838// Serial teams can create a stack of task teams for nested serial teams.
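/* Illustrative sketch only (not part of the runtime): the push and pop
   routines below reinterpret the two-element t_task_team array as a
   {task_team, next} list node, so nested serial regions can stack task teams
   without extra storage in kmp_team_t. The same trick in miniature, with
   invented names:

   #include <cstdlib>
   struct toy_pair_node { void *task_team; toy_pair_node *next; };

   static void toy_push(void *task_team_pair[2]) {
     toy_pair_node *current = reinterpret_cast<toy_pair_node *>(task_team_pair);
     toy_pair_node *saved =
         static_cast<toy_pair_node *>(std::malloc(sizeof(toy_pair_node)));
     *saved = *current;              // remember the enclosing level
     current->task_team = nullptr;   // the nested level starts with no task team
     current->next = saved;
   }
   static void toy_pop(void *task_team_pair[2]) {
     toy_pair_node *current = reinterpret_cast<toy_pair_node *>(task_team_pair);
     if (toy_pair_node *saved = current->next) {
       *current = *saved;            // restore the enclosing level
       std::free(saved);
     }
   }
*/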
3840 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3841 kmp_task_team_list_t *current =
3842 (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
3843 kmp_task_team_list_t *node =
3845 node->task_team = current->task_team;
3846 node->next = current->next;
3847 thread->th.th_task_team = current->task_team = NULL;
3848 current->next = node;
3849}
3850
3851// Serial team pops a task team off the stack
3853 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3854 kmp_task_team_list_t *current =
3855 (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
3856 if (current->task_team) {
3857 __kmp_free_task_team(thread, current->task_team);
3858 }
3859 kmp_task_team_list_t *next = current->next;
3860 if (next) {
3861 current->task_team = next->task_team;
3862 current->next = next->next;
3863 KMP_DEBUG_ASSERT(next != current);
3864 __kmp_free(next);
3865 thread->th.th_task_team = current->task_team;
3866 }
3867}
3868
3869// __kmp_wait_to_unref_task_teams:
3870// Some threads could still be in the fork barrier release code, possibly
3871// trying to steal tasks. Wait for each thread to unreference its task team.
3873 kmp_info_t *thread;
3874 kmp_uint32 spins;
3875 kmp_uint64 time;
3876 int done;
3877
3878 KMP_INIT_YIELD(spins);
3879 KMP_INIT_BACKOFF(time);
3880
3881 for (;;) {
3882 done = TRUE;
3883
3884 // TODO: GEH - this may be wrong because some sync would be necessary
3885 // in case threads are added to the pool during the traversal. Need to
3886 // verify that lock for thread pool is held when calling this routine.
3887 for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3888 thread = thread->th.th_next_pool) {
3889#if KMP_OS_WINDOWS
3890 DWORD exit_val;
3891#endif
3892 if (TCR_PTR(thread->th.th_task_team) == NULL) {
3893 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3894 __kmp_gtid_from_thread(thread)));
3895 continue;
3896 }
3897#if KMP_OS_WINDOWS
3898 // TODO: GEH - add this check for Linux* OS / OS X* as well?
3899 if (!__kmp_is_thread_alive(thread, &exit_val)) {
3900 thread->th.th_task_team = NULL;
3901 continue;
3902 }
3903#endif
3904
3905 done = FALSE; // Because th_task_team pointer is not NULL for this thread
3906
3907 KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3908 "unreference task_team\n",
3909 __kmp_gtid_from_thread(thread)));
3910
3912 void *sleep_loc;
3913 // If the thread is sleeping, awaken it.
3914 if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3915 NULL) {
3916 KA_TRACE(
3917 10,
3918 ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3921 }
3922 }
3923 }
3924 if (done) {
3925 break;
3926 }
3927
3928 // If oversubscribed or have waited a bit, yield.
3929 KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3930 }
3931}
3932
3933// __kmp_task_team_setup: Create a task_team for the current team, but use
3934// an already created, unused one if it already exists.
3937
3938 // For the serial and root teams, setup the first task team pointer to point
3939 // to task team. The other pointer is a stack of task teams from previous
3940 // serial levels.
3941 if (team == this_thr->th.th_serial_team ||
3942 team == this_thr->th.th_root->r.r_root_team) {
3943 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3944 if (team->t.t_task_team[0] == NULL) {
3945 team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
3946 KA_TRACE(
3947 20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3948 " for serial/root team %p\n",
3949 __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
3950
3951 } else
3952 __kmp_task_team_init(team->t.t_task_team[0], team);
3953 return;
3954 }
3955
3956 // If this task_team hasn't been created yet, allocate it. It will be used in
3957 // the region after the next.
3958 // If it exists, it is the current task team and shouldn't be touched yet as
3959 // it may still be in use.
3960 if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
3961 team->t.t_task_team[this_thr->th.th_task_state] =
3962 __kmp_allocate_task_team(this_thr, team);
3963 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3964 " for team %d at parity=%d\n",
3965 __kmp_gtid_from_thread(this_thr),
3966 team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
3967 this_thr->th.th_task_state));
3968 }
3969
3970 // After threads exit the release, they will call sync, and then point to this
3971 // other task_team; make sure it is allocated and properly initialized. As
3972 // threads spin in the barrier release phase, they will continue to use the
3973 // previous task_team struct(above), until they receive the signal to stop
3974 // checking for tasks (they can't safely reference the kmp_team_t struct,
3975 // which could be reallocated by the primary thread).
3976 int other_team = 1 - this_thr->th.th_task_state;
3977 KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
3978 if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3979 team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team);
3980 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
3981 "task_team %p for team %d at parity=%d\n",
3982 __kmp_gtid_from_thread(this_thr),
3983 team->t.t_task_team[other_team], team->t.t_id, other_team));
3984 } else { // Leave the old task team struct in place for the upcoming region;
3985 // adjust as needed
3986 kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3987 __kmp_task_team_init(task_team, team);
3988 // if team size has changed, the first thread to enable tasking will
3989 // realloc threads_data if necessary
3990 KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
3991 "%p for team %d at parity=%d\n",
3992 __kmp_gtid_from_thread(this_thr),
3993 team->t.t_task_team[other_team], team->t.t_id, other_team));
3994 }
3995
3996 // For a regular thread, task enabling should be called when the task is going
3997 // to be pushed to a deque. However, for the hidden helper thread, we need
3998 // it ahead of time so that some operations can be performed without race
3999 // condition.
4000 if (this_thr == __kmp_hidden_helper_main_thread) {
4001 for (int i = 0; i < 2; ++i) {
4002 kmp_task_team_t *task_team = team->t.t_task_team[i];
4003 if (KMP_TASKING_ENABLED(task_team)) {
4004 continue;
4005 }
4006 __kmp_enable_tasking(task_team, this_thr);
4007 for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
4008 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
4009 if (thread_data->td.td_deque == NULL) {
4011 }
4012 }
4013 }
4014 }
4015}
4016
4017// __kmp_task_team_sync: Propagation of task team data from team to threads
4018// which happens just after the release phase of a team barrier. This may be
4019// called by any thread. This is not called for serial or root teams.
4022 KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
4023 KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
4024
4025 // Toggle the th_task_state field, to switch which task_team this thread
4026 // refers to
4027 this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
4028
4029 // It is now safe to propagate the task team pointer from the team struct to
4030 // the current thread.
4031 TCW_PTR(this_thr->th.th_task_team,
4032 team->t.t_task_team[this_thr->th.th_task_state]);
4033 KA_TRACE(20,
4034 ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
4035 "%p from Team #%d (parity=%d)\n",
4036 __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
4037 team->t.t_id, this_thr->th.th_task_state));
4038}
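/* Illustrative sketch only (not part of the runtime): each team keeps two
   task-team slots (t_task_team[0] and t_task_team[1]) and every thread
   records in th_task_state which slot it is currently using; flipping that
   parity at each barrier lets threads drain the old task team while the next
   region already has a fresh one. The toggle in miniature, with invented
   names:

   struct toy_thread { unsigned task_state; void *task_team; };

   static void toy_task_team_sync(toy_thread *thr, void *team_task_team[2]) {
     thr->task_state = 1u - thr->task_state;           // flip parity 0 <-> 1
     thr->task_team = team_task_team[thr->task_state]; // adopt the other slot
   }
*/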
4039
4040// __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
4041// barrier gather phase. Only called by the primary thread.
4042//
4043// wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
4044// by passing in 0 optionally as the last argument. When wait is zero, primary
4045// thread does not wait for unfinished_threads to reach 0.
4047 kmp_info_t *this_thr,
4048 kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
4049 kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
4050
4052 KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4053
4054 if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4055 if (wait) {
4056 KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4057 "(for unfinished_threads to reach 0) on task_team = %p\n",
4058 __kmp_gtid_from_thread(this_thr), task_team));
4059 // Worker threads may have dropped through to release phase, but could
4060 // still be executing tasks. Wait here for tasks to complete. To avoid
4061 // memory contention, only primary thread checks termination condition.
4063 RCAST(std::atomic<kmp_uint32> *,
4064 &task_team->tt.tt_unfinished_threads),
4065 0U);
4066 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4067 }
4068 // Deactivate the old task team, so that the worker threads will stop
4069 // referencing it while spinning.
4070 KA_TRACE(
4071 20,
4072 ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4073 "setting active to false, setting local and team's pointer to NULL\n",
4074 __kmp_gtid_from_thread(this_thr), task_team));
4078 TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4079 KMP_MB();
4080
4081 TCW_PTR(this_thr->th.th_task_team, NULL);
4082 }
4083}
4084
4085// __kmp_tasking_barrier:
4086// This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4087// Internal function to execute all tasks prior to a regular barrier or a join
4088// barrier. It is a full barrier itself, which unfortunately turns regular
4089// barriers into double barriers and join barriers into 1 1/2 barriers.
4090void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
4091 std::atomic<kmp_uint32> *spin = RCAST(
4092 std::atomic<kmp_uint32> *,
4093 &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4094 int flag = FALSE;
4096
4097#if USE_ITT_BUILD
4098 KMP_FSYNC_SPIN_INIT(spin, NULL);
4099#endif /* USE_ITT_BUILD */
4100 kmp_flag_32<false, false> spin_flag(spin, 0U);
4101 while (!spin_flag.execute_tasks(thread, gtid, TRUE,
4102 &flag USE_ITT_BUILD_ARG(NULL), 0)) {
4103#if USE_ITT_BUILD
4104 // TODO: What about itt_sync_obj??
4105 KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4106#endif /* USE_ITT_BUILD */
4107
4108 if (TCR_4(__kmp_global.g.g_done)) {
4109 if (__kmp_global.g.g_abort)
4111 break;
4112 }
4113 KMP_YIELD(TRUE);
4114 }
4115#if USE_ITT_BUILD
4116 KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4117#endif /* USE_ITT_BUILD */
4118}
4119
4120// __kmp_give_task puts a task into a given thread queue if:
4121// - the queue for that thread was created
4122// - there's space in that queue
4123// Because of this, __kmp_push_task needs to check if there's space after
4124// getting the lock
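/* Illustrative sketch only (not part of the runtime): the pass argument below
   acts as a back-off threshold. A target deque is skipped while it has
   already grown to pass times INITIAL_TASK_DEQUE_SIZE, and the placement loop
   further down (the one that does pass <<= 1 after a full pass through the
   team's threads) keeps relaxing that threshold, so some thread eventually
   accepts or grows its deque. The same loop in miniature, with invented
   names:

   static bool toy_give(int deque_size, int initial_size, int pass) {
     // mirrors: TASK_DEQUE_SIZE(td) / INITIAL_TASK_DEQUE_SIZE >= pass
     return deque_size / initial_size < pass;
   }

   static int toy_place(const int *deque_sizes, int nthreads, int start_k,
                        int initial_size) {
     int pass = 1, k = start_k;
     for (;;) {
       int victim = k;
       k = (k + 1) % nthreads;
       if (k == start_k)
         pass <<= 1;                 // full sweep done: relax the threshold
       if (toy_give(deque_sizes[victim], initial_size, pass))
         return victim;              // this thread takes the task
     }
   }
*/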
4126 kmp_int32 pass) {
4128 kmp_task_team_t *task_team = taskdata->td_task_team;
4129
4130 KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4131 taskdata, tid));
4132
4133 // If task_team is NULL something went really bad...
4134 KMP_DEBUG_ASSERT(task_team != NULL);
4135
4136 bool result = false;
4137 kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4138
4139 if (thread_data->td.td_deque == NULL) {
4140 // There's no queue in this thread, go find another one
4141 // We're guaranteed that at least one thread has a queue
4142 KA_TRACE(30,
4143 ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4144 tid, taskdata));
4145 return result;
4146 }
4147
4148 if (TCR_4(thread_data->td.td_deque_ntasks) >=
4149 TASK_DEQUE_SIZE(thread_data->td)) {
4150 KA_TRACE(
4151 30,
4152 ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4153 taskdata, tid));
4154
4155 // if this deque is bigger than the pass ratio give a chance to another
4156 // thread
4157 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4158 return result;
4159
4160 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4161 if (TCR_4(thread_data->td.td_deque_ntasks) >=
4162 TASK_DEQUE_SIZE(thread_data->td)) {
4163 // expand deque to push the task which is not allowed to execute
4164 __kmp_realloc_task_deque(thread, thread_data);
4165 }
4166
4167 } else {
4168
4169 __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4170
4171 if (TCR_4(thread_data->td.td_deque_ntasks) >=
4172 TASK_DEQUE_SIZE(thread_data->td)) {
4173 KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
4174 "thread %d.\n",
4175 taskdata, tid));
4176
4177 // if this deque is bigger than the pass ratio give a chance to another
4178 // thread
4179 if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4180 goto release_and_exit;
4181
4182 __kmp_realloc_task_deque(thread, thread_data);
4183 }
4184 }
4185
4186 // lock is held here, and there is space in the deque
4187
4188 thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4189 // Wrap index.
4190 thread_data->td.td_deque_tail =
4191 (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
4192 TCW_4(thread_data->td.td_deque_ntasks,
4193 TCR_4(thread_data->td.td_deque_ntasks) + 1);
4194
4195 result = true;
4196 KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4197 taskdata, tid));
4198
4199release_and_exit:
4200 __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
4201
4202 return result;
4203}
4204
4205#define PROXY_TASK_FLAG 0x40000000
4206/* The finish of the proxy tasks is divided into two pieces:
4207 - the top half is the one that can be done from a thread outside the team
4208 - the bottom half must be run from a thread within the team
4209
4210 In order to run the bottom half the task gets queued back into one of the
4211 threads of the team. Once the td_incomplete_child_tasks counter of the parent
4212 is decremented the threads can leave the barriers. So, the bottom half needs
4213 to be queued before the counter is decremented. The top half is therefore
4214 divided in two parts:
4215 - things that can be run before queuing the bottom half
4216 - things that must be run after queuing the bottom half
4217
4218 This creates a second race as the bottom half can free the task before the
4219 second top half is executed. To avoid this we use the
4220 td_incomplete_child_tasks counter of the proxy task to synchronize the top and bottom
4221 half. */
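/* Illustrative sketch only (not part of the runtime): the handshake described
   above works by setting an extra bit (PROXY_TASK_FLAG) in the proxy task's
   child counter during the first top half and clearing it in the second top
   half; the bottom half spins on that bit before it releases dependences and
   frees the task. In miniature:

   #include <atomic>
   #include <cstdint>
   static std::atomic<int32_t> toy_children{0};
   static const int32_t kToyProxyFlag = 0x40000000; // same value as PROXY_TASK_FLAG

   static void toy_first_top_half()  { toy_children.fetch_or(kToyProxyFlag); }
   static void toy_second_top_half() { toy_children.fetch_and(~kToyProxyFlag); }
   static void toy_bottom_half() {
     // wait for the second top half to finish its bookkeeping
     while (toy_children.load(std::memory_order_acquire) & kToyProxyFlag) {
     }
     // ... now it is safe to release dependences and free the task ...
   }
*/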
4225 KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
4226 KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
4227
4228 taskdata->td_flags.complete = 1; // mark the task as completed
4229#if OMPX_TASKGRAPH
4230 taskdata->td_flags.onced = 1;
4231#endif
4232
4233 if (taskdata->td_taskgroup)
4234 KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4235
4236 // Create an imaginary child for this task so the bottom half cannot
4237 // release the task before we have completed the second top half
4239}
4240
4242#if KMP_DEBUG
4243 kmp_int32 children = 0;
4244 // Predecrement simulated by "- 1" calculation
4245 children = -1 +
4246#endif
4248 KMP_DEBUG_ASSERT(children >= 0);
4249
4250 // Remove the imaginary child
4252}
4253
4256 kmp_info_t *thread = __kmp_threads[gtid];
4257
4260 1); // top half must run before bottom half
4261
4262 // We need to wait to make sure the top half is finished
4263 // Spinning here should be ok as this should happen quickly
4264 while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4265 PROXY_TASK_FLAG) > 0)
4266 ;
4267
4268 __kmp_release_deps(gtid, taskdata);
4269 __kmp_free_task_and_ancestors(gtid, taskdata, thread);
4270}
4271
4272/*!
4273@ingroup TASKING
4274@param gtid Global Thread ID of encountering thread
4275@param ptask Task which execution is completed
4276
4277Execute the completion of a proxy task from a thread that is part of the
4278team. Run the top and bottom halves directly.
4279*/
4281 KMP_DEBUG_ASSERT(ptask != NULL);
4283 KA_TRACE(
4284 10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4285 gtid, taskdata));
4288
4292
4293 KA_TRACE(10,
4294 ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4295 gtid, taskdata));
4296}
4297
4299 KMP_DEBUG_ASSERT(ptask != NULL);
4301
4302 // Enqueue task to complete bottom half completion from a thread within the
4303 // corresponding team
4304 kmp_team_t *team = taskdata->td_team;
4305 kmp_int32 nthreads = team->t.t_nproc;
4306 kmp_info_t *thread;
4307
4308 // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4309 // but we cannot use __kmp_get_random here
4310 kmp_int32 start_k = start % nthreads;
4311 kmp_int32 pass = 1;
4312 kmp_int32 k = start_k;
4313
4314 do {
4315 // For now we're just linearly trying to find a thread
4316 thread = team->t.t_threads[k];
4317 k = (k + 1) % nthreads;
4318
4319 // we did a full pass through all the threads
4320 if (k == start_k)
4321 pass = pass << 1;
4322
4323 } while (!__kmp_give_task(thread, k, ptask, pass));
4324
4326 // wake at least one thread to execute the given task
4327 for (int i = 0; i < nthreads; ++i) {
4328 thread = team->t.t_threads[i];
4329 if (thread->th.th_sleep_loc != NULL) {
4331 break;
4332 }
4333 }
4334 }
4335}
4336
4337/*!
4338@ingroup TASKING
4339@param ptask Task which execution is completed
4340
4341Execute the completion of a proxy task from a thread that may not belong to
4342the team.
4343*/
4345 KMP_DEBUG_ASSERT(ptask != NULL);
4347
4348 KA_TRACE(
4349 10,
4350 ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4351 taskdata));
4352
4354
4356
4358
4360
4361 KA_TRACE(
4362 10,
4363 ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4364 taskdata));
4365}
4366
4368 kmp_task_t *task) {
4374 }
4375 return &td->td_allow_completion_event;
4376}
4377
4379 if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4380 kmp_task_t *ptask = event->ed.task;
4382 bool detached = false;
4383 int gtid = __kmp_get_gtid();
4384
4385 // The associated task might have completed or could be completing at this
4386 // point.
4387 // We need to take the lock to avoid races
4388 __kmp_acquire_tas_lock(&event->lock, gtid);
4389 if (taskdata->td_flags.proxy == TASK_PROXY) {
4390 detached = true;
4391 } else {
4392#if OMPT_SUPPORT
4393 // The OMPT event must occur under mutual exclusion,
4394 // otherwise the tool might access ptask after free
4396 __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4397#endif
4398 }
4400 __kmp_release_tas_lock(&event->lock, gtid);
4401
4402 if (detached) {
4403#if OMPT_SUPPORT
4404 // We free ptask afterwards and know the task is finished,
4405 // so locking is not necessary
4407 __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4408#endif
4409 // If the task detached complete the proxy task
4410 if (gtid >= 0) {
4411 kmp_team_t *team = taskdata->td_team;
4412 kmp_info_t *thread = __kmp_get_thread();
4413 if (thread->th.th_team == team) {
4415 return;
4416 }
4417 }
4418
4419 // fallback
4421 }
4422 }
4423}
4424
4425// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4426// for taskloop
4427//
4428// thread: allocating thread
4429// task_src: pointer to source task to be duplicated
4430// taskloop_recur: used only when dealing with taskgraph,
4431// indicating whether we need to update task->td_task_id
4432// returns: a pointer to the allocated kmp_task_t structure (task).
4434#if OMPX_TASKGRAPH
4435 , int taskloop_recur
4436#endif
4437) {
4439 kmp_taskdata_t *taskdata;
4440 kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4441 kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4442 size_t shareds_offset;
4443 size_t task_size;
4444
4445 KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4446 task_src));
4447 KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4448 TASK_FULL); // it should not be proxy task
4450 task_size = taskdata_src->td_size_alloc;
4451
4452 // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4453 KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4454 task_size));
4455#if USE_FAST_MEMORY
4456 taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4457#else
4458 taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4459#endif /* USE_FAST_MEMORY */
4460 KMP_MEMCPY(taskdata, taskdata_src, task_size);
4461
4462 task = KMP_TASKDATA_TO_TASK(taskdata);
4463
4464 // Initialize new task (only specific fields not affected by memcpy)
4465#if OMPX_TASKGRAPH
4466 if (taskdata->is_taskgraph && !taskloop_recur &&
4467 __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
4468 taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
4469#endif
4470 taskdata->td_task_id = KMP_GEN_TASK_ID();
4471 if (task->shareds != NULL) { // need to set up the shareds pointer
4472 shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4473 task->shareds = &((char *)taskdata)[shareds_offset];
4474 KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4475 0);
4476 }
4477 taskdata->td_alloc_thread = thread;
4478 taskdata->td_parent = parent_task;
4479 // task inherits the taskgroup from the parent task
4480 taskdata->td_taskgroup = parent_task->td_taskgroup;
4481 // tied task needs to initialize the td_last_tied at creation,
4482 // untied one does this when it is scheduled for execution
4483 if (taskdata->td_flags.tiedness == TASK_TIED)
4484 taskdata->td_last_tied = taskdata;
4485
4486 // Only need to keep track of child task counts if team parallel and tasking
4487 // not serialized
4488 if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4490 if (parent_task->td_taskgroup)
4491 KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4492 // Only need to keep track of allocated child tasks for explicit tasks since
4493 // implicit tasks are not deallocated
4494 if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4496 }
4497
4498 KA_TRACE(20,
4499 ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4500 thread, taskdata, taskdata->td_parent));
4501#if OMPT_SUPPORT
4503 __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4504#endif
4505 return task;
4506}
4507
4508// Routine optionally generated by the compiler for setting the lastprivate flag
4509// and calling needed constructors for private/firstprivate objects
4510// (used to form taskloop tasks from pattern task)
4511// Parameters: dest task, src task, lastprivate flag.
4513
4514KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4515
4516// class to encapsulate manipulating loop bounds in a taskloop task.
4517// this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4518// the loop bound variables.
4521 const kmp_taskdata_t *taskdata;
4522 size_t lower_offset;
4523 size_t upper_offset;
4524
4525public:
4527 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4528 lower_offset((char *)lb - (char *)task),
4529 upper_offset((char *)ub - (char *)task) {
4530 KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4531 KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4532 }
4534 : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4535 lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4536 size_t get_lower_offset() const { return lower_offset; }
4537 size_t get_upper_offset() const { return upper_offset; }
4539 kmp_int64 retval;
4540#if defined(KMP_GOMP_COMPAT)
4541 // Intel task just returns the lower bound normally
4542 if (!taskdata->td_flags.native) {
4543 retval = *(kmp_int64 *)((char *)task + lower_offset);
4544 } else {
4545 // GOMP task has to take into account the sizeof(long)
4546 if (taskdata->td_size_loop_bounds == 4) {
4548 retval = (kmp_int64)*lb;
4549 } else {
4551 retval = (kmp_int64)*lb;
4552 }
4553 }
4554#else
4555 (void)taskdata;
4556 retval = *(kmp_int64 *)((char *)task + lower_offset);
4557#endif // defined(KMP_GOMP_COMPAT)
4558 return retval;
4559 }
4561 kmp_int64 retval;
4562#if defined(KMP_GOMP_COMPAT)
4563 // Intel task just returns the upper bound normally
4564 if (!taskdata->td_flags.native) {
4565 retval = *(kmp_int64 *)((char *)task + upper_offset);
4566 } else {
4567 // GOMP task has to take into account the sizeof(long)
4568 if (taskdata->td_size_loop_bounds == 4) {
4569 kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4570 retval = (kmp_int64)*ub;
4571 } else {
4572 kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4573 retval = (kmp_int64)*ub;
4574 }
4575 }
4576#else
4577 retval = *(kmp_int64 *)((char *)task + upper_offset);
4578#endif // defined(KMP_GOMP_COMPAT)
4579 return retval;
4580 }
4582#if defined(KMP_GOMP_COMPAT)
4583 // Intel task just sets the lower bound normally
4584 if (!taskdata->td_flags.native) {
4585 *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4586 } else {
4587 // GOMP task has to take into account the sizeof(long)
4588 if (taskdata->td_size_loop_bounds == 4) {
4589 kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4590 *lower = (kmp_uint32)lb;
4591 } else {
4592 kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4593 *lower = (kmp_uint64)lb;
4594 }
4595 }
4596#else
4597 *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4598#endif // defined(KMP_GOMP_COMPAT)
4599 }
4601#if defined(KMP_GOMP_COMPAT)
4602 // Intel task just sets the upper bound normally
4603 if (!taskdata->td_flags.native) {
4604 *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4605 } else {
4606 // GOMP task has to take into account the sizeof(long)
4607 if (taskdata->td_size_loop_bounds == 4) {
4608 kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4609 *upper = (kmp_uint32)ub;
4610 } else {
4611 kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4612 *upper = (kmp_uint64)ub;
4613 }
4614 }
4615#else
4616 *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4617#endif // defined(KMP_GOMP_COMPAT)
4618 }
4619};
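/* Illustrative usage sketch only (not part of the runtime): this is how the
   class above is typically driven when carving one chunk out of the pattern
   task (compare __kmp_taskloop_linear below). chunk_lower and chunk_upper are
   hypothetical values computed by the caller:

   static void toy_set_chunk_bounds(kmp_task_t *pattern, kmp_uint64 *lb,
                                    kmp_uint64 *ub, kmp_task_t *chunk_task,
                                    kmp_uint64 chunk_lower,
                                    kmp_uint64 chunk_upper) {
     kmp_taskloop_bounds_t pattern_bounds(pattern, lb, ub);
     kmp_taskloop_bounds_t chunk_bounds(chunk_task, pattern_bounds);
     chunk_bounds.set_lb(chunk_lower);  // same offsets, applied to the copy
     chunk_bounds.set_ub(chunk_upper);
   }
*/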
4620
4621// __kmp_taskloop_linear: Start tasks of the taskloop linearly
4622//
4623// loc Source location information
4624// gtid Global thread ID
4625// task Pattern task, exposes the loop iteration range
4626// lb Pointer to loop lower bound in task structure
4627// ub Pointer to loop upper bound in task structure
4628// st Loop stride
4629// ub_glob Global upper bound (used for lastprivate check)
4630// num_tasks Number of tasks to execute
4631// grainsize Number of loop iterations per task
4632// extras Number of chunks with grainsize+1 iterations
4633// last_chunk Reduction of grainsize for last task
4634// tc Iterations count
4635// task_dup Tasks duplication routine
4636// codeptr_ra Return address for OMPT events
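/* Illustrative numeric example (not part of the runtime): for a trip count
   tc = 10 split with grainsize = 3 and no last_chunk reduction, the caller
   passes num_tasks = 3 and extras = 1, so the first task runs 4 iterations
   and the other two run 3 each, and the assertion below holds as
     tc == num_tasks * grainsize + extras   ->   10 == 3 * 3 + 1.
   With a negative last_chunk (a shorter final task), e.g. num_tasks = 4,
   grainsize = 3, last_chunk = -2, it holds as
     tc == num_tasks * grainsize + last_chunk   ->   10 == 4 * 3 - 2. */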
4638 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4639 kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4640 kmp_uint64 grainsize, kmp_uint64 extras,
4641 kmp_int64 last_chunk, kmp_uint64 tc,
4642#if OMPT_SUPPORT
4643 void *codeptr_ra,
4644#endif
4645 void *task_dup) {
4646 KMP_COUNT_BLOCK(OMP_TASKLOOP);
4647 KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4648 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4649 // compiler provides global bounds here
4650 kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4651 kmp_uint64 lower = task_bounds.get_lb();
4652 kmp_uint64 upper = task_bounds.get_ub();
4653 kmp_uint64 i;
4654 kmp_info_t *thread = __kmp_threads[gtid];
4655 kmp_taskdata_t *current_task = thread->th.th_current_task;
4656 kmp_task_t *next_task;
4657 kmp_int32 lastpriv = 0;
4658
4659 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4660 (last_chunk < 0 ? last_chunk : extras));
4661 KMP_DEBUG_ASSERT(num_tasks > extras);
4662 KMP_DEBUG_ASSERT(num_tasks > 0);
4663 KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4664 "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4665 gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4666 ub_glob, st, task_dup));
4667
4668 // Launch num_tasks tasks, assigning grainsize iterations to each task
4669 for (i = 0; i < num_tasks; ++i) {
4670 kmp_uint64 chunk_minus_1;
4671 if (extras == 0) {
4672 chunk_minus_1 = grainsize - 1;
4673 } else {
4674 chunk_minus_1 = grainsize;
4675 --extras; // the first 'extras' tasks get the bigger chunk (grainsize+1)
4676 }
4677 upper = lower + st * chunk_minus_1;
4678 if (upper > *ub) {
4679 upper = *ub;
4680 }
4681 if (i == num_tasks - 1) {
4682 // schedule the last task, set lastprivate flag if needed
4683 if (st == 1) { // most common case
4684 KMP_DEBUG_ASSERT(upper == *ub);
4685 if (upper == ub_glob)
4686 lastpriv = 1;
4687 } else if (st > 0) { // positive loop stride
4688 KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4689 if ((kmp_uint64)st > ub_glob - upper)
4690 lastpriv = 1;
4691 } else { // negative loop stride
4692 KMP_DEBUG_ASSERT(upper + st < *ub);
4693 if (upper - ub_glob < (kmp_uint64)(-st))
4694 lastpriv = 1;
4695 }
4696 }
4697
4698#if OMPX_TASKGRAPH
4699 next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
4700#else
4701 next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4702#endif
4703
4704 kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4705 kmp_taskloop_bounds_t next_task_bounds =
4706 kmp_taskloop_bounds_t(next_task, task_bounds);
4707
4708 // adjust task-specific bounds
4709 next_task_bounds.set_lb(lower);
4710 if (next_taskdata->td_flags.native) {
4711 next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4712 } else {
4713 next_task_bounds.set_ub(upper);
4714 }
4715 if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4716 // etc.
4717 ptask_dup(next_task, task, lastpriv);
4718 KA_TRACE(40,
4719 ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4720 "upper %lld stride %lld, (offsets %p %p)\n",
4721 gtid, i, next_task, lower, upper, st,
4722 next_task_bounds.get_lower_offset(),
4723 next_task_bounds.get_upper_offset()));
4724#if OMPT_SUPPORT
4725 __kmp_omp_taskloop_task(NULL, gtid, next_task,
4726 codeptr_ra); // schedule new task
4727#if OMPT_OPTIONAL
4728 if (ompt_enabled.ompt_callback_dispatch) {
4729 OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
4730 lower, upper, st);
4731 }
4732#endif // OMPT_OPTIONAL
4733#else
4734 __kmp_omp_task(gtid, next_task, true); // schedule new task
4735#endif
4736 lower = upper + st; // adjust lower bound for the next iteration
4737 }
4738 // free the pattern task and exit
4739 __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
4740 // do not execute the pattern task, just do internal bookkeeping
4741 __kmp_task_finish<false>(gtid, task, current_task);
4742}
4743
4744// Structure to keep taskloop parameters for auxiliary task
4745// kept in the shareds of the task structure.
4746typedef struct __taskloop_params {
4759#if OMPT_SUPPORT
4760 void *codeptr_ra;
4761#endif
4763
4767 kmp_uint64,
4768#if OMPT_SUPPORT
4769 void *,
4770#endif
4771 void *);
4772
4773// Execute part of the taskloop submitted as a task.
4774int __kmp_taskloop_task(int gtid, void *ptask) {
4776 (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4777 kmp_task_t *task = p->task;
4778 kmp_uint64 *lb = p->lb;
4779 kmp_uint64 *ub = p->ub;
4780 void *task_dup = p->task_dup;
4781 // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4782 kmp_int64 st = p->st;
4783 kmp_uint64 ub_glob = p->ub_glob;
4784 kmp_uint64 num_tasks = p->num_tasks;
4785 kmp_uint64 grainsize = p->grainsize;
4786 kmp_uint64 extras = p->extras;
4787 kmp_int64 last_chunk = p->last_chunk;
4788 kmp_uint64 tc = p->tc;
4789 kmp_uint64 num_t_min = p->num_t_min;
4790#if OMPT_SUPPORT
4791 void *codeptr_ra = p->codeptr_ra;
4792#endif
4793#if KMP_DEBUG
4795 KMP_DEBUG_ASSERT(task != NULL);
4796 KA_TRACE(20,
4797 ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4798 " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4799 gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4800 st, task_dup));
4801#endif
4802 KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4803 if (num_tasks > num_t_min)
4804 __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4805 grainsize, extras, last_chunk, tc, num_t_min,
4806#if OMPT_SUPPORT
4807 codeptr_ra,
4808#endif
4809 task_dup);
4810 else
4811 __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4812 grainsize, extras, last_chunk, tc,
4813#if OMPT_SUPPORT
4814 codeptr_ra,
4815#endif
4816 task_dup);
4817
4818 KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4819 return 0;
4820}
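// Note: __kmp_taskloop_task is the entry routine of the auxiliary task
// created in __kmp_taskloop_recur below; its parameters arrive through the
// __taskloop_params_t block kept in the shareds of that auxiliary task, as
// unpacked above.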
4821
4822// Schedule part of the taskloop as a task,
4823// execute the rest of the taskloop.
4824//
4825// loc Source location information
4826// gtid Global thread ID
4827// task Pattern task, exposes the loop iteration range
4828// lb Pointer to loop lower bound in task structure
4829// ub Pointer to loop upper bound in task structure
4830// st Loop stride
4831// ub_glob Global upper bound (used for lastprivate check)
4832// num_tasks Number of tasks to execute
4833// grainsize Number of loop iterations per task
4834// extras Number of chunks with grainsize+1 iterations
4835// last_chunk Reduction of grainsize for last task
4836// tc Iterations count
4837// num_t_min Threshold to launch tasks recursively
4838// task_dup Tasks duplication routine
4839// codeptr_ra Return address for OMPT events
4840 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4841 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4842 kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4843 kmp_uint64 grainsize, kmp_uint64 extras,
4844 kmp_int64 last_chunk, kmp_uint64 tc,
4845 kmp_uint64 num_t_min,
4846#if OMPT_SUPPORT
4847 void *codeptr_ra,
4848#endif
4849 void *task_dup) {
4850 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4851 KMP_DEBUG_ASSERT(task != NULL);
4852 KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4853 KA_TRACE(20,
4854 ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4855 " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4856 gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4857 st, task_dup));
4858 p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4859 kmp_uint64 lower = *lb;
4860 kmp_info_t *thread = __kmp_threads[gtid];
4861 // kmp_taskdata_t *current_task = thread->th.th_current_task;
4862 kmp_task_t *next_task;
4863 size_t lower_offset =
4864 (char *)lb - (char *)task; // remember offset of lb in the task structure
4865 size_t upper_offset =
4866 (char *)ub - (char *)task; // remember offset of ub in the task structure
4867
4868 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4869 (last_chunk < 0 ? last_chunk : extras));
4870 KMP_DEBUG_ASSERT(num_tasks > extras);
4871 KMP_DEBUG_ASSERT(num_tasks > 0);
4872
4873 // split the loop in two halves
4874 kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4875 kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
4876 kmp_uint64 gr_size0 = grainsize;
4877 kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
4878 kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
4879 if (last_chunk < 0) {
4880 ext0 = ext1 = 0;
4881 last_chunk1 = last_chunk;
4882 tc0 = grainsize * n_tsk0;
4883 tc1 = tc - tc0;
4884 } else if (n_tsk0 <= extras) {
4885 gr_size0++; // integrate extras into grainsize
4886 ext0 = 0; // no extra iters in 1st half
4887 ext1 = extras - n_tsk0; // remaining extras
4888 tc0 = gr_size0 * n_tsk0;
4889 tc1 = tc - tc0;
4890 } else { // n_tsk0 > extras
4891 ext1 = 0; // no extra iters in 2nd half
4892 ext0 = extras;
4893 tc1 = grainsize * n_tsk1;
4894 tc0 = tc - tc1;
4895 }
4896 ub0 = lower + st * (tc0 - 1);
4897 lb1 = ub0 + st;
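// Worked example of the split above (illustrative numbers): with
// num_tasks = 7, grainsize = 10, extras = 3 (so tc = 73), lower = 0 and
// st = 1, we get n_tsk0 = 3 and n_tsk1 = 4. Since n_tsk0 <= extras,
// gr_size0 becomes 11, ext0 = ext1 = 0, tc0 = 33, tc1 = 40, ub0 = 32 and
// lb1 = 33: iterations 0..32 stay with this thread, iterations 33..72 go to
// the auxiliary task created below.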
4898
4899 // create pattern task for 2nd half of the loop
4900#if OMPX_TASKGRAPH
4901 next_task = __kmp_task_dup_alloc(thread, task,
4902 /* taskloop_recur */ 1);
4903#else
4904 next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
4905#endif
4906 // adjust lower bound (upper bound is not changed) for the 2nd half
4907 *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4908 if (ptask_dup != NULL) // construct firstprivates, etc.
4909 ptask_dup(next_task, task, 0);
4910 *ub = ub0; // adjust upper bound for the 1st half
4911
4912 // create auxiliary task for 2nd half of the loop
4913 // make sure new task has same parent task as the pattern task
4914 kmp_taskdata_t *current_task = thread->th.th_current_task;
4915 thread->th.th_current_task = taskdata->td_parent;
4916 kmp_task_t *new_task =
4917 __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4918 sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4919 // restore current task
4920 thread->th.th_current_task = current_task;
4921 __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4922 p->task = next_task;
4923 p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4924 p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4925 p->task_dup = task_dup;
4926 p->st = st;
4927 p->ub_glob = ub_glob;
4928 p->num_tasks = n_tsk1;
4929 p->grainsize = grainsize;
4930 p->extras = ext1;
4931 p->last_chunk = last_chunk1;
4932 p->tc = tc1;
4933 p->num_t_min = num_t_min;
4934#if OMPT_SUPPORT
4935 p->codeptr_ra = codeptr_ra;
4936#endif
4937
4938#if OMPX_TASKGRAPH
4939 kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
4940 new_task_data->tdg = taskdata->tdg;
4941 new_task_data->is_taskgraph = 0;
4942#endif
4943
4944#if OMPT_SUPPORT
4945 // schedule new task with correct return address for OMPT events
4946 __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4947#else
4948 __kmp_omp_task(gtid, new_task, true); // schedule new task
4949#endif
4950
4951 // execute the 1st half of current subrange
4952 if (n_tsk0 > num_t_min)
4953 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4954 ext0, last_chunk0, tc0, num_t_min,
4955#if OMPT_SUPPORT
4956 codeptr_ra,
4957#endif
4958 task_dup);
4959 else
4960 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4961 gr_size0, ext0, last_chunk0, tc0,
4962#if OMPT_SUPPORT
4963 codeptr_ra,
4964#endif
4965 task_dup);
4966
4967 KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
4968}
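// Note: each level of __kmp_taskloop_recur keeps the lower half of the
// chunks for the current thread and hands the upper half to an auxiliary
// task running __kmp_taskloop_task, so the direct recursion bottoms out in
// __kmp_taskloop_linear after roughly log2(num_tasks / num_t_min) levels.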
4969
4970static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4971 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4972 int nogroup, int sched, kmp_uint64 grainsize,
4973 int modifier, void *task_dup) {
4974 kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4975 KMP_DEBUG_ASSERT(task != NULL);
4976 if (nogroup == 0) {
4977#if OMPT_SUPPORT && OMPT_OPTIONAL
4978 OMPT_STORE_RETURN_ADDRESS(gtid);
4979#endif
4980 __kmpc_taskgroup(loc, gtid);
4981 }
4982
4983#if OMPX_TASKGRAPH
4984 KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
4985#endif
4986 // =========================================================================
4987 // calculate loop parameters
4988 kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4989 kmp_uint64 tc;
4990 // compiler provides global bounds here
4991 kmp_uint64 lower = task_bounds.get_lb();
4992 kmp_uint64 upper = task_bounds.get_ub();
4993 kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
4994 kmp_uint64 num_tasks = 0, extras = 0;
4995 kmp_int64 last_chunk =
4996 0; // reduce grainsize of last task by last_chunk in strict mode
4997 kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4998 kmp_info_t *thread = __kmp_threads[gtid];
4999 kmp_taskdata_t *current_task = thread->th.th_current_task;
5000
5001 KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
5002 "grain %llu(%d, %d), dup %p\n",
5003 gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
5004 task_dup));
5005
5006 // compute trip count
5007 if (st == 1) { // most common case
5008 tc = upper - lower + 1;
5009 } else if (st < 0) {
5010 tc = (lower - upper) / (-st) + 1;
5011 } else { // st > 0
5012 tc = (upper - lower) / st + 1;
5013 }
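// Worked examples of the trip count above (illustrative numbers):
// st == 1, lb = 0, ub = 9 -> tc = 9 - 0 + 1 = 10
// st == 2, lb = 0, ub = 9 -> tc = (9 - 0) / 2 + 1 = 5 (i = 0,2,4,6,8)
// st == -3, lb = 10, ub = 1 -> tc = (10 - 1) / 3 + 1 = 4 (i = 10,7,4,1)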
5014 if (tc == 0) {
5015 KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
5016 // free the pattern task and exit
5017 __kmp_task_start(gtid, task, current_task);
5018 // do not execute anything for zero-trip loop
5019 __kmp_task_finish<false>(gtid, task, current_task);
5020 return;
5021 }
5022
5023#if OMPT_SUPPORT && OMPT_OPTIONAL
5024 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
5025 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
5026 if (ompt_enabled.ompt_callback_work) {
5027 ompt_callbacks.ompt_callback(ompt_callback_work)(
5028 ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
5029 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5030 }
5031#endif
5032
5033 if (num_tasks_min == 0)
5034 // TODO: can we choose better default heuristic?
5035 num_tasks_min =
5036 KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
5037
5038 // compute num_tasks/grainsize based on the input provided
5039 switch (sched) {
5040 case 0: // no schedule clause specified, we can choose the default
5041 // let's try to schedule (team_size*10) tasks
5042 grainsize = thread->th.th_team_nproc * static_cast<kmp_uint64>(10);
5043 KMP_FALLTHROUGH();
5044 case 2: // num_tasks provided
5045 if (grainsize > tc) {
5046 num_tasks = tc; // too big num_tasks requested, adjust values
5047 grainsize = 1;
5048 extras = 0;
5049 } else {
5050 num_tasks = grainsize;
5051 grainsize = tc / num_tasks;
5052 extras = tc % num_tasks;
5053 }
5054 break;
5055 case 1: // grainsize provided
5056 if (grainsize > tc) {
5057 num_tasks = 1;
5058 grainsize = tc; // too big grainsize requested, adjust values
5059 extras = 0;
5060 } else {
5061 if (modifier) {
5062 num_tasks = (tc + grainsize - 1) / grainsize;
5063 last_chunk = tc - (num_tasks * grainsize);
5064 extras = 0;
5065 } else {
5066 num_tasks = tc / grainsize;
5067 // adjust grainsize for balanced distribution of iterations
5068 grainsize = tc / num_tasks;
5069 extras = tc % num_tasks;
5070 }
5071 }
5072 break;
5073 default:
5074 KMP_ASSERT2(0, "unknown scheduling of taskloop");
5075 }
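// Worked examples of the schedule computation above, assuming tc = 73
// (illustrative numbers):
// - grainsize(10): num_tasks = 73 / 10 = 7, then grainsize is rebalanced to
//   73 / 7 = 10 with extras = 73 % 7 = 3, i.e. three chunks of 11 iterations
//   and four chunks of 10.
// - grainsize(10) with the strict modifier: num_tasks = (73 + 9) / 10 = 8,
//   last_chunk = 73 - 80 = -7, so the last task runs only 3 iterations.
// - num_tasks(4): grainsize = 73 / 4 = 18 with extras = 73 % 4 = 1.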
5076
5077 KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5078 (last_chunk < 0 ? last_chunk : extras));
5079 KMP_DEBUG_ASSERT(num_tasks > extras);
5080 KMP_DEBUG_ASSERT(num_tasks > 0);
5081 // =========================================================================
5082
5083 // check the if-clause value first
5084 // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
5085 if (if_val == 0) { // if(0) specified, mark task as serial
5086 taskdata->td_flags.task_serial = 1;
5087 taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
5088 // always start serial tasks linearly
5089 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5090 grainsize, extras, last_chunk, tc,
5091#if OMPT_SUPPORT
5092 OMPT_GET_RETURN_ADDRESS(0),
5093#endif
5094 task_dup);
5095 // !taskdata->td_flags.native => currently force linear spawning of tasks
5096 // for GOMP_taskloop
5097 } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
5098 KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
5099 "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5100 gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5101 last_chunk));
5102 __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5103 grainsize, extras, last_chunk, tc, num_tasks_min,
5104#if OMPT_SUPPORT
5105 OMPT_GET_RETURN_ADDRESS(0),
5106#endif
5107 task_dup);
5108 } else {
5109 KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
5110 "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5111 gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5112 last_chunk));
5113 __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5114 grainsize, extras, last_chunk, tc,
5115#if OMPT_SUPPORT
5116 OMPT_GET_RETURN_ADDRESS(0),
5117#endif
5118 task_dup);
5119 }
5120
5121#if OMPT_SUPPORT && OMPT_OPTIONAL
5122 if (ompt_enabled.ompt_callback_work) {
5123 ompt_callbacks.ompt_callback(ompt_callback_work)(
5124 ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
5125 &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5126 }
5127#endif
5128
5129 if (nogroup == 0) {
5130#if OMPT_SUPPORT && OMPT_OPTIONAL
5131 OMPT_STORE_RETURN_ADDRESS(gtid);
5132#endif
5133 __kmpc_end_taskgroup(loc, gtid);
5134 }
5135 KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
5136}
5137
5138/*!
5139@ingroup TASKING
5140@param loc Source location information
5141@param gtid Global thread ID
5142@param task Task structure
5143@param if_val Value of the if clause
5144@param lb Pointer to loop lower bound in task structure
5145@param ub Pointer to loop upper bound in task structure
5146@param st Loop stride
5147@param nogroup Flag, 1 if nogroup clause specified, 0 otherwise
5148@param sched Schedule specified 0/1/2 for none/grainsize/num_tasks
5149@param grainsize Schedule value if specified
5150@param task_dup Tasks duplication routine
5151
5152Execute the taskloop construct.
5153*/
5154void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5155 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
5156 int sched, kmp_uint64 grainsize, void *task_dup) {
5157 __kmp_assert_valid_gtid(gtid);
5158 KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
5159 __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5160 0, task_dup);
5161 KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
5162}
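// Rough usage sketch (illustrative only, not part of this file): for a loop
// such as
//   #pragma omp taskloop grainsize(10)
//   for (kmp_uint64 i = 0; i < 73; ++i) body(i);
// the compiler typically outlines the body into a task entry routine,
// allocates the pattern task with __kmpc_omp_task_alloc, stores the bounds
// (0 and 72) and the stride inside the task structure, and then issues
//   __kmpc_taskloop(loc, gtid, task, /*if_val=*/1, lb, ub, /*st=*/1,
//                   /*nogroup=*/0, /*sched=*/1, /*grainsize=*/10, task_dup);
// where lb and ub point at the bounds stored in the task structure.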
5163
5164/*!
5165@ingroup TASKING
5166@param loc Source location information
5167@param gtid Global thread ID
5168@param task Task structure
5169@param if_val Value of the if clause
5170@param lb Pointer to loop lower bound in task structure
5171@param ub Pointer to loop upper bound in task structure
5172@param st Loop stride
5173@param nogroup Flag, 1 if nogroup clause specified, 0 otherwise
5174@param sched Schedule specified 0/1/2 for none/grainsize/num_tasks
5175@param grainsize Schedule value if specified
5176@param modifier Modifier 'strict' for sched, 1 if present, 0 otherwise
5177@param task_dup Tasks duplication routine
5178
5179Execute the taskloop construct.
5180*/
5181void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5182 kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5183 int nogroup, int sched, kmp_uint64 grainsize,
5184 int modifier, void *task_dup) {
5185 __kmp_assert_valid_gtid(gtid);
5186 KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
5187 __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5188 modifier, task_dup);
5189 KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
5190}
5191
5192/*!
5193@ingroup TASKING
5194@param gtid Global Thread ID of current thread
5195@return Returns a pointer to the thread's current task async handle. If no task
5196is present or gtid is invalid, returns NULL.
5197
5198Acquires a pointer to the target async handle from the current task.
5199*/
5200 void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid) {
5201 if (gtid == KMP_GTID_DNE)
5202 return NULL;
5203
5204 kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5205 kmp_taskdata_t *taskdata = thread->th.th_current_task;
5206
5207 if (!taskdata)
5208 return NULL;
5209
5210 return &taskdata->td_target_data.async_handle;
5211}
5212
5213/*!
5214@ingroup TASKING
5215@param gtid Global Thread ID of current thread
5216@return Returns TRUE if the current task being executed of the given thread has
5217a task team allocated to it. Otherwise, returns FALSE.
5218
5219Checks if the current thread has a task team.
5220*/
5221 bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
5222 if (gtid == KMP_GTID_DNE)
5223 return FALSE;
5224
5225 kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5226 kmp_taskdata_t *taskdata = thread->th.th_current_task;
5227
5228 if (!taskdata)
5229 return FALSE;
5230
5231 return taskdata->td_task_team != NULL;
5232}
5233
5234#if OMPX_TASKGRAPH
5235// __kmp_find_tdg: identify a TDG through its ID
5236// tdg_id: ID of the TDG
5237// returns: If a TDG corresponding to this ID is found and it is not in
5238// its initial state, return a pointer to it; otherwise return nullptr
5239static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
5240 kmp_tdg_info_t *res = nullptr;
5241 if (__kmp_max_tdgs == 0)
5242 return res;
5243
5244 if (__kmp_global_tdgs == NULL)
5245 __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
5246 sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
5247
5248 if ((__kmp_global_tdgs[tdg_id]) &&
5249 (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
5250 res = __kmp_global_tdgs[tdg_id];
5251 return res;
5252}
5253
5254// __kmp_print_tdg_dot: prints the TDG to a dot file
5255// tdg: Pointer to the TDG
5256// gtid: Global Thread ID
5257void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg, kmp_int32 gtid) {
5258 kmp_int32 tdg_id = tdg->tdg_id;
5259 KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id));
5260
5261 char file_name[20];
5262 sprintf(file_name, "tdg_%d.dot", tdg_id);
5263 kmp_safe_raii_file_t tdg_file(file_name, "w");
5264
5265 kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5266 fprintf(tdg_file,
5267 "digraph TDG {\n"
5268 " compound=true\n"
5269 " subgraph cluster {\n"
5270 " label=TDG_%d\n",
5271 tdg_id);
5272 for (kmp_int32 i = 0; i < num_tasks; i++) {
5273 fprintf(tdg_file, " %d[style=bold]\n", i);
5274 }
5275 fprintf(tdg_file, " }\n");
5276 for (kmp_int32 i = 0; i < num_tasks; i++) {
5277 kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
5278 kmp_int32 *successors = tdg->record_map[i].successors;
5279 if (nsuccessors > 0) {
5280 for (kmp_int32 j = 0; j < nsuccessors; j++)
5281 fprintf(tdg_file, " %d -> %d \n", i, successors[j]);
5282 }
5283 }
5284 fprintf(tdg_file, "}");
5285 KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id));
5286}
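// Illustration (derived from the fprintf calls above): for a recorded TDG
// with id 0, three tasks, and edges 0->1 and 0->2, the file tdg_0.dot would
// contain roughly:
//   digraph TDG {
//    compound=true
//    subgraph cluster {
//    label=TDG_0
//    0[style=bold]
//    1[style=bold]
//    2[style=bold]
//    }
//    0 -> 1
//    0 -> 2
//   }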
5287
5288// __kmp_exec_tdg: launch the execution of a previous
5289// recorded TDG
5290// gtid: Global Thread ID
5291// tdg: Pointer to the TDG
5292void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5293 KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
5294 KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
5295 tdg->tdg_id, tdg->num_roots));
5296 kmp_node_info_t *this_record_map = tdg->record_map;
5297 kmp_int32 *this_root_tasks = tdg->root_tasks;
5298 kmp_int32 this_num_roots = tdg->num_roots;
5299 kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5300
5301 kmp_info_t *thread = __kmp_threads[gtid];
5302 kmp_taskdata_t *parent_task = thread->th.th_current_task;
5303
5304 if (tdg->rec_taskred_data) {
5305 __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
5306 }
5307
5308 for (kmp_int32 j = 0; j < this_num_tasks; j++) {
5309 kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
5310
5311 td->td_parent = parent_task;
5312 this_record_map[j].parent_task = parent_task;
5313
5314 kmp_taskgroup_t *parent_taskgroup =
5315 this_record_map[j].parent_task->td_taskgroup;
5316
5317 KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
5318 this_record_map[j].npredecessors);
5319 KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
5320
5321 if (parent_taskgroup) {
5322 KMP_ATOMIC_INC(&parent_taskgroup->count);
5323 // The taskgroup is different so we must update it
5324 td->td_taskgroup = parent_taskgroup;
5325 } else if (td->td_taskgroup != nullptr) {
5326 // If the parent doesn't have a taskgroup, remove it from the task
5327 td->td_taskgroup = nullptr;
5328 }
5329 if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
5330 KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
5331 }
5332
5333 for (kmp_int32 j = 0; j < this_num_roots; ++j) {
5334 __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
5335 }
5336 KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
5337 tdg->tdg_id, tdg->num_roots));
5338}
5339
5340// __kmp_start_record: set up a TDG structure and turn the
5341// recording flag to true
5342// gtid: Global Thread ID of the encountering thread
5343// input_flags: Flags associated with the TDG
5344// tdg_id: ID of the TDG to record
5345static inline void __kmp_start_record(kmp_int32 gtid,
5346 kmp_taskgraph_flags_t *flags,
5347 kmp_int32 tdg_id) {
5348 kmp_tdg_info_t *tdg =
5349 (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
5350 __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
5351 // Initializing the TDG structure
5352 tdg->tdg_id = tdg_id;
5353 tdg->map_size = INIT_MAPSIZE;
5354 tdg->num_roots = -1;
5355 tdg->root_tasks = nullptr;
5356 tdg->tdg_status = KMP_TDG_RECORDING;
5357 tdg->rec_num_taskred = 0;
5358 tdg->rec_taskred_data = nullptr;
5359 KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
5360
5361 // Initializing the list of nodes in this TDG
5362 kmp_node_info_t *this_record_map =
5363 (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
5364 for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
5365 kmp_int32 *successorsList =
5366 (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
5367 this_record_map[i].task = nullptr;
5368 this_record_map[i].successors = successorsList;
5369 this_record_map[i].nsuccessors = 0;
5370 this_record_map[i].npredecessors = 0;
5371 this_record_map[i].successors_size = __kmp_successors_size;
5372 KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
5373 }
5374
5375 __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
5376}
5377
5378// __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
5379// the beginning of the record process of a task region
5380// loc_ref: Location of TDG, not used yet
5381// gtid: Global Thread ID of the encountering thread
5382// input_flags: Flags associated with the TDG
5383// tdg_id: ID of the TDG to record, for now, incremental integer
5384// returns: 1 if we record, otherwise, 0
5385 kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
5386 kmp_int32 input_flags, kmp_int32 tdg_id) {
5387
5388 kmp_int32 res;
5389 kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
5390 KA_TRACE(10,
5391 ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
5392 gtid, loc_ref, input_flags, tdg_id));
5393
5394 if (__kmp_max_tdgs == 0) {
5395 KA_TRACE(
5396 10,
5397 ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
5398 "__kmp_max_tdgs = 0\n",
5399 gtid, loc_ref, input_flags, tdg_id));
5400 return 1;
5401 }
5402
5403 __kmpc_taskgroup(loc_ref, gtid);
5404 if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
5405 // TODO: use re_record flag
5406 __kmp_exec_tdg(gtid, tdg);
5407 res = 0;
5408 } else {
5409 __kmp_curr_tdg_idx = tdg_id;
5410 KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
5411 __kmp_start_record(gtid, flags, tdg_id);
5412 __kmp_num_tdg++;
5413 res = 1;
5414 }
5415 KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
5416 gtid, tdg_id, res ? "record" : "execute"));
5417 return res;
5418}
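// Usage sketch (illustrative only): code replaying a task graph brackets the
// task-creating region with __kmpc_start_record_task and
// __kmpc_end_record_task. On the first encounter the call above returns 1
// and the enclosed tasks are recorded into the TDG; on later encounters
// __kmp_find_tdg succeeds, __kmp_exec_tdg replays the recorded tasks, and 0
// is returned so the region body can be skipped.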
5419
5420// __kmp_end_record: set up a TDG after recording it
5421// gtid: Global thread ID
5422// tdg: Pointer to the TDG
5423void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5424 // Store roots
5425 kmp_node_info_t *this_record_map = tdg->record_map;
5426 kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5427 kmp_int32 *this_root_tasks =
5428 (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
5429 kmp_int32 this_map_size = tdg->map_size;
5430 kmp_int32 this_num_roots = 0;
5431 kmp_info_t *thread = __kmp_threads[gtid];
5432
5433 for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5434 if (this_record_map[i].npredecessors == 0) {
5435 this_root_tasks[this_num_roots++] = i;
5436 }
5437 }
5438
5439 // Update with roots info and mapsize
5440 tdg->map_size = this_map_size;
5441 tdg->num_roots = this_num_roots;
5442 tdg->root_tasks = this_root_tasks;
5443 KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
5444 tdg->tdg_status = KMP_TDG_READY;
5445
5446 if (thread->th.th_current_task->td_dephash) {
5447 __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
5448 thread->th.th_current_task->td_dephash = NULL;
5449 }
5450
5451 // Reset predecessor counter
5452 for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5453 KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
5454 this_record_map[i].npredecessors);
5455 }
5456 KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
5457
5458 if (__kmp_tdg_dot)
5459 __kmp_print_tdg_dot(tdg, gtid);
5460}
5461
5462// __kmpc_end_record_task: wrapper around __kmp_end_record to mark
5463// the end of recording phase
5464//
5465// loc_ref: Source location information
5466// gtid: Global thread ID
5467// input_flags: Flags attached to the graph
5468// tdg_id: ID of the TDG just finished recording
5469void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
5470 kmp_int32 input_flags, kmp_int32 tdg_id) {
5471 kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
5472
5473 KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
5474 " tdg=%d with flags=%d\n",
5475 gtid, loc_ref, tdg_id, input_flags));
5476 if (__kmp_max_tdgs) {
5477 // TODO: use input_flags->nowait
5478 __kmpc_end_taskgroup(loc_ref, gtid);
5479 if (__kmp_tdg_is_recording(tdg->tdg_status))
5480 __kmp_end_record(gtid, tdg);
5481 }
5482 KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
5483 " tdg=%d, its status is now READY\n",
5484 gtid, loc_ref, tdg_id));
5485}
5486#endif