doxygen/kmp__tasking_8cpp_source.html

/*

 * kmp_tasking.cpp -- OpenMP 3.0 tasking support.

 */


//===----------------------------------------------------------------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//


#include "kmp.h"

#include "kmp_i18n.h"

#include "kmp_itt.h"

#include "kmp_stats.h"

#include "kmp_wait_release.h"

#include "kmp_taskdeps.h"


#if OMPT_SUPPORT

#include "ompt-specific.h"

#endif


#if ENABLE_LIBOMPTARGET

// Cached address of the "__tgt_target_nowait_query" entry point. It is filled
// in by __kmp_init_target_task() from whatever KMP_DLSYM returns.
static void (*tgt_target_nowait_query)(void **);

// __kmp_init_target_task: look up "__tgt_target_nowait_query" with KMP_DLSYM
// and remember the result in tgt_target_nowait_query.
void __kmp_init_target_task() {
  // Write through a void** view of the function pointer, so the lookup result
  // is stored without casting it to a function-pointer type.
  void **slot = (void **)(&tgt_target_nowait_query);
  *slot = KMP_DLSYM("__tgt_target_nowait_query");
}

#endif


/* Forward declarations of helpers that are referenced in this file before a
   definition is visible (e.g. __kmp_push_task calls __kmp_enable_tasking and
   __kmp_alloc_task_deque). */
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr);
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data);
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team);
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
#if OMPX_TASKGRAPH
// Declarations needed only when the OMPX_TASKGRAPH extension is compiled in.
static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
int __kmp_taskloop_task(int gtid, void *ptask);
#endif


// returns 1 if new task is allowed to execute, 0 otherwise

// checks Task Scheduling constraint (if requested) and

// mutexinoutset dependencies if any


static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,

                                  const kmp_taskdata_t *tasknew,

                                  const kmp_taskdata_t *taskcurr) {

  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {

    // Check if the candidate obeys the Task Scheduling Constraints (TSC)

    // only descendant of all deferred tied tasks can be scheduled, checking

    // the last one is enough, as it in turn is the descendant of all others

    kmp_taskdata_t *current = taskcurr->td_last_tied;

    KMP_DEBUG_ASSERT(current != NULL);

    // check if the task is not suspended on barrier

    if (current->td_flags.tasktype == TASK_EXPLICIT ||

        current->td_taskwait_thread > 0) { // <= 0 on barrier

      kmp_int32 level = current->td_level;

      kmp_taskdata_t *parent = tasknew->td_parent;

      while (parent != current && parent->td_level > level) {

        // check generation up to the level of the current task

        parent = parent->td_parent;

        KMP_DEBUG_ASSERT(parent != NULL);

      }

      if (parent != current)

        return false;

    }

  }

  // Check mutexinoutset dependencies, acquire locks

  kmp_depnode_t *node = tasknew->td_depnode;

#if OMPX_TASKGRAPH

  if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {

#else

  if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {

#endif

    for (int i = 0; i < node->dn.mtx_num_locks; ++i) {

      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);

      if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))

        continue;

      // could not get the lock, release previous locks

      for (int j = i - 1; j >= 0; --j)

        __kmp_release_lock(node->dn.mtx_locks[j], gtid);

      return false;

    }

    // negative num_locks means all locks acquired successfully

    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;

  }

  return true;

}


// __kmp_realloc_task_deque: replace a full task deque by one twice as large
//
// thread: calling thread (only used for tracing here)
// thread_data: owner of the deque to grow
//
// The queued entries are copied in order, starting at the old head, to the
// front of the new buffer, so afterwards head is 0 and tail equals the old
// capacity. The caller must hold the deque_lock.
static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                     kmp_thread_data_t *thread_data) {
  const kmp_int32 old_capacity = TASK_DEQUE_SIZE(thread_data->td);
  // Growing is only expected when the deque is completely full.
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == old_capacity);
  const kmp_int32 new_capacity = 2 * old_capacity;

  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
                "%d] for thread_data %p\n",
                __kmp_gtid_from_thread(thread), old_capacity, new_capacity,
                thread_data));

  kmp_taskdata_t **old_deque = thread_data->td.td_deque;
  kmp_taskdata_t **grown_deque = (kmp_taskdata_t **)__kmp_allocate(
      new_capacity * sizeof(kmp_taskdata_t *));

  // Unroll the ring: the source index wraps with the old mask, the
  // destination index runs linearly from 0.
  int src = thread_data->td.td_deque_head;
  for (int dst = 0; dst < old_capacity; ++dst) {
    grown_deque[dst] = old_deque[src];
    src = (src + 1) & TASK_DEQUE_MASK(thread_data->td);
  }

  __kmp_free(old_deque);

  thread_data->td.td_deque_head = 0;
  thread_data->td.td_deque_tail = old_capacity;
  thread_data->td.td_deque = grown_deque;
  thread_data->td.td_deque_size = new_capacity;
}


// __kmp_alloc_task_pri_list: allocate one node for the list of priority deques
//
// The returned node has its deque lock initialized, td_deque_last_stolen set
// to -1 and an empty deque of INITIAL_TASK_DEQUE_SIZE slots. The 'priority'
// and 'next' fields are not assigned here; the callers in this file set them.
static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
  kmp_task_pri_t *node =
      (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
  kmp_thread_data_t *deque_data = &node->td;

  __kmp_init_bootstrap_lock(&deque_data->td.td_deque_lock);
  deque_data->td.td_deque_last_stolen = -1;

  KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
                "for thread_data %p\n",
                __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, deque_data));

  deque_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
  deque_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
      INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
  return node;
}


// __kmp_get_priority_deque_data: find or create the deque for priority 'pri'
//
// task_team: task team owning the list of priority deques
// pri: priority to look up
//
// The list is kept sorted from high to low priority. If a node with the
// requested priority exists, its deque data is returned; otherwise a new node
// is allocated and linked in at the position that keeps the list sorted.
// Deques of non-default priority tasks are shared between all threads in the
// team, as opposed to the per-thread deques of default-priority tasks.
// Must be called with task_team->tt.tt_task_pri_lock held and with a
// non-empty list.
static kmp_thread_data_t *
__kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
  // Skip every node whose priority is strictly higher than the requested one,
  // remembering the last node skipped.
  kmp_task_pri_t *prev = NULL;
  kmp_task_pri_t *cur = task_team->tt.tt_task_pri_list;
  while (cur && cur->priority > pri) {
    prev = cur;
    cur = cur->next;
  }

  // Exact match: reuse the existing deque.
  if (cur && cur->priority == pri)
    return &cur->td;

  // No deque for this priority yet. Fill the node in completely before it is
  // linked into the list, then insert it between 'prev' and 'cur'.
  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
  list->priority = pri;
  list->next = cur;
  if (prev)
    prev->next = list;
  else
    task_team->tt.tt_task_pri_list = list; // new highest priority
  return &list->td;
}


// __kmp_push_priority_task: Add a task to the team's priority task deque
//
// gtid: global thread id of the calling thread
// thread: thread structure of the calling thread
// taskdata: task to enqueue
// task_team: task team that owns the list of priority deques
// pri: priority selecting the deque
//
// Returns TASK_SUCCESSFULLY_PUSHED once the task is stored in the deque, or
// TASK_NOT_PUSHED when the deque is full, task throttling is enabled and
// __kmp_task_is_allowed() accepts the task for the calling thread.
static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
                                          kmp_taskdata_t *taskdata,
                                          kmp_task_team_t *task_team,
                                          kmp_int32 pri) {
  kmp_thread_data_t *thread_data = NULL;
  KA_TRACE(20,
           ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
            gtid, taskdata, pri));

  // Find task queue specific to priority value
  // The list head is first read without holding tt_task_pri_lock.
  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
  if (UNLIKELY(lst == NULL)) {
    __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
    // Re-read the head under the lock: another thread may have created the
    // list since the unlocked read above.
    if (task_team->tt.tt_task_pri_list == NULL) {
      // List of queues is still empty, allocate one.
      kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
      thread_data = &list->td;
      list->priority = pri;
      list->next = NULL;
      task_team->tt.tt_task_pri_list = list;
    } else {
      // Other thread initialized a queue. Check if it fits and get thread_data.
      thread_data = __kmp_get_priority_deque_data(task_team, pri);
    }
    __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
  } else {
    if (lst->priority == pri) {
      // Found queue of tasks with given priority.
      thread_data = &lst->td;
    } else {
      // The head has a different priority: search (and possibly extend) the
      // list under the lock.
      __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
      thread_data = __kmp_get_priority_deque_data(task_team, pri);
      __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
    }
  }
  KMP_DEBUG_ASSERT(thread_data);

  // From here on the selected deque is modified under its own lock.
  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
  // Check if deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      // Throttling: the task is allowed to run on this thread, so it is
      // reported as not pushed instead of growing the deque.
      __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
      KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      // expand deque to push the task which is not allowed to execute
      __kmp_realloc_task_deque(thread, thread_data);
    }
  }
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));
  // Push taskdata.
  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
  KMP_FSYNC_RELEASING(taskdata); // releasing child
  KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
                "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
  // The team-wide count of queued priority tasks is bumped only after the
  // deque lock has been released.
  task_team->tt.tt_num_task_pri++; // atomic inc
  return TASK_SUCCESSFULLY_PUSHED;
}


// __kmp_push_task: Add a task to the thread's deque
//
// gtid: global thread id of the calling thread
// task: task to enqueue
//
// Returns TASK_SUCCESSFULLY_PUSHED when the task was queued (on the caller's
// deque, on a priority deque, or handed over to a hidden helper thread).
// Returns TASK_NOT_PUSHED when the task is marked task_serial, or when the
// deque is full, task throttling is enabled and __kmp_task_is_allowed()
// accepts the task for the calling thread.
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);

  // If we encounter a hidden helper task, and the current thread is not a
  // hidden helper thread, we have to give the task to any hidden helper thread
  // starting from its shadow one.
  if (UNLIKELY(taskdata->td_flags.hidden_helper &&
               !KMP_HIDDEN_HELPER_THREAD(gtid))) {
    kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
    __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
    // Signal the hidden helper threads.
    __kmp_hidden_helper_worker_thread_signal();
    return TASK_SUCCESSFULLY_PUSHED;
  }

  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized
  if (UNLIKELY(taskdata->td_flags.task_serial)) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  // Tasks with a positive, explicitly specified priority go to the shared
  // priority deques, with the priority capped at __kmp_max_task_priority.
  if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
      __kmp_max_task_priority > 0) {
    int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
    return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
  }

  // Find tasking deque specific to encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only owner can allocate. If the task is hidden_helper,
  // we don't need it either because we have initialized the dequeue for hidden
  // helper thread data.
  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  // Set to 1 once td_deque_lock has been taken on the "deque full" path, so
  // that it is not acquired a second time below.
  int locked = 0;
  // Check if deque is full
  // This first check is made without holding the deque lock.
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
      locked = 1;
      // Re-check under the lock; the deque is only grown if it is still full.
      if (TCR_4(thread_data->td.td_deque_ntasks) >=
          TASK_DEQUE_SIZE(thread_data->td)) {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Lock the deque for the task push operation
  if (!locked) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    // Need to recheck as we can get a proxy task from thread outside of OpenMP
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      if (__kmp_enable_task_throttling &&
          __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                                thread->th.th_current_task)) {
        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
        KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
                      "returning TASK_NOT_PUSHED for task %p\n",
                      gtid, taskdata));
        return TASK_NOT_PUSHED;
      } else {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Must have room since no thread can add tasks but calling thread
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
  KMP_FSYNC_RELEASING(taskdata); // releasing child
  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return TASK_SUCCESSFULLY_PUSHED;
}


// __kmp_pop_current_task_from_thread: make the parent of the thread's current
// task the current task again; used when a team ends
//
// this_thr: thread structure whose current task is replaced
//
// Both trace messages print a literal 0 in the T#%d position.
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
  kmp_taskdata_t *leaving = this_thr->th.th_current_task;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, leaving, leaving->td_parent));

  this_thr->th.th_current_task = leaving->td_parent;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));
}


// __kmp_push_current_task_to_thread: set up current task in called thread for a
// new team
//
// this_thr: thread structure to set up (must not be NULL)
// team: team for implicit task data
// tid: thread within team to set up
//
// For tid 0 the thread's present current task becomes the parent of the
// team's implicit task 0, unless that implicit task already is the current
// task. Every other tid copies the parent recorded in implicit task 0. In
// both cases the thread's current task ends up being the implicit task of
// 'tid'.
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {
  // Validate the thread pointer before anything, including the trace below,
  // dereferences it.
  KMP_DEBUG_ASSERT(this_thr != NULL);

  // current task of the thread is a parent of the new just created implicit
  // tasks of new team
  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));

  if (tid == 0) {
    // Skip the update when implicit task 0 is already current; otherwise it
    // would be assigned itself as parent.
    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
      team->t.t_implicit_task_taskdata[0].td_parent =
          this_thr->th.th_current_task;
      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
    }
  } else {
    team->t.t_implicit_task_taskdata[tid].td_parent =
        team->t.t_implicit_task_taskdata[0].td_parent;
    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
  }

  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));
}


// __kmp_task_start: bookkeeping performed when an explicit task begins to run
//
// gtid: global thread id of calling thread
// task: task starting execution
// current_task: task being suspended in favour of 'task'
//
// Clears the 'executing' flag of current_task, installs 'task' as the calling
// thread's current task and sets its 'started' and 'executing' flags.
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // Suspend the task that was running so far.
  // TODO: GEH - make sure root team implicit task is initialized properly, so
  // that current_task->td_flags.executing == 1 can be asserted here.
  current_task->td_flags.executing = 0;

  // The starting task becomes the thread's current task.
  __kmp_threads[gtid]->th.th_current_task = taskdata;

  // Only an untied task may already be flagged as started or executing.
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
}


#if OMPT_SUPPORT

//------------------------------------------------------------------------------


// __ompt_task_start:
//   Build and trigger the task-schedule event for a task that is about to run
//
// task: task that is about to run
// current_task: task being switched away from
// gtid: global thread id of the calling thread
//
// The status reported for current_task is ompt_task_yield if the thread's
// ompt_task_yielded flag is set (the flag is then cleared), and
// ompt_task_switch otherwise. current_task is also recorded as the
// scheduling parent of the new task.
static inline void __ompt_task_start(kmp_task_t *task,
                                     kmp_taskdata_t *current_task,
                                     kmp_int32 gtid) {
  kmp_info_t *thr = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);

  ompt_task_status_t status;
  if (thr->th.ompt_thread_info.ompt_task_yielded) {
    status = ompt_task_yield;
    thr->th.ompt_thread_info.ompt_task_yielded = 0;
  } else {
    status = ompt_task_switch;
  }

  /* let OMPT know that we're about to run this task */
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(current_task->ompt_task_info.task_data), status,
        &(taskdata->ompt_task_info.task_data));
  }
  taskdata->ompt_task_info.scheduling_parent = current_task;
}


// __ompt_task_finish:
//   Build and trigger final task-schedule event
//
// task: task that stops running
// resumed_task: task that runs next; may be NULL, in which case NULL is
//               passed to the callback as the next task's data
// status: status to report; replaced by ompt_task_cancel when cancellation is
//         enabled and the task's taskgroup has a pending cancel_taskgroup
//         request
//
// Does nothing unless a task_schedule callback is registered.
static inline void __ompt_task_finish(kmp_task_t *task,
                                      kmp_taskdata_t *resumed_task,
                                      ompt_task_status_t status) {
  if (!ompt_enabled.ompt_callback_task_schedule)
    return;

  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
      taskdata->td_taskgroup->cancel_request == cancel_taskgroup)
    status = ompt_task_cancel;

  /* let OMPT know that we're returning to the callee task */
  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
      &(taskdata->ompt_task_info.task_data), status,
      (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
}

#endif


// __kmpc_omp_task_begin_if0_template: shared body of __kmpc_omp_task_begin_if0
//
// loc_ref: source location information (only used for tracing here)
// gtid: global thread number
// task: task thunk for the started task
// frame_address: frame address stored in the OMPT frame info (ompt == true)
// return_address: code pointer passed to the task_create callback
//                 (ompt == true)
//
// template<ompt>: when false, the whole OMPT section is skipped.
template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
                                               kmp_task_t *task,
                                               void *frame_address,
                                               void *return_address) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
                "current_task=%p\n",
                gtid, loc_ref, taskdata, current_task));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // An untied task holds an extra count so that its structure is not freed
    // prematurely.
    kmp_int32 counter = KMP_ATOMIC_INC(&taskdata->td_untied_count) + 1;
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                  "incremented for task %p\n",
                  gtid, counter, taskdata));
  }

  // Execute this task immediately, not deferred.
  taskdata->td_flags.task_serial = 1;
  __kmp_task_start(gtid, task, current_task);

#if OMPT_SUPPORT
  if (ompt) {
    ompt_task_info_t *parent_info = &(current_task->ompt_task_info);

    // Record the frame only if the encountering task has no enter frame yet.
    if (parent_info->frame.enter_frame.ptr == NULL) {
      taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
      parent_info->frame.enter_frame.ptr = frame_address;
      taskdata->ompt_task_info.frame.exit_frame_flags = OMPT_FRAME_FLAGS_APP;
      parent_info->frame.enter_frame_flags = OMPT_FRAME_FLAGS_APP;
    }

    if (ompt_enabled.ompt_callback_task_create) {
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent_info->task_data), &(parent_info->frame),
          &(taskdata->ompt_task_info.task_data),
          TASK_TYPE_DETAILS_FORMAT(taskdata), 0, return_address);
    }

    __ompt_task_start(task, current_task, gtid);
  }
#endif // OMPT_SUPPORT

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, taskdata));
}


#if OMPT_SUPPORT

// __kmpc_omp_task_begin_if0_ompt: out-of-line (OMPT_NOINLINE) wrapper that
// runs the <true> instantiation of __kmpc_omp_task_begin_if0_template, i.e.
// the variant that contains the OMPT code. All arguments are forwarded
// unchanged.
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}

#endif // OMPT_SUPPORT


// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
//
// With OMPT support compiled in and ompt_enabled.enabled set, the work is
// delegated to __kmpc_omp_task_begin_if0_ompt together with the values of
// OMPT_GET_FRAME_ADDRESS(1) and OMPT_LOAD_RETURN_ADDRESS(gtid). Otherwise the
// <false> instantiation of the template runs with NULL for both addresses.
#ifdef __s390x__
// This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x.
// In order for it to work correctly, the caller also needs to be compiled with
// backchain. If a caller is compiled without backchain,
// OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not
// crash.
__attribute__((target("backchain")))
#endif

void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    // The return address is stored first and then loaded again as an argument
    // of the call below.
    OMPT_STORE_RETURN_ADDRESS(gtid);
    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
                                   OMPT_GET_FRAME_ADDRESS(1),
                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
    return;
  }
#endif
  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}


#ifdef TASK_UNUSED

// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
//
// loc_ref: source location information (only used for tracing here)
// gtid: global thread number
// task: task thunk for the started task
//
// Only compiled when TASK_UNUSED is defined. Apart from tracing, it just calls
// __kmp_task_start() with the thread's current task as the suspended task.
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}

#endif // TASK_UNUSED


// __kmp_free_task: release the storage of a completed explicit task
//
// gtid: Global thread ID of calling thread (only used for tracing here)
// taskdata: task to free
// thread: thread data structure of caller
//
// The task's destructors pointer and priority are cleared and the 'freed'
// flag is set before the memory is handed back. With OMPX_TASKGRAPH, a task
// that belongs to a taskgraph is not deallocated; its flags and counters are
// reset instead.
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);

  // Clear data to not be re-used later by mistake.
  kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
  task->data1.destructors = NULL;
  task->data2.priority = 0;

  taskdata->td_flags.freed = 1;

#if OMPX_TASKGRAPH
  if (taskdata->is_taskgraph) {
    // Tasks in a taskgraph are kept: reset the flags and counters instead of
    // releasing the memory.
    taskdata->td_flags.complete = 0;
    taskdata->td_flags.started = 0;
    taskdata->td_flags.freed = 0;
    taskdata->td_flags.executing = 0;
    taskdata->td_flags.task_serial =
        (taskdata->td_parent->td_flags.final ||
         taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);

    // taskdata->td_allow_completion_event.pending_events_count = 1;
    KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
    KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
    // start at one because counts current task and children
    KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
  } else
#endif
  {
    // deallocate the taskdata and shared variable blocks associated with this
    // task
#if USE_FAST_MEMORY
    __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
    __kmp_thread_free(thread, taskdata);
#endif
  }

  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}


// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
//
// Drops one count from taskdata's td_allocated_child_tasks. Each time a
// count reaches zero the task is freed and the walk moves on to its parent.
// The walk stops after the first free for a serialized, non-proxy task, and
// it always stops at an implicit task.
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // "- 1" turns the value returned by KMP_ATOMIC_DEC into the count that
  // remains after the decrement.
  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    // Read the parent before the task is freed.
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    if (team_serial)
      return;
    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
      if (taskdata->td_dephash) { // do we need to cleanup dephash?
        // NOTE: this 'children' shadows the outer variable; it holds the
        // implicit task's count of incomplete child tasks.
        int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
        kmp_tasking_flags_t flags_old = taskdata->td_flags;
        if (children == 0 && flags_old.complete == 1) {
          kmp_tasking_flags_t flags_new = flags_old;
          flags_new.complete = 0;
          // The flags word is swapped with a compare-and-store, so the
          // dephash entries are freed only if the swap succeeds.
          if (KMP_COMPARE_AND_STORE_ACQ32(
                  RCAST(kmp_int32 *, &taskdata->td_flags),
                  *RCAST(kmp_int32 *, &flags_old),
                  *RCAST(kmp_int32 *, &flags_new))) {
            KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
                           "dephash of implicit task %p\n",
                           gtid, taskdata));
            // cleanup dephash of finished implicit task
            __kmp_dephash_free_entries(thread, taskdata->td_dephash);
          }
        }
      }
      return;
    }
    // Predecrement simulated by "- 1" calculation
    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  // Reached only when a task still has a non-zero count after the decrement.
  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}


// __kmp_track_children_task: decide whether child task counters have to be
// maintained for the given task.
//
// Tracking is required as soon as one of the following holds:
// 1. the team is parallel and tasking is not serialized;
// 2. the task is a proxy, detachable or hidden helper task;
// 3. the parent's incomplete-children counter is already greater than 0.
// Case 3 covers a serialized team that encountered a detached or hidden
// helper task T: the execution of T is still deferred and a regular task may
// depend on T, so task synchronization would break without the counters.
//
// taskdata: task to be examined
// returns: true if child counters must be tracked, false otherwise
static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
  kmp_tasking_flags_t td_flags = taskdata->td_flags;

  // Case 1: neither the team nor tasking is serialized.
  if (!td_flags.team_serial && !td_flags.tasking_ser)
    return true;

  // Case 2: proxy, detachable or hidden helper task.
  if (td_flags.proxy == TASK_PROXY || td_flags.detachable == TASK_DETACHABLE ||
      td_flags.hidden_helper)
    return true;

  // Case 3: the parent already tracks incomplete children.
  if (KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0)
    return true;

#if OMPX_TASKGRAPH
  // Taskgraph tasks additionally track while their taskgroup count is > 0.
  if (taskdata->td_taskgroup && taskdata->is_taskgraph)
    return KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
#endif
  return false;
}


// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed.  (may be NULL if task is serialized)
//
// template<ompt>: effectively ompt_enabled.enabled!=0
// the version with ompt=false is inlined, allowing to optimize away all ompt
// code in this case
//
// Outline of the steps performed below:
//  1. An untied task whose td_untied_count is still positive after the
//     decrement is only suspended: the resumed task becomes the current task
//     again and the function returns without completing or freeing the task.
//  2. The compiler-generated destructor thunk is invoked if the task's
//     destructors_thunk flag is set.
//  3. A detachable task whose completion event is still of type
//     KMP_EVENT_ALLOW_COMPLETION is turned into a proxy task instead of being
//     completed.
//  4. A task with a non-NULL target async handle is passed to
//     __kmpc_give_task instead of being completed.
//  5. Otherwise the task is marked complete, its dependences are released,
//     the parent/taskgroup counters are decremented and
//     __kmp_free_task_and_ancestors is called.
// In all non-early-return paths th_current_task is set to resumed_task and
// resumed_task is flagged as executing.
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
#if OMPX_TASKGRAPH
  // Cached copy of taskdata->is_taskgraph: avoids a seg fault when the value
  // is needed after taskdata has been freed (vanilla taskloop case).
  bool is_taskgraph;
#endif
#if KMP_DEBUG
  // Only used for tracing/asserting; stays 0 when no counter is decremented.
  kmp_int32 children = 0;
#endif
  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

#if OMPX_TASKGRAPH
  is_taskgraph = taskdata->is_taskgraph;
#endif

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  /* If the tasks' destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks.  The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // Cleared below when the task is detached or re-enqueued instead of being
  // completed; gates the completion bookkeeping and the final free.
  bool completed = true;
  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
    // The event type is checked once without the lock and again with the lock
    // held before the task is detached.
    if (taskdata->td_allow_completion_event.type ==
        KMP_EVENT_ALLOW_COMPLETION) {
      // event hasn't been fulfilled yet. Try to detach task.
      __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
      if (taskdata->td_allow_completion_event.type ==
          KMP_EVENT_ALLOW_COMPLETION) {
        // task finished execution
        KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
        taskdata->td_flags.executing = 0; // suspend the finishing task

#if OMPT_SUPPORT
        // For a detached task, which is not completed, we switch back
        // the omp_fulfill_event signals completion
        // locking is necessary to avoid a race with ompt_task_late_fulfill
        if (ompt)
          __ompt_task_finish(task, resumed_task, ompt_task_detach);
#endif

        // no access to taskdata after this point!
        // __kmp_fulfill_event might free taskdata at any time from now
        // NOTE(review): the statement below and the lock release still touch
        // taskdata; presumably this is safe because the event lock is held
        // until the release -- confirm against __kmp_fulfill_event.

        taskdata->td_flags.proxy = TASK_PROXY; // proxify!
        completed = false;
      }
      __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
    }
  }

  // Tasks with valid target async handles must be re-enqueued.
  if (taskdata->td_target_data.async_handle != NULL) {
    // Note: no need to translate gtid to its shadow. If the current thread is a
    // hidden helper one, then the gtid is already correct. Otherwise, hidden
    // helper threads are disabled, and gtid refers to a OpenMP thread.
#if OMPT_SUPPORT
    if (ompt) {
      __ompt_task_finish(task, resumed_task, ompt_task_switch);
    }
#endif
    __kmpc_give_task(task, __kmp_tid_from_gtid(gtid));
    if (KMP_HIDDEN_HELPER_THREAD(gtid))
      __kmp_hidden_helper_worker_thread_signal();
    completed = false;
  }

  if (completed) {
    taskdata->td_flags.complete = 1; // mark the task as completed
#if OMPX_TASKGRAPH
    taskdata->td_flags.onced = 1; // mark the task as ran once already
#endif

#if OMPT_SUPPORT
    // This is not a detached task, we are done here
    if (ompt)
      __ompt_task_finish(task, resumed_task, ompt_task_complete);
#endif
    // TODO: What would be the balance between the conditions in the function
    // and an atomic operation?
    if (__kmp_track_children_task(taskdata)) {
      __kmp_release_deps(gtid, taskdata);
      // Predecrement simulated by "- 1" calculation
      // In non-debug builds only the KMP_ATOMIC_DEC statement remains; the
      // "children = -1 +" prefix exists solely for the assert/trace below.
#if KMP_DEBUG
      children = -1 +
#endif
          KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
      KMP_DEBUG_ASSERT(children >= 0);
      // For taskgraph tasks the taskgroup counter is decremented at the very
      // end of this function instead of here.
#if OMPX_TASKGRAPH
      if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
#else
      if (taskdata->td_taskgroup)
#endif
        KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
    } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
                             task_team->tt.tt_hidden_helper_task_encountered)) {
      // if we found proxy or hidden helper tasks there could exist a dependency
      // chain with the proxy task as origin
      __kmp_release_deps(gtid, taskdata);
    }
    // td_flags.executing must be marked as 0 after __kmp_release_deps has been
    // called. Otherwise, if a task is executed immediately from the
    // release_deps code, the flag will be reset to 1 again by this same
    // function
    KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
    taskdata->td_flags.executing = 0; // suspend the finishing task

    // Decrement the counter of hidden helper tasks to be executed.
    if (taskdata->td_flags.hidden_helper) {
      // Hidden helper tasks can only be executed by hidden helper threads.
      KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
      KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
    }
  }

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  if (completed)
    __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

#if OMPX_TASKGRAPH
  // The cached is_taskgraph is tested first, so taskdata is only read again
  // here for taskgraph tasks.
  if (is_taskgraph && __kmp_track_children_task(taskdata) &&
      taskdata->td_taskgroup) {
    // TDG: we only release taskgroup barrier here because
    // free_task_and_ancestors will call
    // __kmp_free_task, which resets all task parameters such as
    // taskdata->started, etc. If we release the barrier earlier, these
    // parameters could be read before being reset. This is not an issue for
    // non-TDG implementation because we never reuse a task(data) structure
    KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
  }
#endif

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}


// __kmpc_omp_task_complete_if0_template: shared body of the
// __kmpc_omp_task_complete_if0 entry points.
//
// loc_ref: source location information (only used for tracing here)
// gtid: global thread number of the calling thread
// task: task thunk of the task that finished
//
// template<ompt>: selects the __kmp_task_finish instantiation and, when
// OMPT_SUPPORT is compiled in, whether the enter frame obtained from
// __ompt_get_task_info_internal(0, ...) is reset after the task has finished.
template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  KMP_DEBUG_ASSERT(gtid >= 0);

  // No task to resume is passed in; __kmp_task_finish works it out itself.
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (!ompt)
    return;

  // Clear the enter frame and mark it as a runtime frame.
  ompt_frame_t *frame;
  __ompt_get_task_info_internal(0, NULL, NULL, &frame, NULL, NULL);
  frame->enter_frame = ompt_data_none;
  frame->enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
#endif
}


#if OMPT_SUPPORT
// __kmpc_omp_task_complete_if0_ompt: variant of __kmpc_omp_task_complete_if0
// that forwards to the <true> (OMPT-enabled) instantiation of
// __kmpc_omp_task_complete_if0_template. Only compiled when OMPT_SUPPORT is
// set and declared OMPT_NOINLINE.
//
// loc_ref: source location information
// gtid: global thread number
// task: task thunk for the completed task
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT


// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
//
// When OMPT support is compiled in and enabled at run time the call is routed
// to __kmpc_omp_task_complete_if0_ompt; in every other case the <false>
// instantiation of the template is used.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
  else
#endif
    __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}


#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
//
// Only compiled when TASK_UNUSED is defined. Always uses the non-OMPT
// (<false>) instantiation of __kmp_task_finish and passes no resumed task.
//
// loc_ref: source location information (only used for tracing here)
// gtid: global thread number
// task: task thunk for the completed task
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish<false>(gtid, task,
                           NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED


// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref:  reference to source location of parallel region
// this_thr:  thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVS.  This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref.  Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  // The implicit task is the tid-th entry of the team's taskdata array.
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  // Note: the "reinit" label in this trace prints the value of set_curr_task.
  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));

  task->td_task_id = KMP_GEN_TASK_ID();
  task->td_team = team;
  //    task->td_parent   = NULL;  // fix for CQ230101 (broken parent task info
  //    in debugger)
  task->td_ident = loc_ref;
  task->td_taskwait_ident = NULL;
  task->td_taskwait_counter = 0;
  task->td_taskwait_thread = 0;

  task->td_flags.tiedness = TASK_TIED;
  task->td_flags.tasktype = TASK_IMPLICIT;
  task->td_flags.proxy = TASK_FULL;

  // All implicit tasks are executed immediately, not deferred
  task->td_flags.task_serial = 1;
  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // The implicit task is flagged as already started and currently executing.
  task->td_flags.started = 1;
  task->td_flags.executing = 1;
  task->td_flags.complete = 0;
  task->td_flags.freed = 0;
#if OMPX_TASKGRAPH
  task->td_flags.onced = 0;
#endif

  task->td_depnode = NULL;
  task->td_last_tied = task;
  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;

  if (set_curr_task) { // only do this init first time thread is created
    KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
    // Not used: don't need to deallocate implicit task
    KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
    task->td_taskgroup = NULL; // An implicit task does not have taskgroup
    task->td_dephash = NULL;
    __kmp_push_current_task_to_thread(this_thr, team, tid);
  } else {
    // The child counters, td_taskgroup and td_dephash are left untouched in
    // this case; debug builds check that both counters are already zero.
    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
  }

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(task, tid);
#endif

  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
                team, task));
}


// __kmp_finish_implicit_task: Release resources associated to implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the next
// parallel region.
//
// thread:  thread data structure corresponding to implicit task
//
// If the implicit task owns a dephash, the task is flagged complete and, once
// it has no incomplete children, one thread wins the compare-and-store that
// clears the complete flag again and frees the dephash entries.
void __kmp_finish_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *implicit_task = thread->th.th_current_task;
#if ENABLE_LIBOMPTARGET
  // Give an opportunity to the offload runtime to synchronize any unfinished
  // target async regions before finishing the implicit task
  if (UNLIKELY(kmp_target_sync_cb != NULL))
    (*kmp_target_sync_cb)(NULL, thread->th.th_info.ds.ds_gtid,
                          KMP_TASKDATA_TO_TASK(implicit_task), NULL);
#endif // ENABLE_LIBOMPTARGET

  // Nothing to clean up without a dependence hash.
  if (!implicit_task->td_dephash)
    return;

  implicit_task->td_flags.complete = 1;
#if OMPX_TASKGRAPH
  implicit_task->td_flags.onced = 1;
#endif

  int pending_children =
      KMP_ATOMIC_LD_ACQ(&implicit_task->td_incomplete_child_tasks);
  kmp_tasking_flags_t flags_before = implicit_task->td_flags;
  if (pending_children != 0 || flags_before.complete != 1)
    return;

  // Claim the cleanup by atomically switching complete from 1 back to 0.
  kmp_tasking_flags_t flags_after = flags_before;
  flags_after.complete = 0;
  if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &implicit_task->td_flags),
                                  *RCAST(kmp_int32 *, &flags_before),
                                  *RCAST(kmp_int32 *, &flags_after))) {
    KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
                   "dephash of implicit task %p\n",
                   thread->th.th_info.ds.ds_gtid, implicit_task));
    __kmp_dephash_free_entries(thread, implicit_task->td_dephash);
  }
}


// __kmp_free_implicit_task: Release the dependence hash of the thread's
// current (implicit) task, if it has one, and reset the pointer to NULL.
//
// thread:  thread data structure corresponding to implicit task
void __kmp_free_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *implicit_task = thread->th.th_current_task;

  // Nothing to do without a current task or without a dephash.
  if (implicit_task == NULL || implicit_task->td_dephash == NULL)
    return;

  __kmp_dephash_free(thread, implicit_task->td_dephash);
  implicit_task->td_dephash = NULL;
}


// Round a size up to the next multiple of val, where val is a power of two.
// Used to insert padding between structures co-allocated using a single
// malloc() call.
//
// size: value to be rounded
// val: alignment; the mask arithmetic below relies on it being a power of two
// returns: size if it is already a multiple of val; otherwise the next
// multiple of val, or -- if adding val would exceed KMP_SIZE_T_MAX -- size
// rounded DOWN to a multiple of val.
static size_t __kmp_round_up_to_val(size_t size, size_t val) {
  const size_t low_bits = val - 1;

  // Already aligned: nothing to do.
  if ((size & low_bits) == 0)
    return size;

  size_t rounded = size & ~low_bits; // drop the remainder
  if (rounded <= KMP_SIZE_T_MAX - val)
    rounded += val; // Round up if there is no overflow.
  return rounded;
} // __kmp_round_up_to_val


// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
//
// loc_ref: source location information
// gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the ''new''
// task encountered. Passed by pointer; this routine may modify it (the
// hidden_helper, final, tiedness and merged_if0 fields are written below).
// sizeof_kmp_task_t:  Size in bytes of kmp_task_t data structure including
// private vars accessed in task.
// sizeof_shareds:  Size in bytes of array of pointers to shared vars accessed
// in task.
// task_entry: Pointer to task code entry point generated by compiler.
// returns: a pointer to the allocated kmp_task_t structure (task).
//
// Memory layout: a single allocation of (shareds_offset + sizeof_shareds)
// bytes holds the kmp_taskdata_t at its start, the kmp_task_t obtained through
// KMP_TASKDATA_TO_TASK, and -- when sizeof_shareds > 0 -- the shareds area
// starting at byte offset shareds_offset.
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  // The task currently executing on this thread becomes the new task's parent.
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
    __kmp_middle_initialize();

  if (flags->hidden_helper) {
    if (__kmp_enable_hidden_helper) {
      if (!TCR_4(__kmp_init_hidden_helper))
        __kmp_hidden_helper_initialize();
    } else {
      // If the hidden helper task is not enabled, reset the flag to FALSE.
      flags->hidden_helper = FALSE;
    }
  }

  // NOTE(review): the size_t arguments are printed with %ld here and in the
  // traces further down -- confirm this matches size_t on all targets.
  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  KMP_DEBUG_ASSERT(parent_task);
  // A child of a final task is itself final.
  if (parent_task->td_flags.final) {
    // NOTE(review): this branch is empty and has no effect.
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }

  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
    // Untied task encountered causes the TSC algorithm to check entire deque of
    // the victim thread. If no untied task encountered, then checking the head
    // of the deque should be enough.
    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
  }

  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
  // the tasking setup
  // when that happens is too late.
  if (UNLIKELY(flags->proxy == TASK_PROXY ||
               flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
    // Proxy tasks are forced to be untied and merged_if0.
    if (flags->proxy == TASK_PROXY) {
      flags->tiedness = TASK_UNTIED;
      flags->merged_if0 = 1;
    }
    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
          setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      __kmp_task_team_setup(thread, team);
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    // Record on the task team which special task kinds have been seen; the
    // flags are only written when they are not already set.
    if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
        task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
    if (flags->hidden_helper &&
        task_team->tt.tt_hidden_helper_task_encountered == FALSE)
      TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
  }

  // Calculate shared structure offset including padding after kmp_task_t struct
  // to align pointers in shared struct
  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(kmp_uint64));

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
                shareds_offset));
  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
                sizeof_shareds));

  // Avoid double allocation here by combining shareds with taskdata
  // NOTE(review): the returned pointer is used without a NULL check;
  // presumably the allocators do not return on failure -- confirm.
#if USE_FAST_MEMORY
  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                               sizeof_shareds);
#else /* ! USE_FAST_MEMORY */
  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                               sizeof_shareds);
#endif /* USE_FAST_MEMORY */

  task = KMP_TASKDATA_TO_TASK(taskdata);

  // Make sure task & taskdata are aligned appropriately
#if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
#else
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
#endif
  if (sizeof_shareds > 0) {
    // Avoid double allocation here by combining shareds with taskdata
    task->shareds = &((char *)taskdata)[shareds_offset];
    // Make sure shareds struct is aligned to pointer size
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  } else {
    task->shareds = NULL;
  }
  task->routine = task_entry;
  task->part_id = 0; // AC: Always start with 0 part id

  taskdata->td_task_id = KMP_GEN_TASK_ID();
  taskdata->td_team = thread->th.th_team;
  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
  taskdata->td_ident = loc_ref;
  taskdata->td_taskwait_ident = NULL;
  taskdata->td_taskwait_counter = 0;
  taskdata->td_taskwait_thread = 0;
  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
  // avoid copying icvs for proxy tasks
  if (flags->proxy == TASK_FULL)
    copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);

  // Start from the caller-provided flags; the runtime-owned fields are
  // overwritten one by one below.
  taskdata->td_flags = *flags;
  taskdata->td_task_team = thread->th.th_task_team;
  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
  taskdata->td_flags.tasktype = TASK_EXPLICIT;
  // If it is hidden helper task, we need to set the team and task team
  // correspondingly.
  if (flags->hidden_helper) {
    kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
    taskdata->td_team = shadow_thread->th.th_team;
    taskdata->td_task_team = shadow_thread->th.th_task_team;
  }

  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);

  // GEH - TODO: fix this to copy parent task's value of team_serial flag
  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  // GEH - Note we serialize the task if the team is serialized to make sure
  // implicit parallel region tasks are not left until program termination to
  // execute. Also, it helps locality to execute immediately.

  taskdata->td_flags.task_serial =
      (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
       taskdata->td_flags.tasking_ser || flags->merged_if0);

  taskdata->td_flags.started = 0;
  taskdata->td_flags.executing = 0;
  taskdata->td_flags.complete = 0;
  taskdata->td_flags.freed = 0;
#if OMPX_TASKGRAPH
  taskdata->td_flags.onced = 0;
  taskdata->is_taskgraph = 0;
  taskdata->tdg = nullptr;
#endif
  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
  // start at one because counts current task and children
  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
  taskdata->td_taskgroup =
      parent_task->td_taskgroup; // task inherits taskgroup from the parent task
  taskdata->td_dephash = NULL;
  taskdata->td_depnode = NULL;
  taskdata->td_target_data.async_handle = NULL;
  if (flags->tiedness == TASK_UNTIED)
    taskdata->td_last_tied = NULL; // will be set when the task is scheduled
  else
    taskdata->td_last_tied = taskdata;
  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, gtid);
#endif
  // TODO: What would be the balance between the conditions in the function and
  // an atomic operation?
  // __kmp_track_children_task reads td_flags, td_parent and td_taskgroup,
  // all of which have been assigned above.
  if (__kmp_track_children_task(taskdata)) {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
    if (parent_task->td_taskgroup)
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    // Only need to keep track of allocated child tasks for explicit tasks since
    // implicit not deallocated
    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
      KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
    }
    if (flags->hidden_helper) {
      // Overrides the task_serial value computed above.
      taskdata->td_flags.task_serial = FALSE;
      // Increment the number of hidden helper tasks to be executed
      KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
    }
  }

#if OMPX_TASKGRAPH
  // While the current TDG is recording, every task except the taskloop task
  // entry is tagged as part of the taskgraph.
  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
      (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
    taskdata->is_taskgraph = 1;
    taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
    // NOTE(review): this replaces the td_task_id generated earlier in this
    // function with a freshly generated one -- confirm that is intended.
    taskdata->td_task_id = KMP_GEN_TASK_ID();
    taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
  }
#endif
  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                gtid, taskdata, taskdata->td_parent));

  return task;
}


// __kmpc_omp_task_alloc: compiler-facing wrapper around __kmp_task_alloc.
//
// loc_ref: source location information
// gtid: global thread number; validated with __kmp_assert_valid_gtid
// flags: task flags packed into a kmp_int32; reinterpreted in place as
// kmp_tasking_flags_t, with the native bit cleared before use
// sizeof_kmp_task_t: size in bytes of the kmp_task_t structure
// sizeof_shareds: size in bytes of the shareds area
// task_entry: pointer to the task code entry point
// returns: the task thunk produced by __kmp_task_alloc
kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
                                  size_t sizeof_shareds,
                                  kmp_routine_entry_t task_entry) {
  // View the by-value integer argument as the runtime's flag structure.
  kmp_tasking_flags_t *task_flags = (kmp_tasking_flags_t *)&flags;

  __kmp_assert_valid_gtid(gtid);

  // __kmp_task_alloc() sets up all other runtime flags
  task_flags->native = FALSE;

  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, task_flags->tiedness ? "tied  " : "untied",
                task_flags->proxy ? "proxy" : "",
                task_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  kmp_task_t *new_task =
      __kmp_task_alloc(loc_ref, gtid, task_flags, sizeof_kmp_task_t,
                       sizeof_shareds, task_entry);

  KA_TRACE(20,
           ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, new_task));

  return new_task;
}


// __kmpc_omp_target_task_alloc: allocate a task for a target construct.
//
// Adjusts the flags (forces untied, sets the target bit and, when
// __kmp_enable_hidden_helper is set, the hidden_helper bit) and then forwards
// to __kmpc_omp_task_alloc.
//
// loc_ref, gtid, sizeof_kmp_task_t, sizeof_shareds, task_entry: passed
// through unchanged to __kmpc_omp_task_alloc
// flags: task flags packed into a kmp_int32; modified in place through
// input_flags before being forwarded
// device_id: not used in this function body
// returns: the task thunk returned by __kmpc_omp_task_alloc
kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                                         kmp_int32 flags,
                                         size_t sizeof_kmp_task_t,
                                         size_t sizeof_shareds,
                                         kmp_routine_entry_t task_entry,
                                         kmp_int64 device_id) {
  // input_flags aliases the local 'flags' argument, so the writes below are
  // visible in the value forwarded at the end.
  auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
  // target task is untied defined in the specification
  input_flags.tiedness = TASK_UNTIED;
  input_flags.target = 1;

  // The hidden_helper bit is only ever set here, never cleared.
  if (__kmp_enable_hidden_helper)
    input_flags.hidden_helper = TRUE;

  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
                               sizeof_shareds, task_entry);
}


/*!
@ingroup TASKING
@param loc_ref location of the original task directive
@param gtid Global Thread ID of encountering thread
@param new_task task thunk allocated by __kmpc_omp_task_alloc() for the ''new
task''
@param naffins Number of affinity items
@param affin_list List of affinity items
@return Returns non-zero if registering affinity information was not successful.
 Returns 0 if registration was successful
This entry registers the affinity information attached to a task with the task
thunk structure kmp_taskdata_t.

Note: the implementation below is a stub. It does not read any of its
arguments and always returns 0.
*/
kmp_int32
__kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *new_task, kmp_int32 naffins,
                                  kmp_task_affinity_info_t *affin_list) {
  // Stub: nothing is registered; success (0) is reported unconditionally.
  return 0;
}


//  __kmp_invoke_task: invoke the specified task
//
// gtid: global thread ID of caller
// task: the task to invoke
// current_task: the task to resume after task invocation
//
// Outline of the body, in execution order:
//  1. A proxy task that is already flagged complete only runs its bottom-half
//     finish and returns.
//  2. With OMPT enabled, the thread's OMPT info is saved and overwritten for
//     the duration of the task; it is restored in step 6.
//  3. Non-proxy tasks go through __kmp_task_start().
//  4. If cancellation is enabled and either the task's taskgroup or the team
//     has a pending cancel request, the task is marked discarded and its
//     routine is never called.
//  5. Otherwise the task routine is called (with ENABLE_LIBOMPTARGET, a
//     non-NULL async handle is queried instead of re-running the routine).
//  6. Non-proxy tasks go through __kmp_task_finish(); proxy tasks with the
//     target flag only get __ompt_task_finish(..., ompt_task_switch) when OMPT
//     is enabled.
#ifdef __s390x__
__attribute__((target("backchain")))
#endif
static void
__kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
                  kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  // Assigned lazily below, and only on the paths that need it (OMPT state
  // save/restore, cancellation check, ITT barrier timing).
  kmp_info_t *thread;
  int discard = 0 /* false */;
  KA_TRACE(
      30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
           gtid, taskdata, current_task));
  KMP_DEBUG_ASSERT(task);
  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
               taskdata->td_flags.complete == 1)) {
    // This is a proxy task that was already completed but it needs to run
    // its bottom-half finish
    KA_TRACE(
        30,
        ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
         gtid, taskdata));

    __kmp_bottom_half_finish_proxy(gtid, task);

    KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
                  "proxy task %p, resuming task %p\n",
                  gtid, taskdata, current_task));

    return;
  }

#if OMPT_SUPPORT
  // For untied tasks, the first task executed only calls __kmpc_omp_task and
  // does not execute code.
  ompt_thread_info_t oldInfo;
  if (UNLIKELY(ompt_enabled.enabled)) {
    // Store the threads states and restore them after the task
    thread = __kmp_threads[gtid];
    oldInfo = thread->th.ompt_thread_info;
    thread->th.ompt_thread_info.wait_id = 0;
    thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
                                            ? ompt_state_work_serial
                                            : ompt_state_work_parallel;
    taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
  }
#endif

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
    __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
  }

  // TODO: cancel tasks if the parallel region has also been cancelled
  // TODO: check if this sequence can be hoisted above __kmp_task_start
  // if cancellation has been enabled for this run ...
  if (UNLIKELY(__kmp_omp_cancellation)) {
    thread = __kmp_threads[gtid];
    kmp_team_t *this_team = thread->th.th_team;
    kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
    if ((taskgroup && taskgroup->cancel_request) ||
        (this_team->t.t_cancel_request == cancel_parallel)) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
      ompt_data_t *task_data;
      if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
        __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
        // Report which kind of cancellation caused the discard; a taskgroup
        // request takes precedence over a parallel one in the reported flag.
        ompt_callbacks.ompt_callback(ompt_callback_cancel)(
            task_data,
            ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
                                                      : ompt_cancel_parallel) |
                ompt_cancel_discarded_task,
            NULL);
      }
#endif
      KMP_COUNT_BLOCK(TASK_cancelled);
      // this task belongs to a task group and we need to cancel it
      discard = 1 /* true */;
    }
  }

  // Invoke the task routine and pass in relevant data.
  // Thunks generated by gcc take a different argument list.
  if (!discard) {
    if (taskdata->td_flags.tiedness == TASK_UNTIED) {
      // An untied task inherits the last-tied marker of the task it runs on
      // top of.
      taskdata->td_last_tied = current_task->td_last_tied;
      KMP_DEBUG_ASSERT(taskdata->td_last_tied);
    }
#if KMP_STATS_ENABLED
    KMP_COUNT_BLOCK(TASK_executed);
    // Pick the partitioned timer according to what the thread was doing when
    // it picked up this task.
    switch (KMP_GET_THREAD_STATE()) {
    case FORK_JOIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
      break;
    case PLAIN_BARRIER:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
      break;
    case TASKYIELD:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
      break;
    case TASKWAIT:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
      break;
    case TASKGROUP:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
      break;
    default:
      KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
      break;
    }
#endif // KMP_STATS_ENABLED

// OMPT task begin
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled))
      __ompt_task_start(task, current_task, gtid);
#endif
#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
                 taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) {
      ompt_data_t instance = ompt_data_none;
      instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
      ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
          &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
          ompt_dispatch_taskloop_chunk, instance);
      // Clear the chunk so the dispatch callback fires only once for it.
      taskdata->ompt_task_info.dispatch_chunk = {0, 0};
    }
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

#if OMPD_SUPPORT
    if (ompd_state & OMPD_ENABLE_BP)
      ompd_bp_task_begin();
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    kmp_uint64 cur_time;
    // Timing is only considered in frames mode 3 for a non-serialized task
    // launched directly from an implicit task.
    kmp_int32 kmp_itt_count_task =
        __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
        current_task->td_flags.tasktype == TASK_IMPLICIT;
    if (kmp_itt_count_task) {
      thread = __kmp_threads[gtid];
      // Time outer level explicit task on barrier for adjusting imbalance time
      if (thread->th.th_bar_arrive_time)
        cur_time = __itt_get_timestamp();
      else
        kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
    }
    KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
#endif

#if ENABLE_LIBOMPTARGET
    if (taskdata->td_target_data.async_handle != NULL) {
      // If we have a valid target async handle, that means that we have already
      // executed the task routine once. We must query for the handle completion
      // instead of re-executing the routine.
      KMP_ASSERT(tgt_target_nowait_query);
      tgt_target_nowait_query(&taskdata->td_target_data.async_handle);
    } else
#endif
    // When ENABLE_LIBOMPTARGET is set, the `else` above binds to this `if`.
    if (task->routine != NULL) {
#ifdef KMP_GOMP_COMPAT
      if (taskdata->td_flags.native) {
        // Native thunks are called with the shareds pointer only.
        ((void (*)(void *))(*(task->routine)))(task->shareds);
      } else
#endif /* KMP_GOMP_COMPAT */
      {
        (*(task->routine))(gtid, task);
      }
    }
    KMP_POP_PARTITIONED_TIMER();

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (kmp_itt_count_task) {
      // Barrier imbalance - adjust arrive time with the task duration
      thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
    }
    KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
    KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
#endif
  }

#if OMPD_SUPPORT
  if (ompd_state & OMPD_ENABLE_BP)
    ompd_bp_task_end();
#endif

  // Proxy tasks are not handled by the runtime
  if (taskdata->td_flags.proxy != TASK_PROXY) {
#if OMPT_SUPPORT
    if (UNLIKELY(ompt_enabled.enabled)) {
      // `thread` and `oldInfo` were set in the OMPT block near the top of the
      // function, which is guarded by the same ompt_enabled.enabled flag.
      thread->th.ompt_thread_info = oldInfo;
      if (taskdata->td_flags.tiedness == TASK_TIED) {
        taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
      }
      __kmp_task_finish<true>(gtid, task, current_task);
    } else
#endif
      __kmp_task_finish<false>(gtid, task, current_task);
  }
#if OMPT_SUPPORT
  else if (UNLIKELY(ompt_enabled.enabled && taskdata->td_flags.target)) {
    __ompt_task_finish(task, current_task, ompt_task_switch);
  }
#endif

  KA_TRACE(
      30,
      ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
       gtid, taskdata, current_task));
  return;
}


// __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
//
// loc_ref: location of original task pragma (only used for tracing here)
// gtid: Global Thread ID of encountering thread
// new_task: task thunk allocated by __kmpc_omp_task_alloc() for the ''new task''
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) always: the current task is never suspended
//    and queued by this entry point. If the new task cannot be pushed onto a
//    deque it is executed immediately (serialized) on the calling thread.

kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
                                kmp_task_t *new_task) {
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);

  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, new_taskdata));

#if OMPT_SUPPORT
  // Start from NULL (as __kmpc_omp_task does) so the exit path below can never
  // dereference an indeterminate pointer.
  kmp_taskdata_t *parent = NULL;
  if (UNLIKELY(ompt_enabled.enabled)) {
    parent = new_taskdata->td_parent;
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
          &(new_taskdata->ompt_task_info.task_data),
          TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
          OMPT_GET_RETURN_ADDRESS(0));
    }
  }
#endif

  /* Should we execute the new task or queue it? For now, let's just always try
     to queue it.  If the queue fills up, then we'll execute it.  */

  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
  { // Execute this task immediately
    kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
    new_taskdata->td_flags.task_serial = 1;
    __kmp_invoke_task(gtid, new_task, current_task);
  }

  KA_TRACE(
      10,
      ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
       "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
       gtid, loc_ref, new_taskdata));

#if OMPT_SUPPORT
  // Reset the parent's enter frame only if it was captured on entry.
  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
    parent->ompt_task_info.frame.enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
  }
#endif
  return TASK_CURRENT_NOT_QUEUED;
}


// __kmp_omp_task: Schedule a non-thread-switchable task for execution
//
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
// serialize_immediate: if TRUE then if the task is executed immediately its
// execution will be serialized
// Returns:
//    TASK_CURRENT_NOT_QUEUED (0) always: the current task is never suspended
//    and queued here. Proxy tasks and tasks that cannot be pushed are invoked
//    immediately on the calling thread.
//
// With OMPX_TASKGRAPH, a task that belongs to a task graph being recorded is
// first entered into the graph's record_map (growing the map if necessary).

kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
                         bool serialize_immediate) {
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);

#if OMPX_TASKGRAPH
  if (new_taskdata->is_taskgraph &&
      __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
    kmp_tdg_info_t *tdg = new_taskdata->tdg;
    // extend the record_map if needed
    if (new_taskdata->td_tdg_task_id >= new_taskdata->tdg->map_size) {
      __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
      // map_size could have been updated by another thread if recursive
      // taskloop
      if (new_taskdata->td_tdg_task_id >= tdg->map_size) {
        kmp_uint old_size = tdg->map_size;
        // Keep doubling until this task's id is a valid index. A single
        // doubling is not enough when the id is already >= 2 * old_size, and
        // the slot is written right after the lock is released.
        kmp_uint new_size = old_size ? old_size * 2 : 1;
        while (new_size <= (kmp_uint)new_taskdata->td_tdg_task_id)
          new_size *= 2;
        kmp_node_info_t *old_record = tdg->record_map;
        kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
            new_size * sizeof(kmp_node_info_t));

        KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
        tdg->record_map = new_record;

        __kmp_free(old_record);

        // Initialize every newly added slot as an empty node.
        for (kmp_uint i = old_size; i < new_size; i++) {
          kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
              __kmp_successors_size * sizeof(kmp_int32));
          new_record[i].task = nullptr;
          new_record[i].successors = successorsList;
          new_record[i].nsuccessors = 0;
          new_record[i].npredecessors = 0;
          new_record[i].successors_size = __kmp_successors_size;
          KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
        }
        // update the size at the end, so that we avoid other
        // threads use old_record while map_size is already updated
        tdg->map_size = new_size;
      }
      __kmp_release_bootstrap_lock(&tdg->graph_lock);
    }
    // record a task
    if (tdg->record_map[new_taskdata->td_tdg_task_id].task == nullptr) {
      tdg->record_map[new_taskdata->td_tdg_task_id].task = new_task;
      tdg->record_map[new_taskdata->td_tdg_task_id].parent_task =
          new_taskdata->td_parent;
      KMP_ATOMIC_INC(&tdg->num_tasks);
    }
  }
#endif

  /* Should we execute the new task or queue it? For now, let's just always try
     to queue it.  If the queue fills up, then we'll execute it.  */
  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
      __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
  { // Execute this task immediately
    kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
    if (serialize_immediate)
      new_taskdata->td_flags.task_serial = 1;
    __kmp_invoke_task(gtid, new_task, current_task);
  } else if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
             __kmp_wpolicy_passive) {
    // The task was queued: with a finite blocktime and passive wait policy,
    // wake one sleeping teammate so that it can pick the task up.
    kmp_info_t *this_thr = __kmp_threads[gtid];
    kmp_team_t *team = this_thr->th.th_team;
    kmp_int32 nthreads = this_thr->th.th_team_nproc;
    for (int i = 0; i < nthreads; ++i) {
      kmp_info_t *thread = team->t.t_threads[i];
      if (thread == this_thr)
        continue;
      if (thread->th.th_sleep_loc != NULL) {
        __kmp_null_resume_wrapper(thread);
        break; // awake one thread at a time
      }
    }
  }
  return TASK_CURRENT_NOT_QUEUED;
}


// __kmpc_omp_task: compiler-facing wrapper around __kmp_omp_task that
// schedules a non-thread-switchable task from the parent thread only.
//
// loc_ref: location of original task pragma (only used for tracing here)
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
// Returns: whatever __kmp_omp_task() returns for the task.
//
// When OMPT is enabled, a task that has not started yet triggers the
// task_create callback; a task that has already started is the continuation
// of an untied task and is reported as a task switch instead.

kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
                          kmp_task_t *new_task) {
  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);

#if KMP_DEBUG || OMPT_SUPPORT
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
#endif
  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
                new_taskdata));
  __kmp_assert_valid_gtid(gtid);

#if OMPT_SUPPORT
  // Stays NULL unless a not-yet-started task is being created with OMPT on.
  kmp_taskdata_t *creator = NULL;
  if (UNLIKELY(ompt_enabled.enabled)) {
    if (new_taskdata->td_flags.started) {
      // We are scheduling the continuation of an UNTIED task.
      // Scheduling back to the parent task.
      __ompt_task_finish(new_task,
                         new_taskdata->ompt_task_info.scheduling_parent,
                         ompt_task_switch);
      new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
    } else {
      OMPT_STORE_RETURN_ADDRESS(gtid);
      creator = new_taskdata->td_parent;
      // Record the enter frame only if nobody has set it already.
      if (!creator->ompt_task_info.frame.enter_frame.ptr) {
        creator->ompt_task_info.frame.enter_frame.ptr =
            OMPT_GET_FRAME_ADDRESS(0);
      }
      if (ompt_enabled.ompt_callback_task_create) {
        ompt_callbacks.ompt_callback(ompt_callback_task_create)(
            &(creator->ompt_task_info.task_data),
            &(creator->ompt_task_info.frame),
            &(new_taskdata->ompt_task_info.task_data),
            TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
            OMPT_LOAD_RETURN_ADDRESS(gtid));
      }
    }
  }
#endif

  const kmp_int32 status = __kmp_omp_task(gtid, new_task, true);

  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
                gtid, loc_ref, new_taskdata));
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled && creator != NULL)) {
    creator->ompt_task_info.frame.enter_frame = ompt_data_none;
  }
#endif
  return status;
}


// __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
// a taskloop task with the correct OMPT return address
//
// loc_ref: location of original task pragma (only used for tracing here)
// gtid: Global Thread ID of encountering thread
// new_task: non-thread-switchable task thunk allocated by
// __kmpc_omp_task_alloc()
// codeptr_ra: return address for OMPT callback
// Returns: whatever __kmp_omp_task() returns for the task.

kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *new_task, void *codeptr_ra) {
  kmp_int32 res;
  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);

#if KMP_DEBUG || OMPT_SUPPORT
  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
#endif
  // Trace under this function's own name so debug logs can tell taskloop
  // tasks apart from tasks scheduled through __kmpc_omp_task.
  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, new_taskdata));

#if OMPT_SUPPORT
  // Stays NULL unless OMPT is enabled and the task has not started yet.
  kmp_taskdata_t *parent = NULL;
  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
    parent = new_taskdata->td_parent;
    // Record the enter frame only if nobody has set it already.
    if (!parent->ompt_task_info.frame.enter_frame.ptr)
      parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_task_create) {
      // Unlike __kmpc_omp_task, the code pointer comes from the caller rather
      // than from this function's own return address.
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
          &(new_taskdata->ompt_task_info.task_data),
          TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, codeptr_ra);
    }
  }
#endif

  res = __kmp_omp_task(gtid, new_task, true);

  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
                "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
                gtid, loc_ref, new_taskdata));
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
    parent->ompt_task_info.frame.enter_frame = ompt_data_none;
  }
#endif
  return res;
}


// __kmpc_omp_taskwait_template: common implementation of taskwait.
//
// ompt: compile-time switch; when true (and OMPT_SUPPORT && OMPT_OPTIONAL is
//       set) the sync_region and sync_region_wait callbacks are raised around
//       the wait and the task's OMPT enter frame is set and cleared.
// loc_ref: source location, stored in the current task's td_taskwait_ident
// gtid: Global Thread ID of the encountering thread
// frame_address: stored as the current task's OMPT enter frame (ompt only)
// return_address: code pointer handed to the OMPT callbacks (ompt only)
// Returns: TASK_CURRENT_NOT_QUEUED always.
//
// Nothing is done besides tracing when the tasking mode is
// tskm_immediate_exec.
template <bool ompt>
static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
                                              void *frame_address,
                                              void *return_address) {
  // Stays nullptr (and is traced as such on exit) in immediate-exec mode.
  kmp_taskdata_t *taskdata = nullptr;
  kmp_info_t *thread;
  int thread_finished = FALSE;
  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);

  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
  KMP_DEBUG_ASSERT(gtid >= 0);

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    thread = __kmp_threads[gtid];
    taskdata = thread->th.th_current_task;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    // Only assigned and read inside the `if (ompt)` sections.
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;

    if (ompt) {
      my_task_data = &(taskdata->ompt_task_info.task_data);
      my_parallel_data = OMPT_CUR_TEAM_DATA(thread);

      taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;

      // Begin events: sync_region first, then sync_region_wait. The end
      // events below are raised in the opposite order.
      if (ompt_enabled.ompt_callback_sync_region) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
            my_task_data, return_address);
      }

      if (ompt_enabled.ompt_callback_sync_region_wait) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
            ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
            my_task_data, return_address);
      }
    }
#endif // OMPT_SUPPORT && OMPT_OPTIONAL

#if ENABLE_LIBOMPTARGET
    // Give an opportunity to the offload runtime to make progress and create
    // any necessary proxy tasks
    if (UNLIKELY(kmp_target_sync_cb))
      (*kmp_target_sync_cb)(loc_ref, gtid, KMP_TASKDATA_TO_TASK(taskdata),
                            NULL);
#endif // ENABLE_LIBOMPTARGET

// Debugger: The taskwait is active. Store location and thread encountered the
// taskwait.
#if USE_ITT_BUILD
// Note: These values are used by ITT events as well.
#endif /* USE_ITT_BUILD */
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc_ref;
    // Stored as gtid + 1 so the value is non-zero; it is negated below once
    // the wait has completed.
    taskdata->td_taskwait_thread = gtid + 1;

#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
#endif /* USE_ITT_NOTIFY */
#endif /* USE_ITT_BUILD */

    // Waiting is needed unless the task is team-serial or final ...
    bool must_wait =
        !taskdata->td_flags.team_serial && !taskdata->td_flags.final;

    // ... but proxy tasks seen by the task team force a wait regardless.
    must_wait = must_wait || (thread->th.th_task_team != NULL &&
                              thread->th.th_task_team->tt.tt_found_proxy_tasks);
    // If hidden helper thread is encountered, we must enable wait here.
    must_wait =
        must_wait ||
        (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
         thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);

    if (must_wait) {
      // The flag watches the count of incomplete child tasks with 0 as the
      // value being waited for.
      kmp_flag_32<false, false> flag(
          RCAST(std::atomic<kmp_uint32> *,
                &(taskdata->td_incomplete_child_tasks)),
          0U);
      // Keep executing tasks until every child task has completed.
      while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
        flag.execute_tasks(thread, gtid, FALSE,
                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                           __kmp_task_stealing_constraint);
      }
    }
#if USE_ITT_BUILD
    KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
    KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
#endif /* USE_ITT_BUILD */

    // Debugger:  The taskwait is completed. Location remains, but thread is
    // negated.
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (ompt) {
      if (ompt_enabled.ompt_callback_sync_region_wait) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
            my_task_data, return_address);
      }
      if (ompt_enabled.ompt_callback_sync_region) {
        ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
            ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
            my_task_data, return_address);
      }
      // Undo the enter-frame assignment made before the wait.
      taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
    }
#endif // OMPT_SUPPORT && OMPT_OPTIONAL
  }

  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
                "returning TASK_CURRENT_NOT_QUEUED\n",
                gtid, taskdata));

  return TASK_CURRENT_NOT_QUEUED;
}


#if OMPT_SUPPORT && OMPT_OPTIONAL
// OMPT-instrumented taskwait: forwards to the <true> instantiation of the
// taskwait template with the frame and return address captured by the caller.
OMPT_NOINLINE
static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                          void *frame_address,
                                          void *return_address) {
  const kmp_int32 status = __kmpc_omp_taskwait_template<true>(
      loc_ref, gtid, frame_address, return_address);
  return status;
}
#endif // OMPT_SUPPORT && OMPT_OPTIONAL


// __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
// complete
//
// loc_ref: source location of the taskwait
// gtid: Global Thread ID of the encountering thread
// Returns: the result of the taskwait template (TASK_CURRENT_NOT_QUEUED).

kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (UNLIKELY(ompt_enabled.enabled)) {
    // A tool is attached: capture this entry point's frame and return address
    // and take the instrumented path.
    OMPT_STORE_RETURN_ADDRESS(gtid);
    void *entry_frame = OMPT_GET_FRAME_ADDRESS(0);
    void *entry_ra = OMPT_LOAD_RETURN_ADDRESS(gtid);
    return __kmpc_omp_taskwait_ompt(loc_ref, gtid, entry_frame, entry_ra);
  }
#endif
  // No tool: the uninstrumented instantiation needs neither address.
  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
}


// __kmpc_omp_taskyield: switch to a different task
//
// loc_ref: source location, stored in the current task's td_taskwait_ident
// gtid: Global Thread ID of the encountering thread
// end_part: only used for tracing here
// Returns: TASK_CURRENT_NOT_QUEUED always.

kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
  kmp_taskdata_t *cur_task = NULL;
  int finished = FALSE;

  KMP_COUNT_BLOCK(OMP_TASKYIELD);
  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);

  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
                gtid, loc_ref, end_part));
  __kmp_assert_valid_gtid(gtid);

  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
    kmp_info_t *this_thr = __kmp_threads[gtid];
    cur_task = this_thr->th.th_current_task;

    // Should we model this as a task wait or not?
    // Debugger: The taskwait is active. Store location and thread encountered
    // the taskwait. (These values are used by ITT events as well.)
    cur_task->td_taskwait_counter += 1;
    cur_task->td_taskwait_ident = loc_ref;
    cur_task->td_taskwait_thread = gtid + 1;

#if USE_ITT_BUILD
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
#endif /* USE_ITT_NOTIFY */
#endif /* USE_ITT_BUILD */

    if (!cur_task->td_flags.team_serial) {
      kmp_task_team_t *tteam = this_thr->th.th_task_team;
      // Other tasks can only be run if a task team exists and has tasking
      // enabled.
      if (tteam != NULL && KMP_TASKING_ENABLED(tteam)) {
#if OMPT_SUPPORT
        if (UNLIKELY(ompt_enabled.enabled))
          this_thr->th.ompt_thread_info.ompt_task_yielded = 1;
#endif
        __kmp_execute_tasks_32(this_thr, gtid, (kmp_flag_32<> *)NULL, FALSE,
                               &finished USE_ITT_BUILD_ARG(itt_sync_obj),
                               __kmp_task_stealing_constraint);
#if OMPT_SUPPORT
        if (UNLIKELY(ompt_enabled.enabled))
          this_thr->th.ompt_thread_info.ompt_task_yielded = 0;
#endif
      }
    }

#if USE_ITT_BUILD
    KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
#endif /* USE_ITT_BUILD */

    // Debugger:  The taskwait is completed. Location remains, but thread is
    // negated.
    cur_task->td_taskwait_thread = -cur_task->td_taskwait_thread;
  }

  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
                "returning TASK_CURRENT_NOT_QUEUED\n",
                gtid, cur_task));

  return TASK_CURRENT_NOT_QUEUED;
}


// Task Reduction implementation

//

// Note: initial implementation didn't take into account the possibility

// to specify omp_orig for initializer of the UDR (user defined reduction).

// Corrected implementation takes into account the omp_orig object.

// Compiler is free to use old implementation if omp_orig is not specified.


/*!

@ingroup BASIC_TYPES

@{

*/


/*!
Flags for special info per task reduction item.

The two bit-fields together occupy 32 bits.
*/

typedef struct kmp_taskred_flags {
  /*! 1 - use lazy alloc/init (e.g. big objects, num tasks < num threads) */
  unsigned lazy_priv : 1;
  /*! remaining 31 bits, reserved */
  unsigned reserved31 : 31;
} kmp_taskred_flags_t;


/*!
Internal struct for reduction data item related info set up by compiler.

Old interface: there is no reduce_orig field, and the initializer routine is
called with a single argument (the private item to initialize).
*/

typedef struct kmp_task_red_input {
  void *reduce_shar; /**< shared between tasks item to reduce into */
  size_t reduce_size; /**< size of data item in bytes */
  // three compiler-generated routines (init, fini are optional):
  void *reduce_init; /**< data initialization routine (single parameter) */
  void *reduce_fini; /**< data finalization routine */
  void *reduce_comb; /**< data combiner routine */
  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
} kmp_task_red_input_t;


/*!
Internal struct for reduction data item related info saved by the library.

One element is filled in per reduction item by __kmp_task_reduction_init()
from either compiler input struct.
*/

typedef struct kmp_taskred_data {
  void *reduce_shar; /**< shared between tasks item to reduce into */
  size_t reduce_size; /**< size of data item (stored rounded up to a multiple
                           of CACHE_LINE by __kmp_task_reduction_init) */
  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
  void *reduce_priv; /**< array of thread specific items (with lazy_priv: an
                          array of per-thread pointers, initially zeroed) */
  void *reduce_pend; /**< end of private data for faster comparison op (only
                          assigned when lazy_priv is not set) */
  // three compiler-generated routines (init, fini are optional):
  void *reduce_comb; /**< data combiner routine */
  void *reduce_init; /**< data initialization routine (two parameters) */
  void *reduce_fini; /**< data finalization routine */
  void *reduce_orig; /**< original item (can be used in UDR initializer) */
} kmp_taskred_data_t;


/*!
Internal struct for reduction data item related info set up by compiler.

New interface: added reduce_orig field to provide omp_orig for UDR initializer.
If reduce_orig is NULL, the library falls back to reduce_shar as the original
item (see __kmp_assign_orig).
*/

typedef struct kmp_taskred_input {
  void *reduce_shar; /**< shared between tasks item to reduce into */
  void *reduce_orig; /**< original reduction item used for initialization */
  size_t reduce_size; /**< size of data item */
  // three compiler-generated routines (init, fini are optional):
  void *reduce_init; /**< data initialization routine (two parameters) */
  void *reduce_fini; /**< data finalization routine */
  void *reduce_comb; /**< data combiner routine */
  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
} kmp_taskred_input_t;


/*!

@}

*/


// Sets item.reduce_orig according to which compiler input struct (T) is used.
template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);

// Old interface: the input carries no original item, so none is recorded.
template <>
void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
                                             kmp_task_red_input_t &src) {
  item.reduce_orig = nullptr;
}

// New interface: a non-NULL reduce_orig is taken as is; otherwise the shared
// item doubles as the original.
template <>
void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
                                            kmp_taskred_input_t &src) {
  item.reduce_orig = src.reduce_orig ? src.reduce_orig : src.reduce_shar;
}


// Runs the compiler-generated initializer on the private copy located
// `offset` bytes into item.reduce_priv; T selects the initializer signature.
template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);

// Old interface: the initializer receives only the private item.
template <>
void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
                                           size_t offset) {
  typedef void (*init_routine_t)(void *);
  init_routine_t init = (init_routine_t)item.reduce_init;
  char *priv_item = (char *)(item.reduce_priv) + offset;
  init(priv_item);
}

// New interface: the initializer also receives the original item.
template <>
void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
                                          size_t offset) {
  typedef void (*init_routine_t)(void *, void *);
  init_routine_t init = (init_routine_t)item.reduce_init;
  char *priv_item = (char *)(item.reduce_priv) + offset;
  init(priv_item, item.reduce_orig);
}


template <typename T>


void *__kmp_task_reduction_init(int gtid, int num, T *data) {

  __kmp_assert_valid_gtid(gtid);

  kmp_info_t *thread = __kmp_threads[gtid];

  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;

  kmp_uint32 nth = thread->th.th_team_nproc;

  kmp_taskred_data_t *arr;


  // check input data just in case

  KMP_ASSERT(tg != NULL);

  KMP_ASSERT(data != NULL);

  KMP_ASSERT(num > 0);

  if (nth == 1 && !__kmp_enable_hidden_helper) {

    KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",

                  gtid, tg));

    return (void *)tg;

  }

  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",

                gtid, tg, num));

  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(

      thread, num * sizeof(kmp_taskred_data_t));

  for (int i = 0; i < num; ++i) {

    size_t size = data[i].reduce_size - 1;

    // round the size up to cache line per thread-specific item

    size += CACHE_LINE - size % CACHE_LINE;

    KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory

    arr[i].reduce_shar = data[i].reduce_shar;

    arr[i].reduce_size = size;

    arr[i].flags = data[i].flags;

    arr[i].reduce_comb = data[i].reduce_comb;

    arr[i].reduce_init = data[i].reduce_init;

    arr[i].reduce_fini = data[i].reduce_fini;

    __kmp_assign_orig<T>(arr[i], data[i]);

    if (!arr[i].flags.lazy_priv) {

      // allocate cache-line aligned block and fill it with zeros

      arr[i].reduce_priv = __kmp_allocate(nth * size);

      arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;

      if (arr[i].reduce_init != NULL) {

        // initialize all thread-specific items

        for (size_t j = 0; j < nth; ++j) {

          __kmp_call_init<T>(arr[i], j * size);

        }

      }

    } else {

      // only allocate space for pointers now,

      // objects will be lazily allocated/initialized if/when requested

      // note that __kmp_allocate zeroes the allocated memory

      arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));

    }

  }

  tg->reduce_data = (void *)arr;

  tg->reduce_num_data = num;

  return (void *)tg;

}


/*!

@ingroup TASKING

@param gtid      Global thread ID

@param num       Number of data items to reduce

@param data      Array of data for reduction

@return The taskgroup identifier


Initialize task reduction for the taskgroup.


Note: this entry supposes the optional compiler-generated initializer routine

has single parameter - pointer to object to be initialized. That means

the reduction either does not use omp_orig object, or the omp_orig is accessible

without help of the runtime library.

*/


void *__kmpc_task_reduction_init(int gtid, int num, void *data) {

#if OMPX_TASKGRAPH

  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);

  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {

    kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];

    this_tdg->rec_taskred_data =

        __kmp_allocate(sizeof(kmp_task_red_input_t) * num);

    this_tdg->rec_num_taskred = num;

    KMP_MEMCPY(this_tdg->rec_taskred_data, data,

               sizeof(kmp_task_red_input_t) * num);

  }

#endif

  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);

}


/*!
@ingroup TASKING
@param gtid      Global thread ID
@param num       Number of data items to reduce
@param data      Array of data for reduction
@return The taskgroup identifier

Initialize task reduction for the taskgroup.

Note: this entry supposes the optional compiler-generated initializer routine
has two parameters, pointer to object to be initialized and pointer to omp_orig
*/
void *__kmpc_taskred_init(int gtid, int num, void *data) {
#if OMPX_TASKGRAPH
  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
    kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
    // 'data' is an array of kmp_taskred_input_t for this entry point (see the
    // cast below), so the recorded copy must be sized with that element type.
    // Sizing it with kmp_task_red_input_t would copy with the wrong element
    // size and not capture the whole array.
    const size_t rec_bytes = sizeof(kmp_taskred_input_t) * num;
    this_tdg->rec_taskred_data = __kmp_allocate(rec_bytes);
    this_tdg->rec_num_taskred = num;
    KMP_MEMCPY(this_tdg->rec_taskred_data, data, rec_bytes);
  }
#endif
  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
}


// Give the taskgroup its own copy of already initialized task reduction
// descriptors. Everything is duplicated from reduce_data except the shared
// item pointers, which are taken from this thread's own input array.
template <typename T>
void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
                                    kmp_taskgroup_t *tg, void *reduce_data) {
  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
                " from data %p\n",
                thr, tg, reduce_data));
  const size_t bytes = num * sizeof(kmp_taskred_data_t);
  kmp_taskred_data_t *copy =
      (kmp_taskred_data_t *)__kmp_thread_malloc(thr, bytes);
  // private copies, thunk routines, sizes, flags, etc. are shared by threads
  KMP_MEMCPY(copy, reduce_data, bytes);
  // the shared pointers are unique per thread, so overwrite them
  for (int idx = 0; idx < num; ++idx)
    copy[idx].reduce_shar = data[idx].reduce_shar;
  tg->reduce_num_data = num;
  tg->reduce_data = (void *)copy;
}


/*!

@ingroup TASKING

@param gtid    Global thread ID

@param tskgrp  The taskgroup ID (optional)

@param data    Shared location of the item

@return The pointer to per-thread data


Get thread-specific location of data item

*/


void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {

  __kmp_assert_valid_gtid(gtid);

  kmp_info_t *thread = __kmp_threads[gtid];

  kmp_int32 nth = thread->th.th_team_nproc;

  if (nth == 1)

    return data; // nothing to do


  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;

  if (tg == NULL)

    tg = thread->th.th_current_task->td_taskgroup;

  KMP_ASSERT(tg != NULL);

  kmp_taskred_data_t *arr;

  kmp_int32 num;

  kmp_int32 tid = thread->th.th_info.ds.ds_tid;


#if OMPX_TASKGRAPH

  if ((thread->th.th_current_task->is_taskgraph) &&

      (!__kmp_tdg_is_recording(

          __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {

    tg = thread->th.th_current_task->td_taskgroup;

    KMP_ASSERT(tg != NULL);

    KMP_ASSERT(tg->reduce_data != NULL);

    arr = (kmp_taskred_data_t *)(tg->reduce_data);

    num = tg->reduce_num_data;

  }

#endif


  KMP_ASSERT(data != NULL);

  while (tg != NULL) {

    arr = (kmp_taskred_data_t *)(tg->reduce_data);

    num = tg->reduce_num_data;

    for (int i = 0; i < num; ++i) {

      if (!arr[i].flags.lazy_priv) {

        if (data == arr[i].reduce_shar ||

            (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))

          return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;

      } else {

        // check shared location first

        void **p_priv = (void **)(arr[i].reduce_priv);

        if (data == arr[i].reduce_shar)

          goto found;

        // check if we get some thread specific location as parameter

        for (int j = 0; j < nth; ++j)

          if (data == p_priv[j])

            goto found;

        continue; // not found, continue search

      found:

        if (p_priv[tid] == NULL) {

          // allocate thread specific object lazily

          p_priv[tid] = __kmp_allocate(arr[i].reduce_size);

          if (arr[i].reduce_init != NULL) {

            if (arr[i].reduce_orig != NULL) { // new interface

              ((void (*)(void *, void *))arr[i].reduce_init)(

                  p_priv[tid], arr[i].reduce_orig);

            } else { // old interface (single parameter)

              ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);

            }

          }

        }

        return p_priv[tid];

      }

    }

    KMP_ASSERT(tg->parent);

    tg = tg->parent;

  }

  KMP_ASSERT2(0, "Unknown task reduction item");

  return NULL; // ERROR, this line never executed

}


// Finalize task reduction: fold every thread's private copy into the shared
// item, run the optional finalizer on each copy, then release the private
// storage and the taskgroup's descriptor array.
// Called from __kmpc_end_taskgroup()
static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
  typedef void (*fini_fn_t)(void *);
  typedef void (*comb_fn_t)(void *, void *);

  kmp_int32 nth = th->th.th_team_nproc;
  // should not be called if nth == 1 unless we are using hidden helper threads
  KMP_DEBUG_ASSERT(nth > 1 || __kmp_enable_hidden_helper);

  kmp_taskred_data_t *items = (kmp_taskred_data_t *)tg->reduce_data;
  kmp_int32 count = tg->reduce_num_data;
  for (int i = 0; i < count; ++i) {
    kmp_taskred_data_t &item = items[i];
    void *shared = item.reduce_shar;
    fini_fn_t finalize = (fini_fn_t)(item.reduce_fini);
    comb_fn_t combine = (comb_fn_t)(item.reduce_comb);
    if (item.flags.lazy_priv) {
      // array of per-thread pointers; NULL means the thread never asked for
      // its copy
      void **slots = (void **)(item.reduce_priv);
      for (int j = 0; j < nth; ++j) {
        void *priv = slots[j];
        if (priv == NULL)
          continue;
        combine(shared, priv); // combine results
        if (finalize)
          finalize(priv); // finalize if needed
        __kmp_free(priv);
      }
    } else {
      // one contiguous block holding a copy for each thread
      char *base = (char *)item.reduce_priv;
      size_t stride = item.reduce_size;
      for (int j = 0; j < nth; ++j) {
        void *priv = base + j * stride;
        combine(shared, priv); // combine results
        if (finalize)
          finalize(priv); // finalize if needed
      }
    }
    __kmp_free(item.reduce_priv);
  }
  __kmp_thread_free(th, items);
  tg->reduce_data = NULL;
  tg->reduce_num_data = 0;
}


// Release only this taskgroup's descriptor array of a parallel/worksharing
// task reduction. The task private data is left alone because other threads
// may still be working with it.
// Called from __kmpc_end_taskgroup()
static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
  void *own_copy = tg->reduce_data;
  tg->reduce_num_data = 0;
  tg->reduce_data = NULL;
  __kmp_thread_free(th, own_copy);
}


// Common implementation of task reduction initialization for a parallel
// (is_ws == 0) or worksharing (is_ws == 1) construct.
// A new taskgroup is always started for the calling thread. When the team has
// more than one thread, exactly one thread initializes the reduction data and
// publishes a copy of the descriptors in team->t.t_tg_reduce_data[is_ws];
// every other thread waits for that copy and attaches its own duplicate of it
// to its taskgroup.
// Returns the taskgroup of the calling thread.
template <typename T>


void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,

                                         int num, T *data) {

  __kmp_assert_valid_gtid(gtid);

  kmp_info_t *thr = __kmp_threads[gtid];

  kmp_int32 nth = thr->th.th_team_nproc;

  __kmpc_taskgroup(loc, gtid); // form new taskgroup first

  if (nth == 1) {

    // single-thread team: no reduction data is set up, only the taskgroup
    // created above is returned

    KA_TRACE(10,

             ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",

              gtid, thr->th.th_current_task->td_taskgroup));

    return (void *)thr->th.th_current_task->td_taskgroup;

  }

  kmp_team_t *team = thr->th.th_team;

  void *reduce_data;

  kmp_taskgroup_t *tg;

  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);

  // The slot goes NULL -> (void *)1 -> pointer to the descriptors; the value
  // (void *)1 marks "initialization in progress". Only the thread whose
  // compare-and-store replaces NULL with the marker does the initialization.

  if (reduce_data == NULL &&

      __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,

                                 (void *)1)) {

    // single thread enters this block to initialize common reduction data

    KMP_DEBUG_ASSERT(reduce_data == NULL);

    // first initialize own data, then make a copy other threads can use

    tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);

    reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));

    KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));

    // fini counters should be 0 at this point

    KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);

    KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);

    // publish the copy; the release store pairs with the acquire loads in the
    // waiting loop of the other threads below

    KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);

  } else {

    while (

        (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==

        (void *)1) { // wait for task reduction initialization

      KMP_CPU_PAUSE();

    }

    KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here

    // attach a duplicate of the published descriptors (with this thread's own
    // shared pointers) to the taskgroup formed at the top of this routine

    tg = thr->th.th_current_task->td_taskgroup;

    __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);

  }

  return tg;

}


/*!
@ingroup TASKING
@param loc       Source location info
@param gtid      Global thread ID
@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
@param num       Number of data items to reduce
@param data      Array of data for reduction
@return The taskgroup identifier

Set up task reduction for a parallel or worksharing construct.

Note: with this entry point the optional compiler-generated initializer
routine takes a single argument, the pointer to the object to initialize.
Consequently the reduction either does not use the omp_orig object, or the
omp_orig is reachable without help of the runtime library.
*/
void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
                                          int num, void *data) {
  // the input array uses the descriptor layout without the omp_orig pointer
  kmp_task_red_input_t *input = (kmp_task_red_input_t *)data;
  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, input);
}


/*!
@ingroup TASKING
@param loc       Source location info
@param gtid      Global thread ID
@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise
@param num       Number of data items to reduce
@param data      Array of data for reduction
@return The taskgroup identifier

Set up task reduction for a parallel or worksharing construct.

Note: with this entry point the optional compiler-generated initializer
routine takes two arguments: the pointer to the object to initialize and the
pointer to omp_orig.
*/
void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
                                   void *data) {
  // the input array is treated as kmp_taskred_input_t descriptors
  kmp_taskred_input_t *input = (kmp_taskred_input_t *)data;
  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num, input);
}


/*!
@ingroup TASKING
@param loc       Source location info
@param gtid      Global thread ID
@param is_ws     Is 1 if the reduction is for worksharing, 0 otherwise

Finish task reduction for a parallel or worksharing construct by ending the
current taskgroup.
*/
void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
  (void)is_ws; // not used by this routine
  __kmpc_end_taskgroup(loc, gtid);
}


// __kmpc_taskgroup: Start a new taskgroup
//
// Allocates a taskgroup descriptor, links it to the current task's present
// taskgroup as its parent and makes it the current taskgroup of the task.
void __kmpc_taskgroup(ident_t *loc, int gtid) {
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thr = __kmp_threads[gtid];
  kmp_taskdata_t *cur_task = thr->th.th_current_task;
  kmp_taskgroup_t *group =
      (kmp_taskgroup_t *)__kmp_thread_malloc(thr, sizeof(kmp_taskgroup_t));
  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, group));

  // fresh group: no pending tasks, no cancellation, no reduction data
  KMP_ATOMIC_ST_RLX(&group->count, 0);
  KMP_ATOMIC_ST_RLX(&group->cancel_request, cancel_noreq);
  group->reduce_data = NULL;
  group->reduce_num_data = 0;
  group->gomp_data = NULL;

  // push the group on the task's taskgroup stack
  group->parent = cur_task->td_taskgroup;
  cur_task->td_taskgroup = group;

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
    void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (!codeptr)
      codeptr = OMPT_GET_RETURN_ADDRESS(0);
    kmp_team_t *team = thr->th.th_team;
    ompt_data_t my_task_data = cur_task->ompt_task_info.task_data;
    // FIXME: I think this is wrong for lwt!
    ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;

    ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
        ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
        &(my_task_data), codeptr);
  }
#endif
}


// __kmpc_end_taskgroup: Wait until all tasks generated by the current task
//                       and its descendants are complete
//
// After the wait, task reduction data attached to the taskgroup (if any, and
// if it is not GOMP-managed) is finalized or cleaned up, the parent taskgroup
// is restored as the current one and the taskgroup descriptor is freed.
void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = thread->th.th_current_task;
  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
  int thread_finished = FALSE;

#if OMPT_SUPPORT && OMPT_OPTIONAL
  kmp_team_t *team;
  ompt_data_t my_task_data;
  ompt_data_t my_parallel_data;
  void *codeptr = nullptr;
  if (UNLIKELY(ompt_enabled.enabled)) {
    team = thread->th.th_team;
    my_task_data = taskdata->ompt_task_info.task_data;
    // FIXME: I think this is wrong for lwt!
    my_parallel_data = team->t.ompt_team_info.parallel_data;
    codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
    if (!codeptr)
      codeptr = OMPT_GET_RETURN_ADDRESS(0);
  }
#endif

  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
  KMP_DEBUG_ASSERT(taskgroup != NULL);
  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    // mark task as waiting not on a barrier
    taskdata->td_taskwait_counter += 1;
    taskdata->td_taskwait_ident = loc;
    taskdata->td_taskwait_thread = gtid + 1;
#if USE_ITT_BUILD
    // For ITT the taskgroup wait is similar to taskwait until we need to
    // distinguish them
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
#endif /* USE_ITT_NOTIFY */
#endif /* USE_ITT_BUILD */

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
          &(my_task_data), codeptr);
    }
#endif

#if ENABLE_LIBOMPTARGET
    // Give an opportunity to the offload runtime to make progress and create
    // any necessary proxy tasks
    if (UNLIKELY(kmp_target_sync_cb))
      (*kmp_target_sync_cb)(loc, gtid, KMP_TASKDATA_TO_TASK(taskdata), NULL);
#endif // ENABLE_LIBOMPTARGET

    if (!taskdata->td_flags.team_serial ||
        (thread->th.th_task_team != NULL &&
         (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
          thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
      // execute tasks until the taskgroup's counter drops to zero
      kmp_flag_32<false, false> flag(
          RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
      while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
        flag.execute_tasks(thread, gtid, FALSE,
                           &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
                           __kmp_task_stealing_constraint);
      }
    }
    taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting

#if OMPT_SUPPORT && OMPT_OPTIONAL
    if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
          &(my_task_data), codeptr);
    }
#endif

#if USE_ITT_BUILD
    KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
    KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
#endif /* USE_ITT_BUILD */
  }
  KMP_DEBUG_ASSERT(taskgroup->count == 0);

  if (taskgroup->reduce_data != NULL &&
      !taskgroup->gomp_data) { // need to reduce?
    kmp_team_t *t = thread->th.th_team;
    kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
    // check if <priv> data of the first reduction variable shared for the team
    void *priv0 = arr[0].reduce_priv;
    bool team_reduction = false;
    // slot 0 holds the data of a reduction on parallel, slot 1 the data of a
    // reduction on worksharing; they are checked in that order
    for (int ws = 0; ws < 2; ++ws) {
      void *reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[ws]);
      // (void *)1 is the "initialization in progress" marker stored by
      // __kmp_task_reduction_modifier_init(); it is not a dereferenceable
      // pointer and cannot describe this taskgroup's reduction, whose
      // descriptors are already attached to the taskgroup.
      if (reduce_data == NULL || reduce_data == (void *)1 ||
          ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv != priv0)
        continue;
      team_reduction = true;
      int cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[ws]);
      if (cnt == thread->th.th_team_nproc - 1) {
        // we are the last thread passing __kmpc_reduction_modifier_fini()
        // finalize task reduction:
        __kmp_task_reduction_fini(thread, taskgroup);
        // cleanup fields in the team structure:
        // TODO: is relaxed store enough here (whole barrier should follow)?
        __kmp_thread_free(thread, reduce_data);
        KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[ws], NULL);
        KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[ws], 0);
      } else {
        // we are not the last thread passing __kmpc_reduction_modifier_fini(),
        // so do not finalize reduction, just clean own copy of the data
        __kmp_task_reduction_clean(thread, taskgroup);
      }
      break;
    }
    if (!team_reduction) {
      // finishing task reduction on taskgroup
      __kmp_task_reduction_fini(thread, taskgroup);
    }
  }
  // Restore parent taskgroup for the current task
  taskdata->td_taskgroup = taskgroup->parent;
  __kmp_thread_free(thread, taskgroup);

  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
                gtid, taskdata));

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
    ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
        ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
        &(my_task_data), codeptr);
  }
#endif
}


// __kmp_get_priority_task: take one task out of the task team's priority
// deques.
// A task is first "reserved" by decrementing tt_num_task_pri, then the list
// of priority deques is walked until a non-empty one is found. Returns NULL
// (restoring the counter if one was reserved) when no task may be executed.
static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
                                           kmp_task_team_t *task_team,
                                           kmp_int32 is_constrained) {
  kmp_taskdata_t *taskdata;
  kmp_taskdata_t *current;
  kmp_thread_data_t *thread_data;

  int ntasks = task_team->tt.tt_num_task_pri;
  if (ntasks == 0) {
    KA_TRACE(
        20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
    return NULL;
  }
  // decrement num_tasks to "reserve" one task to get for execution
  while (!__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
                                     ntasks - 1)) {
    ntasks = task_team->tt.tt_num_task_pri;
    if (ntasks <= 0)
      break;
  }
  if (ntasks == 0) {
    KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
                  __kmp_get_gtid()));
    return NULL;
  }

  // We got a "ticket" to get a "reserved" priority task
  int deque_ntasks;
  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
  for (;;) {
    KMP_ASSERT(list != NULL);
    thread_data = &list->td;
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    deque_ntasks = thread_data->td.td_deque_ntasks;
    if (deque_ntasks != 0)
      break; // leave with the deque lock held
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
    KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
                  __kmp_get_gtid(), thread_data));
    list = list->next;
  }
  KMP_DEBUG_ASSERT(deque_ntasks);

  int target = thread_data->td.td_deque_head;
  current = __kmp_threads[gtid]->th.th_current_task;
  taskdata = thread_data->td.td_deque[target];
  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
    // Bump head pointer and Wrap.
    thread_data->td.td_deque_head =
        (target + 1) & TASK_DEQUE_MASK(thread_data->td);
  } else {
    if (!task_team->tt.tt_untied_task_encountered) {
      // The TSC does not allow to steal victim task
      __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
      KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
                    "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
                    gtid, thread_data, task_team, deque_ntasks, target,
                    thread_data->td.td_deque_tail));
      task_team->tt.tt_num_task_pri++; // atomic inc, restore value
      return NULL;
    }
    // walk through the deque trying to steal any task
    int i;
    taskdata = NULL;
    for (i = 1; i < deque_ntasks; ++i) {
      target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
      kmp_taskdata_t *candidate = thread_data->td.td_deque[target];
      if (__kmp_task_is_allowed(gtid, is_constrained, candidate, current)) {
        taskdata = candidate; // found task to execute
        break;
      }
    }
    if (taskdata == NULL) {
      // No appropriate candidate found to execute
      __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
      KA_TRACE(
          10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
               "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
               gtid, thread_data, task_team, deque_ntasks,
               thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
      task_team->tt.tt_num_task_pri++; // atomic inc, restore value
      return NULL;
    }
    // close the gap: shift remaining tasks in the deque left by 1
    int prev = target;
    for (i = i + 1; i < deque_ntasks; ++i) {
      target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
      thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
      prev = target;
    }
    KMP_DEBUG_ASSERT(
        thread_data->td.td_deque_tail ==
        (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
    thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)
  }
  thread_data->td.td_deque_ntasks = deque_ntasks - 1;
  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
  return KMP_TASKDATA_TO_TASK(taskdata);
}


// __kmp_remove_my_task: remove a task from my own deque
//
// The task is taken from the tail of the calling thread's deque. Returns NULL
// if the deque is empty or if __kmp_task_is_allowed() rejects the tail task.
static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
                                        kmp_task_team_t *task_team,
                                        kmp_int32 is_constrained) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
                   NULL); // Caller should check this condition

  kmp_thread_data_t *thread_data =
      &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];

  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
                gtid, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  // cheap check without the lock first
  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
    KA_TRACE(10,
             ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
              "ntasks=%d head=%u tail=%u\n",
              gtid, thread_data->td.td_deque_ntasks,
              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
    return NULL;
  }

  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

  // check again now that the lock is held
  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
    KA_TRACE(10,
             ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
              "ntasks=%d head=%u tail=%u\n",
              gtid, thread_data->td.td_deque_ntasks,
              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
    return NULL;
  }

  // index of the last queued task (tail - 1, wrapped)
  kmp_uint32 last =
      (thread_data->td.td_deque_tail - 1) & TASK_DEQUE_MASK(thread_data->td);
  kmp_taskdata_t *taskdata = thread_data->td.td_deque[last];

  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
                             thread->th.th_current_task)) {
    // The TSC does not allow to steal victim task
    __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
    KA_TRACE(10,
             ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
              "ntasks=%d head=%u tail=%u\n",
              gtid, thread_data->td.td_deque_ntasks,
              thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
    return NULL;
  }

  // pop the task
  thread_data->td.td_deque_tail = last;
  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
                "ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  return KMP_TASKDATA_TO_TASK(taskdata);
}


// __kmp_steal_task: remove a task from another thread's deque
// Assume that calling thread has already checked existence of
// task_team thread_data before calling this routine.
//
// The task at the head of the victim's deque is preferred. If it is not
// allowed and untied tasks have been encountered in the task team, the rest
// of the deque is searched and the gap left by the stolen task is closed.
// Returns NULL when nothing could be stolen.
static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid,
                                    kmp_task_team_t *task_team,
                                    std::atomic<kmp_int32> *unfinished_threads,
                                    int *thread_finished,
                                    kmp_int32 is_constrained) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  kmp_thread_data_t *threads_data = task_team->tt.tt_threads_data;
  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
  KMP_DEBUG_ASSERT(victim_tid >= 0);
  KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_max_threads);

  kmp_thread_data_t *victim_td = &threads_data[victim_tid];
  kmp_info_t *victim_thr = victim_td->td.td_thr;
  (void)victim_thr; // Use in TRACE messages which aren't always enabled.

  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
                "task_team=%p ntasks=%d head=%u tail=%u\n",
                gtid, __kmp_gtid_from_thread(victim_thr), task_team,
                victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                victim_td->td.td_deque_tail));

  // cheap check without the lock first
  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
    KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
                  "task_team=%p ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
                  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
                  victim_td->td.td_deque_tail));
    return NULL;
  }

  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);

  // Check again after we acquire the lock
  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
  if (ntasks == 0) {
    __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
    KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
                  "task_team=%p ntasks=%d head=%u tail=%u\n",
                  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
    return NULL;
  }

  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
  kmp_taskdata_t *current = __kmp_threads[gtid]->th.th_current_task;
  kmp_taskdata_t *taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
    // Bump head pointer and Wrap.
    victim_td->td.td_deque_head =
        (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
  } else {
    if (!task_team->tt.tt_untied_task_encountered) {
      // The TSC does not allow to steal victim task
      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
      KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
      return NULL;
    }
    // walk through victim's deque trying to steal any task
    int i;
    kmp_int32 target = victim_td->td.td_deque_head;
    taskdata = NULL;
    for (i = 1; i < ntasks; ++i) {
      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
      kmp_taskdata_t *candidate = victim_td->td.td_deque[target];
      if (__kmp_task_is_allowed(gtid, is_constrained, candidate, current)) {
        taskdata = candidate; // found victim task
        break;
      }
    }
    if (taskdata == NULL) {
      // No appropriate candidate to steal found
      __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
      KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
                    "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
                    gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
                    victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
      return NULL;
    }
    // close the gap: shift remaining tasks in the deque left by 1
    int prev = target;
    for (i = i + 1; i < ntasks; ++i) {
      target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
      victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
      prev = target;
    }
    KMP_DEBUG_ASSERT(
        victim_td->td.td_deque_tail ==
        (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
    victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
  }
  if (*thread_finished) {
    // We need to un-mark this victim as a finished victim.  This must be done
    // before releasing the lock, or else other threads (starting with the
    // primary thread victim) might be prematurely released from the barrier!!!
#if KMP_DEBUG
    kmp_int32 count =
#endif
        KMP_ATOMIC_INC(unfinished_threads);
    KA_TRACE(
        20,
        ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
         gtid, count + 1, task_team));
    *thread_finished = FALSE;
  }
  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);

  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);

  KMP_COUNT_BLOCK(TASK_stolen);
  KA_TRACE(10,
           ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
            "task_team=%p ntasks=%d head=%u tail=%u\n",
            gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
            ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));

  return KMP_TASKDATA_TO_TASK(taskdata);
}


// __kmp_execute_tasks_template: Choose and execute tasks until either the

// condition is satisfied (return true) or there are none left (return false).

//

// final_spin is TRUE if this is the spin at the release barrier.

// thread_finished indicates whether the thread is finished executing all

// the tasks it has on its deque, and is at the release barrier.

// spinner is the location on which to spin.

// spinner == NULL means only execute a single task and return.

// checker is the value to check to terminate the spin.

template <class C>


static inline int __kmp_execute_tasks_template(

    kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,

    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),

    kmp_int32 is_constrained) {

  kmp_task_team_t *task_team = thread->th.th_task_team;

  kmp_thread_data_t *threads_data;

  kmp_task_t *task;

  kmp_info_t *other_thread;

  kmp_taskdata_t *current_task = thread->th.th_current_task;

  std::atomic<kmp_int32> *unfinished_threads;

  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,

                      tid = thread->th.th_info.ds.ds_tid;


  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);


  if (task_team == NULL || current_task == NULL)

    return FALSE;


  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "

                "*thread_finished=%d\n",

                gtid, final_spin, *thread_finished));


  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;

  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);


  KMP_DEBUG_ASSERT(threads_data != NULL);


  nthreads = task_team->tt.tt_nproc;

  unfinished_threads = &(task_team->tt.tt_unfinished_threads);

  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);


  while (1) { // Outer loop keeps trying to find tasks in case of single thread

    // getting tasks from target constructs

    while (1) { // Inner loop to find a task and execute it

#if ENABLE_LIBOMPTARGET

      // Give an opportunity to the offload runtime to make progress

      if (UNLIKELY(kmp_target_sync_cb))

        (*kmp_target_sync_cb)(NULL, gtid, KMP_TASKDATA_TO_TASK(current_task),

                              NULL);

#endif // ENABLE_LIBOMPTARGET


      task = NULL;

      if (task_team->tt.tt_num_task_pri) { // get priority task first

        task = __kmp_get_priority_task(gtid, task_team, is_constrained);

      }

      if (task == NULL && use_own_tasks) { // check own queue next

        task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);

      }

      if ((task == NULL) && (nthreads > 1)) { // Steal a task finally

        int asleep = 1;

        use_own_tasks = 0;

        // Try to steal from the last place I stole from successfully.

        if (victim_tid == -2) { // haven't stolen anything yet

          victim_tid = threads_data[tid].td.td_deque_last_stolen;

          if (victim_tid !=

              -1) // if we have a last stolen from victim, get the thread

            other_thread = threads_data[victim_tid].td.td_thr;

        }

        if (victim_tid != -1) { // found last victim

          asleep = 0;

        } else if (!new_victim) { // no recent steals and we haven't already

          // used a new victim; select a random thread

          do { // Find a different thread to steal work from.

            // Pick a random thread. Initial plan was to cycle through all the

            // threads, and only return if we tried to steal from every thread,

            // and failed.  Arch says that's not such a great idea.

            victim_tid = __kmp_get_random(thread) % (nthreads - 1);

            if (victim_tid >= tid) {

              ++victim_tid; // Adjusts random distribution to exclude self

            }

            // Found a potential victim

            other_thread = threads_data[victim_tid].td.td_thr;

            // There is a slight chance that __kmp_enable_tasking() did not wake

            // up all threads waiting at the barrier.  If victim is sleeping,

            // then wake it up. Since we were going to pay the cache miss

            // penalty for referencing another thread's kmp_info_t struct

            // anyway,

            // the check shouldn't cost too much performance at this point. In

            // extra barrier mode, tasks do not sleep at the separate tasking

            // barrier, so this isn't a problem.

            asleep = 0;

            if ((__kmp_tasking_mode == tskm_task_teams) &&

                (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&

                (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=

                 NULL)) {

              asleep = 1;

              __kmp_null_resume_wrapper(other_thread);

              // A sleeping thread should not have any tasks on it's queue.

              // There is a slight possibility that it resumes, steals a task

              // from another thread, which spawns more tasks, all in the time

              // that it takes this thread to check => don't write an assertion

              // that the victim's queue is empty.  Try stealing from a

              // different thread.

            }

          } while (asleep);

        }


        if (!asleep) {

          // We have a victim to try to steal from

          task =

              __kmp_steal_task(victim_tid, gtid, task_team, unfinished_threads,

                               thread_finished, is_constrained);

        }

        if (task != NULL) { // set last stolen to victim

          if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {

            threads_data[tid].td.td_deque_last_stolen = victim_tid;

            // The pre-refactored code did not try more than 1 successful new

            // vicitm, unless the last one generated more local tasks;

            // new_victim keeps track of this

            new_victim = 1;

          }

        } else { // No tasks found; unset last_stolen

          KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);

          victim_tid = -2; // no successful victim found

        }

      }


      if (task == NULL)

        break; // break out of tasking loop


// Found a task; execute it

#if USE_ITT_BUILD && USE_ITT_NOTIFY

      if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {

        if (itt_sync_obj == NULL) { // we are at fork barrier where we could not

          // get the object reliably

          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);

        }

        __kmp_itt_task_starting(itt_sync_obj);

      }

#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

      __kmp_invoke_task(gtid, task, current_task);

#if USE_ITT_BUILD

      if (itt_sync_obj != NULL)

        __kmp_itt_task_finished(itt_sync_obj);

#endif /* USE_ITT_BUILD */

      // If this thread is only partway through the barrier and the condition is

      // met, then return now, so that the barrier gather/release pattern can

      // proceed. If this thread is in the last spin loop in the barrier,

      // waiting to be released, we know that the termination condition will not

      // be satisfied, so don't waste any cycles checking it.

      if (flag == NULL || (!final_spin && flag->done_check())) {

        KA_TRACE(

            15,

            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",

             gtid));

        return TRUE;

      }

      if (thread->th.th_task_team == NULL) {

        break;

      }

      KMP_YIELD(__kmp_library == library_throughput); // Yield before next task

      // If execution of a stolen task results in more tasks being placed on our

      // run queue, reset use_own_tasks

      if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {

        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "

                      "other tasks, restart\n",

                      gtid));

        use_own_tasks = 1;

        new_victim = 0;

      }

    }


    // The task source has been exhausted. If in final spin loop of barrier,

    // check if termination condition is satisfied. The work queue may be empty

    // but there might be proxy tasks still executing.

    if (final_spin &&

        KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {

      // First, decrement the #unfinished threads, if that has not already been

      // done.  This decrement might be to the spin location, and result in the

      // termination condition being satisfied.

      if (!*thread_finished) {

#if KMP_DEBUG

        kmp_int32 count = -1 +

#endif

            KMP_ATOMIC_DEC(unfinished_threads);

        KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "

                      "unfinished_threads to %d task_team=%p\n",

                      gtid, count, task_team));

        *thread_finished = TRUE;

      }


      // It is now unsafe to reference thread->th.th_team !!!

      // Decrementing task_team->tt.tt_unfinished_threads can allow the primary

      // thread to pass through the barrier, where it might reset each thread's

      // th.th_team field for the next parallel region. If we can steal more

      // work, we know that this has not happened yet.

      if (flag != NULL && flag->done_check()) {

        KA_TRACE(

            15,

            ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",

             gtid));

        return TRUE;

      }

    }


    // If this thread's task team is NULL, primary thread has recognized that

    // there are no more tasks; bail out

    if (thread->th.th_task_team == NULL) {

      KA_TRACE(15,

               ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));

      return FALSE;

    }


    // Check the flag again to see if it has already done in case to be trapped

    // into infinite loop when a if0 task depends on a hidden helper task

    // outside any parallel region. Detached tasks are not impacted in this case

    // because the only thread executing this function has to execute the proxy

    // task so it is in another code path that has the same check.

    if (flag == NULL || (!final_spin && flag->done_check())) {

      KA_TRACE(15,

               ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",

                gtid));

      return TRUE;

    }


    // We could be getting tasks from target constructs; if this is the only

    // thread, keep trying to execute tasks from own queue

    if (nthreads == 1 &&

        KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))

      use_own_tasks = 1;

    else {

      KA_TRACE(15,

               ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));

      return FALSE;

    }

  }

}


// __kmp_execute_tasks_32: entry point of the task execution loop for
// kmp_flag_32<C, S> flags. All arguments are passed straight through to
// __kmp_execute_tasks_template, whose result is returned unchanged.
template <bool C, bool S>
int __kmp_execute_tasks_32(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  const int spin_done = __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
  return spin_done;
}


// __kmp_execute_tasks_64: entry point of the task execution loop for
// kmp_flag_64<C, S> flags. All arguments are passed straight through to
// __kmp_execute_tasks_template, whose result is returned unchanged.
template <bool C, bool S>
int __kmp_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  const int spin_done = __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
  return spin_done;
}


// __kmp_atomic_execute_tasks_64: entry point of the task execution loop for
// kmp_atomic_flag_64<C, S> flags. All arguments are passed straight through
// to __kmp_execute_tasks_template, whose result is returned unchanged.
template <bool C, bool S>
int __kmp_atomic_execute_tasks_64(
    kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
    int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  const int spin_done = __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
  return spin_done;
}


// __kmp_execute_tasks_oncore: entry point of the task execution loop for
// kmp_flag_oncore flags. All arguments are passed straight through to
// __kmp_execute_tasks_template, whose result is returned unchanged.
int __kmp_execute_tasks_oncore(
    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
    kmp_int32 is_constrained) {
  const int spin_done = __kmp_execute_tasks_template(
      thread, gtid, flag, final_spin,
      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
  return spin_done;
}


// Explicit instantiations of the templated entry points
// (__kmp_execute_tasks_32, __kmp_execute_tasks_64 and
// __kmp_atomic_execute_tasks_64) for specific <C, S> flag parameter
// combinations.
template int

__kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,

                                     kmp_flag_32<false, false> *, int,

                                     int *USE_ITT_BUILD_ARG(void *), kmp_int32);


// 64-bit flags: <false, true> and <true, false>
template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,

                                                 kmp_flag_64<false, true> *,

                                                 int,

                                                 int *USE_ITT_BUILD_ARG(void *),

                                                 kmp_int32);


template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,

                                                 kmp_flag_64<true, false> *,

                                                 int,

                                                 int *USE_ITT_BUILD_ARG(void *),

                                                 kmp_int32);


// Atomic 64-bit flags: <false, true> and <true, false>
template int __kmp_atomic_execute_tasks_64<false, true>(

    kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,

    int *USE_ITT_BUILD_ARG(void *), kmp_int32);


template int __kmp_atomic_execute_tasks_64<true, false>(

    kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,

    int *USE_ITT_BUILD_ARG(void *), kmp_int32);


// __kmp_enable_tasking: Set up the task team's threads_data array (through
// __kmp_realloc_task_threads_data) and resume threads sleeping at the barrier
// so they can assist in executing enqueued tasks.
// Only the thread for which __kmp_realloc_task_threads_data reports that it
// performed the set-up goes on to wake the others; every other caller returns
// right after that call.
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr) {
  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));

  KMP_DEBUG_ASSERT(task_team != NULL);
  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);

  const int team_size = task_team->tt.tt_nproc;
  KMP_DEBUG_ASSERT(team_size > 0);
  KMP_DEBUG_ASSERT(team_size == this_thr->th.th_team->t.t_nproc);

  // Allocate or grow threads_data if needed; a FALSE result means another
  // thread has already done so.
  if (!__kmp_realloc_task_threads_data(this_thr, task_team)) {
    KA_TRACE(
        20,
        ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
         __kmp_gtid_from_thread(this_thr)));
    return;
  }

  kmp_thread_data_t *td_array =
      (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
  KMP_DEBUG_ASSERT(td_array != NULL);

  // Threads can only be asleep at the barrier in task-teams mode with a
  // finite blocktime. In extra barrier mode, tasks do not sleep at the
  // separate tasking barrier, so no wake-up pass is needed there.
  const bool peers_may_sleep = (__kmp_tasking_mode == tskm_task_teams) &&
                               (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
  if (peers_may_sleep) {
    for (int idx = 0; idx < team_size; ++idx) {
      kmp_info_t *peer = td_array[idx].td.td_thr;

      if (idx == this_thr->th.th_info.ds.ds_tid)
        continue; // never try to wake ourselves

      // The peer's suspend mutex is not held here, so a thread that is just
      // about to sleep may not have published th_sleep_loc yet.
      // __kmp_execute_tasks_template() covers that window: while picking a
      // random steal victim it also wakes the victim if it is asleep.
      void *sleep_loc = TCR_PTR(CCAST(void *, peer->th.th_sleep_loc));
      if (sleep_loc == NULL) {
        KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
                      __kmp_gtid_from_thread(this_thr),
                      __kmp_gtid_from_thread(peer)));
        continue;
      }

      KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
                    __kmp_gtid_from_thread(this_thr),
                    __kmp_gtid_from_thread(peer)));
      __kmp_null_resume_wrapper(peer);
    }
  }

  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
                __kmp_gtid_from_thread(this_thr)));
}


/* // TODO: Check the comment consistency

 * Utility routines for "task teams".  A task team (kmp_task_team_t) is kind of

 * like a shadow of the kmp_team_t data struct, with a different lifetime.

 * After a child thread checks into a barrier and calls __kmp_release() from

 * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no

 * longer assume that the kmp_team_t structure is intact (at any moment, the

 * primary thread may exit the barrier code and free the team data structure,

 * and return the threads to the thread pool).

 *

 * This does not work with the tasking code, as the thread is still

 * expected to participate in the execution of any tasks that may have been

 * spawned by a member of the team, and the thread still needs access

 * to each thread in the team, so that it can steal work from it.

 *

 * Enter the existence of the kmp_task_team_t struct.  It employs a reference

 * counting mechanism, and is allocated by the primary thread before calling

 * __kmp_<barrier_kind>_release, and then is released by the last thread to

 * exit __kmp_<barrier_kind>_release at the next barrier.  I.e. the lifetimes

 * of the kmp_task_team_t structs for consecutive barriers can overlap

 * (and will, unless the primary thread is the last thread to exit the barrier

 * release phase, which is not typical). The existence of such a struct is

 * useful outside the context of tasking.

 *

 * We currently use the existence of the threads array as an indicator that

 * tasks were spawned since the last barrier.  If the structure is to be

 * useful outside the context of tasking, then this will have to change, but

 * not setting the field minimizes the performance impact of tasking on

 * barriers, when no explicit tasks were spawned (pushed, actually).

 */


// Head of the singly linked list (chained through tt.tt_next) of task teams
// that are currently not in use. __kmp_free_task_team() pushes onto it,
// __kmp_allocate_task_team() pops from it and __kmp_reap_task_teams() drains
// it.
static kmp_task_team_t *__kmp_free_task_teams =

    NULL; // Free list for task_team data structures

// Lock for task team data structures; held by the three routines named above
// while they modify the free list.

kmp_bootstrap_lock_t __kmp_task_team_lock =

    KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);


// __kmp_alloc_task_deque:
// Gives thread_data a freshly allocated task deque of INITIAL_TASK_DEQUE_SIZE
// slots, initializes the deque lock and resets the last-stolen hint. The entry
// is expected to arrive with no deque and zeroed ntasks/head/tail (checked in
// debug builds only). This only happens once per thread per task team since
// task teams are recycled. No lock is needed during allocation since each
// thread allocates its own deque.
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data) {
  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);

  // Preconditions: the entry must still be in its pristine (zeroed) state.
  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);

  // -1 means "no victim has been stolen from yet"
  thread_data->td.td_deque_last_stolen = -1;

  KE_TRACE(
      10,
      ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
       __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));

  // __kmp_allocate() hands back zeroed storage. __kmp_thread_calloc() cannot
  // be used because the threads are no longer around when the task teams are
  // reaped (__kmp_reap_task_teams).
  const size_t deque_bytes = INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *);
  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(deque_bytes);
  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
}


// __kmp_free_task_deque:
// Releases the deque storage owned by thread_data, if there is any. Happens at
// library deallocation, so apart from the task count and the deque pointer the
// remaining fields are left as they are.
static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
  if (thread_data->td.td_deque == NULL)
    return; // nothing was ever allocated for this entry

  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);

  TCW_4(thread_data->td.td_deque_ntasks, 0);
  __kmp_free(thread_data->td.td_deque);
  thread_data->td.td_deque = NULL;

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
}


// __kmp_realloc_task_threads_data:
// Makes sure the task team owns a threads_data array with at least
// task_team->tt.tt_nproc entries (the current capacity is
// task_team->tt.tt_max_threads), allocating or enlarging it when needed, and
// re-initializes the per-thread entries. Only the first thread to take the
// lock while tt_found_tasks is still unset does this work and returns TRUE;
// every other caller returns FALSE.
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team) {
  // Unlocked early-out: somebody has already done the set-up.
  if (TCR_4(task_team->tt.tt_found_tasks))
    return FALSE;

  kmp_thread_data_t **array_slot = &task_team->tt.tt_threads_data;
  const kmp_int32 wanted = task_team->tt.tt_nproc;
  const kmp_int32 capacity = task_team->tt.tt_max_threads;
  int did_init = FALSE;

  // All threads must lock when they encounter the first task of the implicit
  // task region to make sure threads_data fields are (re)initialized before
  // used.
  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);

  // Re-check under the lock: only the first thread in performs the set-up.
  if (!TCR_4(task_team->tt.tt_found_tasks)) {
    kmp_team_t *team = thread->th.th_team;
    did_init = TRUE;

    if (capacity >= wanted) {
      // The existing array is already large enough; reuse it as is.
      KMP_DEBUG_ASSERT(*array_slot != NULL);
    } else {
      if (*array_slot == NULL) {
        KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
                      "threads data for task_team %p, size = %d\n",
                      __kmp_gtid_from_thread(thread), task_team, wanted));
        // First allocation. __kmp_allocate() returns zeroed memory.
        // __kmp_thread_calloc() cannot be used because the threads are no
        // longer around when the task teams are reaped.
        *array_slot = (kmp_thread_data_t *)__kmp_allocate(
            wanted * sizeof(kmp_thread_data_t));
      } else {
        kmp_thread_data_t *previous = *array_slot;

        KE_TRACE(
            10,
            ("__kmp_realloc_task_threads_data: T#%d reallocating "
             "threads data for task_team %p, new_size = %d, old_size = %d\n",
             __kmp_gtid_from_thread(thread), task_team, wanted, capacity));
        // Grow the array: allocate the bigger one (the additional entries are
        // zeroed by __kmp_allocate()), carry over the old entries, publish
        // the new array and release the old one. __kmp_thread_realloc()
        // cannot be used for the same reason as above.
        kmp_thread_data_t *grown = (kmp_thread_data_t *)__kmp_allocate(
            wanted * sizeof(kmp_thread_data_t));
        KMP_MEMCPY_S((void *)grown, wanted * sizeof(kmp_thread_data_t),
                     (void *)previous, capacity * sizeof(kmp_thread_data_t));
        (*array_slot) = grown;
        __kmp_free(previous);
      }
      task_team->tt.tt_max_threads = wanted;
    }

    // Point every entry back at the kmp_info_t of the matching team member.
    for (int idx = 0; idx < wanted; ++idx) {
      kmp_thread_data_t *entry = &(*array_slot)[idx];
      entry->td.td_thr = team->t.t_threads[idx];

      // The last-stolen hint survives across teams / barriers while the
      // number of threads may have changed; drop hints that are now out of
      // range and keep the others, since a new parallel region may well
      // behave like the previous one.
      if (entry->td.td_deque_last_stolen >= wanted)
        entry->td.td_deque_last_stolen = -1;
    }

    KMP_MB();
    TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
  }

  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
  return did_init;
}


// __kmp_free_task_threads_data:
// Under the task team's threads lock, frees the deque attached to each of the
// tt_max_threads entries of the threads_data array and then the array itself.
// Only occurs at library shutdown.
static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);

  kmp_thread_data_t *entries = task_team->tt.tt_threads_data;
  if (entries != NULL) {
    const kmp_int32 capacity = task_team->tt.tt_max_threads;
    for (kmp_int32 idx = 0; idx < capacity; ++idx)
      __kmp_free_task_deque(&entries[idx]);

    __kmp_free(entries);
    task_team->tt.tt_threads_data = NULL;
  }

  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
}


// __kmp_free_task_pri_list:
// Under the task team's priority lock, walks the list of priority-task nodes,
// freeing each node's deque and the node itself, and leaves the list empty.
// Only occurs at library shutdown.
static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);

  kmp_task_pri_t *node = task_team->tt.tt_task_pri_list;
  if (node != NULL) {
    do {
      // Remember the successor before the node is released.
      kmp_task_pri_t *successor = node->next;
      __kmp_free_task_deque(&node->td);
      __kmp_free(node);
      node = successor;
    } while (node != NULL);
    task_team->tt.tt_task_pri_list = NULL;
  }

  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
}


// __kmp_task_team_init:
// Resets a task team for use by `team`: clears the found-tasks, found-proxy
// and hidden-helper flags, records the team size in tt_nproc and
// tt_unfinished_threads, and marks the task team active. Nothing is touched
// when the task team is already active with a matching team size.
static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
                                        kmp_team_t *team) {
  const int team_nth = team->t.t_nproc;

  // Already active for a team of this size: no (re)initialization needed.
  if (task_team->tt.tt_active && team_nth == task_team->tt.tt_nproc)
    return;

  TCW_4(task_team->tt.tt_found_tasks, FALSE);
  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);

  TCW_4(task_team->tt.tt_nproc, team_nth);
  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);

  // The active flag is written last, after all other fields are in place.
  TCW_4(task_team->tt.tt_active, TRUE);
}


// __kmp_allocate_task_team:
// Returns a task team initialized for `team` (see __kmp_task_team_init). A
// task team is taken from the global free list when one is available;
// otherwise a new one is allocated and its two bootstrap locks are
// initialized.
//
// thread: the calling thread; only used for trace output. Every trace in this
//         routine prints -1 as the thread id when thread is NULL.
// team:   the team the returned task team is initialized for.
static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
                                                 kmp_team_t *team) {
  kmp_task_team_t *task_team = NULL;

  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
                (thread ? __kmp_gtid_from_thread(thread) : -1), team));

  // Unlocked peek first, so the lock is only taken when the free list looks
  // non-empty; the list head is checked again once the lock is held.
  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
    // Take a task team from the task team pool
    __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
    if (__kmp_free_task_teams != NULL) {
      task_team = __kmp_free_task_teams;
      TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
      task_team->tt.tt_next = NULL;
    }
    __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
  }

  if (task_team == NULL) {
    // Guard against a NULL thread here as well, exactly like the entry and
    // exit traces of this routine do.
    KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
                  "task team for team %p\n",
                  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
    // Allocate a new task team if one is not available. Cannot use
    // __kmp_thread_malloc because threads not around for
    // __kmp_reap_task_teams.
    task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
    __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
    __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
    // suppress race conditions detection on synchronization flags in debug mode
    // this helps to analyze library internals eliminating false positives
    __itt_suppress_mark_range(
        __itt_suppress_range, __itt_suppress_threading_errors,
        &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
    __itt_suppress_mark_range(__itt_suppress_range,
                              __itt_suppress_threading_errors,
                              CCAST(kmp_uint32 *, &task_team->tt.tt_active),
                              sizeof(task_team->tt.tt_active));
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
    // Note: __kmp_allocate zeroes returned memory, otherwise we would need:
    // task_team->tt.tt_threads_data = NULL;
    // task_team->tt.tt_max_threads = 0;
    // task_team->tt.tt_next = NULL;
  }

  __kmp_task_team_init(task_team, team);

  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
                "unfinished_threads init'd to %d\n",
                (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
                KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
  return task_team;
}


// __kmp_free_task_team:
// Hands task_team back to the global task team free list by pushing it onto
// the head of the list under __kmp_task_team_lock. The task team must not be
// linked into a list already (tt_next == NULL, checked in debug builds).
// `thread` is only used for trace output and may be NULL.
void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
                thread ? __kmp_gtid_from_thread(thread) : -1, task_team));

  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
  {
    // Link in front of the current head, then publish as the new head.
    KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
    task_team->tt.tt_next = __kmp_free_task_teams;
    TCW_PTR(__kmp_free_task_teams, task_team);
  }
  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
}


// __kmp_reap_task_teams:

// Free all the task teams on the task team free list.

// Should only be done during library shutdown.

// Cannot do anything that needs a thread structure or gtid since they are

// already gone.


void __kmp_reap_task_teams(void) {

  kmp_task_team_t *task_team;


  if (TCR_PTR(__kmp_free_task_teams) != NULL) {

    // Free all task_teams on the free list

    __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);

    while ((task_team = __kmp_free_task_teams) != NULL) {

      __kmp_free_task_teams = task_team->tt.tt_next;

      task_team->tt.tt_next = NULL;


      // Free threads_data if necessary

      if (task_team->tt.tt_threads_data != NULL) {

        __kmp_free_task_threads_data(task_team);

      }

      if (task_team->tt.tt_task_pri_list != NULL) {

        __kmp_free_task_pri_list(task_team);

      }

      __kmp_free(task_team);

    }

    __kmp_release_bootstrap_lock(&__kmp_task_team_lock);

  }

}


// The team's array of two task team pointers is viewed as a pair:
//  1) a single task_team pointer
//  2) next pointer for stack
// Serial teams can create a stack of task teams for nested serial teams.
//
// __kmp_push_task_team_node: saves the serial team's current
// (task_team, next) pair in a newly allocated node, makes that node the top
// of the stack and leaves both the team and the thread without a task team.
void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);

  kmp_task_team_list_t *top = (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
  kmp_task_team_list_t *saved =
      (kmp_task_team_list_t *)__kmp_allocate(sizeof(kmp_task_team_list_t));

  // Preserve the current pair in the new node
  saved->task_team = top->task_team;
  saved->next = top->next;

  // Start the nested level without a task team
  top->task_team = NULL;
  thread->th.th_task_team = NULL;

  top->next = saved;
}


// __kmp_pop_task_team_node: a serial team pops a task team off its stack.
// The team's current task team, if it has one, goes back to the free list.
// When a saved node exists, its (task_team, next) pair is restored into the
// team, the node is freed and the thread's task team pointer is updated to
// the restored task team.
void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);

  kmp_task_team_list_t *top = (kmp_task_team_list_t *)(&team->t.t_task_team[0]);

  // Retire the task team of the level that is being left
  if (top->task_team)
    __kmp_free_task_team(thread, top->task_team);

  kmp_task_team_list_t *saved = top->next;
  if (!saved)
    return; // stack is empty: nothing to restore

  kmp_task_team_t *restored = saved->task_team;
  top->task_team = restored;
  top->next = saved->next;
  KMP_DEBUG_ASSERT(saved != top);
  __kmp_free(saved);
  thread->th.th_task_team = restored;
}


// __kmp_wait_to_unref_task_teams:
// Some threads could still be in the fork barrier release code, possibly
// trying to steal tasks.  Wait for each thread to unreference its task team.
//
// Repeatedly walks the threads on __kmp_thread_pool until a full pass finds
// th_task_team == NULL for every one of them. Threads that still reference a
// task team are woken up if they are asleep (only when the blocktime is not
// KMP_MAX_BLOCKTIME); between passes the caller yields or spins.

void __kmp_wait_to_unref_task_teams(void) {

  kmp_info_t *thread;

  kmp_uint32 spins;

  kmp_uint64 time;

  int done;


  KMP_INIT_YIELD(spins);

  KMP_INIT_BACKOFF(time);


  for (;;) {

    // Assume this pass will find no thread still holding a task team.
    done = TRUE;


    // TODO: GEH - this may be wrong because some sync would be necessary

    // in case threads are added to the pool during the traversal. Need to

    // verify that lock for thread pool is held when calling this routine.

    for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;

         thread = thread->th.th_next_pool) {

#if KMP_OS_WINDOWS

      DWORD exit_val;

#endif

      if (TCR_PTR(thread->th.th_task_team) == NULL) {

        KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",

                      __kmp_gtid_from_thread(thread)));

        continue;

      }

#if KMP_OS_WINDOWS

      // TODO: GEH - add this check for Linux* OS / OS X* as well?

      // A thread that is no longer alive cannot clear the pointer itself, so
      // it is cleared here on its behalf.
      if (!__kmp_is_thread_alive(thread, &exit_val)) {

        thread->th.th_task_team = NULL;

        continue;

      }

#endif


      done = FALSE; // Because th_task_team pointer is not NULL for this thread


      KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "

                    "unreference task_team\n",

                    __kmp_gtid_from_thread(thread)));


      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {

        void *sleep_loc;

        // If the thread is sleeping, awaken it.

        if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=

            NULL) {

          // NOTE(review): both %d arguments below are the gtid of the thread
          // being woken; the first was presumably meant to be the caller's
          // gtid, which is not available in this routine -- confirm.
          KA_TRACE(

              10,

              ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",

               __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));

          __kmp_null_resume_wrapper(thread);

        }

      }

    }

    if (done) {

      break;

    }


    // If oversubscribed or have waited a bit, yield.

    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);

  }

}


// __kmp_task_team_setup: Make sure the task teams of `team` exist, creating
// them on demand and re-initializing an already existing, unused one.
//
// this_thr: thread performing the setup (traced as the primary thread)
// team:     team whose t_task_team slots are populated
void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);

  // Serial and root teams only use slot 0 as a task team; the other slot is a
  // stack of task teams from previous serial levels.
  const bool is_serial_or_root = team == this_thr->th.th_serial_team ||
                                 team == this_thr->th.th_root->r.r_root_team;
  if (is_serial_or_root) {
    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
    if (team->t.t_task_team[0] != NULL) {
      __kmp_task_team_init(team->t.t_task_team[0], team);
      return;
    }
    team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
    KA_TRACE(
        20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
             " for serial/root team %p\n",
             __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
    return;
  }

  const int cur_parity = this_thr->th.th_task_state;

  // Slot for the current parity: allocate it if it has not been created yet
  // (it will be used in the region after the next). If it exists, it is the
  // current task team and must not be touched, as it may still be in use.
  if (team->t.t_task_team[cur_parity] == NULL) {
    team->t.t_task_team[cur_parity] = __kmp_allocate_task_team(this_thr, team);
    KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
                  " for team %d at parity=%d\n",
                  __kmp_gtid_from_thread(this_thr),
                  team->t.t_task_team[cur_parity], team->t.t_id, cur_parity));
  }

  // Slot for the other parity: after threads exit the release they call sync
  // and switch to this task team, so it must be allocated and properly
  // initialized. While spinning in the barrier release phase they keep using
  // the previous task_team struct (above) until told to stop checking for
  // tasks (they can't safely reference the kmp_team_t struct, which could be
  // reallocated by the primary thread).
  const int next_parity = 1 - cur_parity;
  KMP_DEBUG_ASSERT(next_parity >= 0 && next_parity < 2);
  if (team->t.t_task_team[next_parity] != NULL) {
    // Leave the old task team struct in place for the upcoming region and
    // just reset it. If the team size has changed, the first thread to enable
    // tasking will realloc threads_data if necessary.
    __kmp_task_team_init(team->t.t_task_team[next_parity], team);
    KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
                  "%p for team %d at parity=%d\n",
                  __kmp_gtid_from_thread(this_thr),
                  team->t.t_task_team[next_parity], team->t.t_id, next_parity));
  } else {
    team->t.t_task_team[next_parity] =
        __kmp_allocate_task_team(this_thr, team);
    KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
                  "task_team %p for team %d at parity=%d\n",
                  __kmp_gtid_from_thread(this_thr),
                  team->t.t_task_team[next_parity], team->t.t_id, next_parity));
  }

  // For a regular thread, tasking is enabled when a task is about to be
  // pushed to a deque. The hidden helper main thread needs it ahead of time so
  // that some operations can be performed without a race condition.
  if (this_thr != __kmp_hidden_helper_main_thread) {
    return;
  }
  for (int parity = 0; parity < 2; ++parity) {
    kmp_task_team_t *task_team = team->t.t_task_team[parity];
    if (KMP_TASKING_ENABLED(task_team)) {
      continue;
    }
    __kmp_enable_tasking(task_team, this_thr);
    // Give every hidden helper thread a deque up front.
    for (int tid = 0; tid < task_team->tt.tt_nproc; ++tid) {
      kmp_thread_data_t *slot = &task_team->tt.tt_threads_data[tid];
      if (slot->td.td_deque == NULL) {
        __kmp_alloc_task_deque(__kmp_hidden_helper_threads[tid], slot);
      }
    }
  }
}


// __kmp_task_team_sync: Propagate task team data from the team to the calling
// thread. Happens just after the release phase of a team barrier and may be
// called by any thread. Not called for serial or root teams.
//
// The thread flips its th_task_state parity and then loads the task team
// stored in the team under the new parity.
void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
  KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);

  // Switch which of the team's two task teams this thread refers to.
  const kmp_uint8 flipped = (kmp_uint8)(1 - this_thr->th.th_task_state);
  this_thr->th.th_task_state = flipped;

  // It is now safe to copy the task team pointer from the team struct.
  TCW_PTR(this_thr->th.th_task_team, team->t.t_task_team[flipped]);

  KA_TRACE(20,
           ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
            "%p from Team #%d (parity=%d)\n",
            __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
            team->t.t_id, this_thr->th.th_task_state));
}


// __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
// barrier gather phase, then deactivates the task team. Only called by the
// primary thread.
//
// wait is a flag that defaults to 1 (see kmp.h). When it is zero the primary
// thread does not wait for unfinished_threads to reach 0 and goes straight to
// the deactivation step.
void __kmp_task_team_wait(
    kmp_info_t *this_thr,
    kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];

  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);

  // Nothing to do unless a task team exists and tasking was enabled on it.
  if (task_team == NULL || !KMP_TASKING_ENABLED(task_team)) {
    return;
  }

  if (wait) {
    KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
                  "(for unfinished_threads to reach 0) on task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), task_team));
    // Worker threads may have dropped through to the release phase but could
    // still be executing tasks; wait here for them to complete. To avoid
    // memory contention, only the primary thread checks the termination
    // condition.
    kmp_flag_32<false, false> flag(
        RCAST(std::atomic<kmp_uint32> *,
              &task_team->tt.tt_unfinished_threads),
        0U);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  }

  // Deactivate the old task team so that worker threads stop referencing it
  // while spinning.
  KA_TRACE(
      20,
      ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
       "setting active to false, setting local and team's pointer to NULL\n",
       __kmp_gtid_from_thread(this_thr), task_team));
  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
  TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
  KMP_MB();

  TCW_PTR(this_thr->th.th_task_team, NULL);
}


// __kmp_tasking_barrier:
// Called only when __kmp_tasking_mode == tskm_extra_barrier. Internal function
// that executes all tasks prior to a regular barrier or a join barrier. It is
// a full barrier itself, which unfortunately turns regular barriers into
// double barriers and join barriers into 1 1/2 barriers.
//
// The thread keeps calling execute_tasks on a flag bound to the
// tt_unfinished_threads counter of its current task team until that call
// reports success or the runtime signals global shutdown.
void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
  kmp_task_team_t *cur_task_team =
      team->t.t_task_team[thread->th.th_task_state];
  std::atomic<kmp_uint32> *spin = RCAST(
      std::atomic<kmp_uint32> *, &cur_task_team->tt.tt_unfinished_threads);
  int thread_finished = FALSE;
  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);

#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_INIT(spin, NULL);
#endif /* USE_ITT_BUILD */
  kmp_flag_32<false, false> spin_flag(spin, 0U);
  for (;;) {
    if (spin_flag.execute_tasks(thread, gtid, TRUE,
                                &thread_finished USE_ITT_BUILD_ARG(NULL), 0)) {
      break;
    }
#if USE_ITT_BUILD
    // TODO: What about itt_sync_obj??
    KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
#endif /* USE_ITT_BUILD */

    // Stop spinning once the library is shutting down.
    if (TCR_4(__kmp_global.g.g_done)) {
      if (__kmp_global.g.g_abort) {
        __kmp_abort_thread();
      }
      break;
    }
    KMP_YIELD(TRUE);
  }
#if USE_ITT_BUILD
  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
#endif /* USE_ITT_BUILD */
}


// __kmp_give_task puts a task into a given thread queue if:
//  - the queue for that thread was created
//  - there's space in that queue, or the queue may still be grown
// Because of this, __kmp_push_task needs to check if there's space after
// getting the lock.
//
// thread: passed on to __kmp_realloc_task_deque when the deque is expanded
// tid:    index into the task team's tt_threads_data selecting the deque
// task:   task to enqueue
// pass:   growth allowance; a full deque is only expanded when its size
//         divided by INITIAL_TASK_DEQUE_SIZE is still below pass
// returns: true if the task was queued, false if the caller should try
//          another thread
static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
                            kmp_int32 pass) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_task_team_t *task_team = taskdata->td_task_team;

  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
                taskdata, tid));

  // If task_team is NULL something went really bad...
  KMP_DEBUG_ASSERT(task_team != NULL);

  kmp_thread_data_t *target = &task_team->tt.tt_threads_data[tid];

  if (target->td.td_deque == NULL) {
    // There's no queue in this thread, go find another one
    // We're guaranteed that at least one thread has a queue
    KA_TRACE(30,
             ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
              tid, taskdata));
    return false;
  }

  // First look at the deque without taking its lock.
  const bool full_on_peek =
      TCR_4(target->td.td_deque_ntasks) >= TASK_DEQUE_SIZE(target->td);
  if (full_on_peek) {
    KA_TRACE(
        30,
        ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
         taskdata, tid));

    // if this deque is bigger than the pass ratio give a chance to another
    // thread
    if (TASK_DEQUE_SIZE(target->td) / INITIAL_TASK_DEQUE_SIZE >= pass) {
      return false;
    }
  }

  __kmp_acquire_bootstrap_lock(&target->td.td_deque_lock);

  // Re-check under the lock; the state may have changed since the peek.
  if (TCR_4(target->td.td_deque_ntasks) >= TASK_DEQUE_SIZE(target->td)) {
    if (!full_on_peek) {
      // The deque filled up only after the peek, so the pass-ratio test has
      // not been applied yet for this call.
      KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
                    "thread %d.\n",
                    taskdata, tid));

      if (TASK_DEQUE_SIZE(target->td) / INITIAL_TASK_DEQUE_SIZE >= pass) {
        __kmp_release_bootstrap_lock(&target->td.td_deque_lock);
        return false;
      }
    }
    // expand deque to push the task which is not allowed to execute
    __kmp_realloc_task_deque(thread, target);
  }

  // lock is held here, and there is space in the deque
  target->td.td_deque[target->td.td_deque_tail] = taskdata;
  // Wrap index.
  target->td.td_deque_tail =
      (target->td.td_deque_tail + 1) & TASK_DEQUE_MASK(target->td);
  TCW_4(target->td.td_deque_ntasks, TCR_4(target->td.td_deque_ntasks) + 1);

  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
                taskdata, tid));

  __kmp_release_bootstrap_lock(&target->td.td_deque_lock);

  return true;
}


#define PROXY_TASK_FLAG 0x40000000

/* PROXY_TASK_FLAG is a bit that the top half ORs into the
   td_incomplete_child_tasks counter of a proxy task and later clears again;
   the bottom half spins while the bit is set.

   The finish of the proxy tasks is divided in two pieces:
    - the top half is the one that can be done from a thread outside the team
    - the bottom half must be run from a thread within the team

   In order to run the bottom half the task gets queued back into one of the
   threads of the team. Once the td_incomplete_child_tasks counter of the
   parent is decremented the threads can leave the barriers. So, the bottom
   half needs to be queued before the counter is decremented. The top half is
   therefore divided in two parts:
    - things that can be run before queuing the bottom half
    - things that must be run after queuing the bottom half

   This creates a second race as the bottom half can free the task before the
   second top half is executed. To avoid this we use the
   td_incomplete_child_tasks counter of the proxy task itself to synchronize
   the top and bottom half. */


// First part of the top half of proxy-task completion: the steps that can run
// before the bottom half is queued. Marks the task complete, decrements its
// taskgroup counter (if it has a taskgroup) and sets PROXY_TASK_FLAG in the
// task's own td_incomplete_child_tasks counter.
//
// taskdata: proxy task being completed; asserted to be an explicit proxy task
//           that is neither complete nor freed yet.
static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);

  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);


  taskdata->td_flags.complete = 1; // mark the task as completed

#if OMPX_TASKGRAPH

  taskdata->td_flags.onced = 1;

#endif


  // The task no longer counts as outstanding in its taskgroup.
  if (taskdata->td_taskgroup)

    KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);


  // Create an imaginary child for this task so the bottom half cannot
  // release the task before we have completed the second top half
  // (the bottom half spins while PROXY_TASK_FLAG is set).

  KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);

}


// Second part of the top half of proxy-task completion: the steps that must
// run after the bottom half has been queued. Decrements the parent's
// td_incomplete_child_tasks counter and then clears PROXY_TASK_FLAG in the
// task's own counter, which lets the bottom half proceed.
//
// taskdata: proxy task being completed.
static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {

#if KMP_DEBUG

  kmp_int32 children = 0;

  // Predecrement simulated by "- 1" calculation
  // NOTE: in debug builds the statement below reads
  //   children = -1 + KMP_ATOMIC_DEC(...);
  // in non-debug builds only the KMP_ATOMIC_DEC(...) expression is compiled.

  children = -1 +

#endif

      KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);

  KMP_DEBUG_ASSERT(children >= 0);


  // Remove the imaginary child added by the first top half

  KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);

}


// Bottom half of proxy-task completion. Waits until PROXY_TASK_FLAG has been
// cleared from the task's td_incomplete_child_tasks counter, then releases the
// task's dependences and frees the task and its ancestors.
//
// gtid:  global thread id of the executing thread (indexes __kmp_threads)
// ptask: proxy task whose top half has already marked it complete
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  kmp_info_t *thread = __kmp_threads[gtid];

  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
  // top half must run before bottom half
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);

  // We need to wait to make sure the top half is finished.
  // Spinning here should be ok as this should happen quickly.
  for (;;) {
    const kmp_int32 pending =
        KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
    if (!((pending & PROXY_TASK_FLAG) > 0)) {
      break;
    }
  }

  __kmp_release_deps(gtid, taskdata);
  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
}


/*!
@ingroup TASKING
@param gtid Global Thread ID of encountering thread
@param ptask Task whose execution is completed

Execute the completion of a proxy task from a thread that is part of the
team. Both parts of the top half and the bottom half are run directly, one
after the other, on the calling thread.
*/
void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
  KMP_DEBUG_ASSERT(ptask != NULL);
  kmp_taskdata_t *proxy_td = KMP_TASK_TO_TASKDATA(ptask);

  KA_TRACE(
      10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
           gtid, proxy_td));
  __kmp_assert_valid_gtid(gtid);
  KMP_DEBUG_ASSERT(proxy_td->td_flags.proxy == TASK_PROXY);

  // No queuing is needed here, so the three stages simply run back to back.
  __kmp_first_top_half_finish_proxy(proxy_td);
  __kmp_second_top_half_finish_proxy(proxy_td);
  __kmp_bottom_half_finish_proxy(gtid, ptask);

  KA_TRACE(10,
           ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
            gtid, proxy_td));
}


// __kmpc_give_task: queue a task into the deque of some thread of the task's
// team, so that the bottom half of its completion runs on a thread within that
// team.
//
// ptask: task to hand over; its td_team selects the team
// start: seed for the team slot where probing begins (default 0); it is
//        reduced modulo the team size, and a negative value is wrapped into
//        range instead of indexing out of bounds
//
// Slots are probed round-robin, beginning with the one after start_k. Each
// time the probe comes back around to start_k, `pass` is doubled, which lets
// __kmp_give_task grow a full deque further. The loop ends once some thread
// accepts the task. Afterwards, if threads may be sleeping, the first thread
// found with a non-NULL th_sleep_loc is resumed.
void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
  KMP_DEBUG_ASSERT(ptask != NULL);
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);

  // Enqueue task to complete bottom half completion from a thread within the
  // corresponding team
  kmp_team_t *team = taskdata->td_team;
  kmp_int32 nthreads = team->t.t_nproc;
  kmp_info_t *thread;

  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
  // but we cannot use __kmp_get_random here
  kmp_int32 start_k = start % nthreads;
  if (start_k < 0) {
    // % keeps the sign of the dividend; keep the index inside the team.
    start_k += nthreads;
  }
  kmp_int32 pass = 1;
  kmp_int32 k = start_k;

  do {
    // For now we're just linearly trying to find a thread
    k = (k + 1) % nthreads;

    // we did a full pass through all the threads
    if (k == start_k)
      pass = pass << 1;

    // Fetch the thread for the slot that is actually being probed, so that
    // the thread and tid given to __kmp_give_task describe the same member.
    thread = team->t.t_threads[k];
  } while (!__kmp_give_task(thread, k, ptask, pass));

  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && __kmp_wpolicy_passive) {
    // awake at least one thread to execute given task
    for (int i = 0; i < nthreads; ++i) {
      thread = team->t.t_threads[i];
      if (thread->th.th_sleep_loc != NULL) {
        __kmp_null_resume_wrapper(thread);
        break;
      }
    }
  }
}


/*!
@ingroup TASKING
@param ptask Task whose execution is completed

Execute the completion of a proxy task from a thread that might not belong to
the team. The first part of the top half runs here, the task is then queued
to a thread of its team via __kmpc_give_task, and finally the second part of
the top half runs here.
*/
void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
  KMP_DEBUG_ASSERT(ptask != NULL);
  kmp_taskdata_t *proxy_td = KMP_TASK_TO_TASKDATA(ptask);

  KA_TRACE(
      10,
      ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
       proxy_td));

  KMP_DEBUG_ASSERT(proxy_td->td_flags.proxy == TASK_PROXY);

  // The task must be queued between the two parts of the top half.
  __kmp_first_top_half_finish_proxy(proxy_td);
  __kmpc_give_task(ptask);
  __kmp_second_top_half_finish_proxy(proxy_td);

  KA_TRACE(
      10,
      ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
       proxy_td));
}


// __kmpc_task_allow_completion_event: return the allow-completion event
// embedded in the taskdata of `task`, initializing it on first use (type,
// owning task and TAS lock).
//
// loc_ref, gtid: not used by this routine
// task:          task whose event is requested
// returns:       pointer to the task's td_allow_completion_event
kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
                                                kmp_task_t *task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_event_t *event = &taskdata->td_allow_completion_event;

  // An event that has already been set up is returned as is.
  if (event->type != KMP_EVENT_UNINITIALIZED) {
    return event;
  }

  event->type = KMP_EVENT_ALLOW_COMPLETION;
  event->ed.task = task;
  __kmp_init_tas_lock(&event->lock);
  return event;
}


// __kmp_fulfill_event: fulfill an allow-completion event. Events of any other
// type are ignored.
//
// Under the event lock the routine checks whether the associated task has
// become a proxy task (i.e. it detached) and resets the event type to
// uninitialized. If the task detached, its proxy completion is then run:
// directly when the calling thread has a valid gtid and belongs to the task's
// team, otherwise through the out-of-order path.
void __kmp_fulfill_event(kmp_event_t *event) {
  if (event->type != KMP_EVENT_ALLOW_COMPLETION) {
    return;
  }

  kmp_task_t *ptask = event->ed.task;
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
  int gtid = __kmp_get_gtid();

  // The associated task might have completed or could be completing at this
  // point. We need to take the lock to avoid races.
  __kmp_acquire_tas_lock(&event->lock, gtid);
  const bool detached = (taskdata->td_flags.proxy == TASK_PROXY);
#if OMPT_SUPPORT
  // The OMPT event must occur under mutual exclusion,
  // otherwise the tool might access ptask after free
  if (!detached && UNLIKELY(ompt_enabled.enabled))
    __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
#endif
  event->type = KMP_EVENT_UNINITIALIZED;
  __kmp_release_tas_lock(&event->lock, gtid);

  if (!detached) {
    return;
  }

#if OMPT_SUPPORT
  // We free ptask afterwards and know the task is finished,
  // so locking is not necessary
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
#endif

  // The task detached: complete the proxy task. Use the direct path when the
  // caller is a thread of the task's team.
  if (gtid >= 0) {
    kmp_team_t *task_team_owner = taskdata->td_team;
    kmp_info_t *caller = __kmp_get_thread();
    if (caller->th.th_team == task_team_owner) {
      __kmpc_proxy_task_completed(gtid, ptask);
      return;
    }
  }

  // fallback
  __kmpc_proxy_task_completed_ooo(ptask);
}


// __kmp_task_dup_alloc: Allocate a new taskdata/task block and fill it with a
// byte copy of the source task, then fix up the fields that must differ in
// the copy. Used for taskloop.
//
// thread:   allocating thread
// task_src: pointer to source task to be duplicated
// taskloop_recur: used only when dealing with taskgraph,
//      indicating whether we need to update task->td_task_id
// returns:  a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
#if OMPX_TASKGRAPH
                                 , int taskloop_recur
#endif
) {
  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task

  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
                task_src));
  // it should not be proxy task
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy == TASK_FULL);
  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);

  const size_t task_size = taskdata_src->td_size_alloc;

  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
                task_size));
#if USE_FAST_MEMORY
  kmp_taskdata_t *taskdata =
      (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
#else
  kmp_taskdata_t *taskdata =
      (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
#endif /* USE_FAST_MEMORY */
  KMP_MEMCPY(taskdata, taskdata_src, task_size);

  kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);

  // Initialize new task (only specific fields not affected by memcpy)
#if OMPX_TASKGRAPH
  if (taskdata->is_taskgraph && !taskloop_recur &&
      __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
    taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
#endif
  taskdata->td_task_id = KMP_GEN_TASK_ID();

  if (task->shareds != NULL) {
    // The copied shareds pointer still refers into the source block; rebase
    // it so that it sits at the same offset inside the new block.
    const size_t shareds_offset =
        (char *)task_src->shareds - (char *)taskdata_src;
    task->shareds = (char *)taskdata + shareds_offset;
    KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
                     0);
  }

  taskdata->td_alloc_thread = thread;
  taskdata->td_parent = parent_task;
  // task inherits the taskgroup from the parent task
  taskdata->td_taskgroup = parent_task->td_taskgroup;
  // tied task needs to initialize the td_last_tied at creation,
  // untied one does this when it is scheduled for execution
  if (taskdata->td_flags.tiedness == TASK_TIED) {
    taskdata->td_last_tied = taskdata;
  }

  // Only need to keep track of child task counts if team parallel and tasking
  // not serialized
  const bool serialized =
      taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser;
  if (!serialized) {
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
    if (parent_task->td_taskgroup) {
      KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
    }
    // Only need to keep track of allocated child tasks for explicit tasks
    // since implicit not deallocated
    if (parent_task->td_flags.tasktype == TASK_EXPLICIT) {
      KMP_ATOMIC_INC(&parent_task->td_allocated_child_tasks);
    }
  }

  KA_TRACE(20,
           ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
            thread, taskdata, taskdata->td_parent));
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
#endif
  return task;
}


// Routine optionally generated by the compiler for setting the lastprivate
// flag and calling needed constructors for private/firstprivate objects
// (used to form taskloop tasks from the pattern task).
// Parameters, in order: destination task, source task, lastprivate flag.

typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);


// Compile-time check that long is 4 or 8 bytes wide.
// NOTE(review): presumably this backs the GOMP loop-bound accessors below,
// which only distinguish a 4-byte bound from an 8-byte one - confirm.
KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);


// Class that encapsulates reading and writing the loop bounds stored in a
// taskloop task. It abstracts away the Intel vs GOMP taskloop interface for
// setting/getting the loop bound variables:
//  - Intel-style tasks keep both bounds as 64-bit values at fixed byte
//    offsets from the start of the kmp_task_t;
//  - GOMP-style tasks (td_flags.native, only with KMP_GOMP_COMPAT) keep them
//    as the first two elements of task->shareds, each td_size_loop_bounds
//    bytes wide (4, or otherwise treated as 8).
class kmp_taskloop_bounds_t {
  kmp_task_t *task;
  const kmp_taskdata_t *taskdata;
  size_t lower_offset; // byte offset of the lower bound from `task`
  size_t upper_offset; // byte offset of the upper bound from `task`

public:
  // Build from a task and pointers to its two bound fields; the pointers are
  // converted to offsets relative to the task.
  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
      : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
        lower_offset((char *)lb - (char *)_task),
        upper_offset((char *)ub - (char *)_task) {
    KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
    KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
  }

  // Build for another task, reusing the offsets of an existing bounds object.
  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
      : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
        lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}

  size_t get_lower_offset() const { return lower_offset; }
  size_t get_upper_offset() const { return upper_offset; }

  // Read the lower bound. The value is read as signed and returned through
  // the unsigned return type.
  kmp_uint64 get_lb() const {
#if defined(KMP_GOMP_COMPAT)
    if (taskdata->td_flags.native) {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        return (kmp_int64)(RCAST(kmp_int32 *, task->shareds)[0]);
      }
      return (kmp_int64)(RCAST(kmp_int64 *, task->shareds)[0]);
    }
#else
    (void)taskdata;
#endif // defined(KMP_GOMP_COMPAT)
    // Intel task just returns the lower bound normally
    return *(kmp_int64 *)((char *)task + lower_offset);
  }

  // Read the upper bound. The value is read as signed and returned through
  // the unsigned return type.
  kmp_uint64 get_ub() const {
#if defined(KMP_GOMP_COMPAT)
    if (taskdata->td_flags.native) {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        return (kmp_int64)(RCAST(kmp_int32 *, task->shareds)[1]);
      }
      return (kmp_int64)(RCAST(kmp_int64 *, task->shareds)[1]);
    }
#endif // defined(KMP_GOMP_COMPAT)
    // Intel task just returns the upper bound normally
    return *(kmp_int64 *)((char *)task + upper_offset);
  }

  // Store the lower bound; a 4-byte GOMP bound receives the value truncated
  // to 32 bits.
  void set_lb(kmp_uint64 lb) {
#if defined(KMP_GOMP_COMPAT)
    if (taskdata->td_flags.native) {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        RCAST(kmp_uint32 *, task->shareds)[0] = (kmp_uint32)lb;
      } else {
        RCAST(kmp_uint64 *, task->shareds)[0] = (kmp_uint64)lb;
      }
      return;
    }
#endif // defined(KMP_GOMP_COMPAT)
    // Intel task just sets the lower bound normally
    *(kmp_uint64 *)((char *)task + lower_offset) = lb;
  }

  // Store the upper bound; a 4-byte GOMP bound receives the value truncated
  // to 32 bits.
  void set_ub(kmp_uint64 ub) {
#if defined(KMP_GOMP_COMPAT)
    if (taskdata->td_flags.native) {
      // GOMP task has to take into account the sizeof(long)
      if (taskdata->td_size_loop_bounds == 4) {
        RCAST(kmp_uint32 *, task->shareds)[1] = (kmp_uint32)ub;
      } else {
        RCAST(kmp_uint64 *, task->shareds)[1] = (kmp_uint64)ub;
      }
      return;
    }
#endif // defined(KMP_GOMP_COMPAT)
    // Intel task just sets the upper bound normally
    *(kmp_uint64 *)((char *)task + upper_offset) = ub;
  }
};


// __kmp_taskloop_linear: Start tasks of the taskloop linearly
//
// loc        Source location information
// gtid       Global thread ID
// task       Pattern task, exposes the loop iteration range
// lb         Pointer to loop lower bound in task structure
// ub         Pointer to loop upper bound in task structure
// st         Loop stride
// ub_glob    Global upper bound (used for lastprivate check)
// num_tasks  Number of tasks to execute
// grainsize  Number of loop iterations per task
// extras     Number of chunks with grainsize+1 iterations
// last_chunk Reduction of grainsize for last task
// tc         Iterations count
// task_dup   Tasks duplication routine
// codeptr_ra Return address for OMPT events
//
// For each of the num_tasks chunks the pattern task is duplicated, the copy
// gets its own [lower, upper] bounds, the optional task_dup routine is run on
// it, and the copy is scheduled. The pattern task itself is never executed;
// it only goes through start/finish bookkeeping at the end.
void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                           kmp_uint64 grainsize, kmp_uint64 extras,
                           kmp_int64 last_chunk, kmp_uint64 tc,
#if OMPT_SUPPORT
                           void *codeptr_ra,
#endif
                           void *task_dup) {
  KMP_COUNT_BLOCK(OMP_TASKLOOP);
  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  // compiler provides global bounds here
  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
  kmp_uint64 lower = task_bounds.get_lb();
  kmp_uint64 upper = task_bounds.get_ub();
  kmp_uint64 i;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  kmp_task_t *next_task;
  kmp_int32 lastpriv = 0;

  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
                             (last_chunk < 0 ? last_chunk : extras));
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  // ub_glob is 64 bits wide, so it is printed with %lld like its neighbours.
  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
                "extras %lld, last_chunk %lld, i=%lld,%lld(%lld)%lld, dup %p\n",
                gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
                ub_glob, st, task_dup));

  // Launch num_tasks tasks, assign grainsize iterations each task
  for (i = 0; i < num_tasks; ++i) {
    kmp_uint64 chunk_minus_1;
    if (extras == 0) {
      chunk_minus_1 = grainsize - 1;
    } else {
      chunk_minus_1 = grainsize;
      --extras; // first extras iterations get bigger chunk (grainsize+1)
    }
    upper = lower + st * chunk_minus_1;
    if (upper > *ub) {
      upper = *ub;
    }
    if (i == num_tasks - 1) {
      // schedule the last task, set lastprivate flag if needed
      if (st == 1) { // most common case
        KMP_DEBUG_ASSERT(upper == *ub);
        if (upper == ub_glob)
          lastpriv = 1;
      } else if (st > 0) { // positive loop stride
        KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
        if ((kmp_uint64)st > ub_glob - upper)
          lastpriv = 1;
      } else { // negative loop stride
        KMP_DEBUG_ASSERT(upper + st < *ub);
        if (upper - ub_glob < (kmp_uint64)(-st))
          lastpriv = 1;
      }
    }

#if OMPX_TASKGRAPH
    next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
#else
    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
#endif

    kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
    kmp_taskloop_bounds_t next_task_bounds =
        kmp_taskloop_bounds_t(next_task, task_bounds);

    // adjust task-specific bounds
    next_task_bounds.set_lb(lower);
    if (next_taskdata->td_flags.native) {
      // GOMP-style tasks get an upper bound one step past the last iteration
      next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
    } else {
      next_task_bounds.set_ub(upper);
    }
    if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
                           // etc.
      ptask_dup(next_task, task, lastpriv);
    KA_TRACE(40,
             ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
              "upper %lld stride %lld, (offsets %p %p)\n",
              gtid, i, next_task, lower, upper, st,
              next_task_bounds.get_lower_offset(),
              next_task_bounds.get_upper_offset()));
#if OMPT_SUPPORT
#if OMPT_OPTIONAL
    // Record the dispatch chunk while this thread still exclusively owns the
    // new task. Once the task has been scheduled it may run (and its taskdata
    // may be released) at any time, so it must not be written afterwards.
    if (ompt_enabled.ompt_callback_dispatch) {
      OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
                              lower, upper, st);
    }
#endif // OMPT_OPTIONAL
    __kmp_omp_taskloop_task(NULL, gtid, next_task,
                            codeptr_ra); // schedule new task
#else
    __kmp_omp_task(gtid, next_task, true); // schedule new task
#endif
    lower = upper + st; // adjust lower bound for the next iteration
  }
  // free the pattern task and exit
  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
  // do not execute the pattern task, just do internal bookkeeping
  __kmp_task_finish<false>(gtid, task, current_task);
}


// Structure to keep taskloop parameters for auxiliary task

// kept in the shareds of the task structure.


// Parameter pack stored in the shareds area of the auxiliary task created by
// __kmp_taskloop_recur. __kmp_taskloop_task reads every field back and
// forwards it to __kmp_taskloop_recur or __kmp_taskloop_linear.
typedef struct __taskloop_params {

  kmp_task_t *task; // pattern task describing the sub-range to process

  kmp_uint64 *lb; // pointer to the lower bound stored inside 'task'

  kmp_uint64 *ub; // pointer to the upper bound stored inside 'task'

  void *task_dup; // tasks duplication routine (may be NULL)

  kmp_int64 st; // loop stride

  kmp_uint64 ub_glob; // global upper bound (used for lastprivate check)

  kmp_uint64 num_tasks; // number of tasks to execute for the sub-range

  kmp_uint64 grainsize; // number of loop iterations per task

  kmp_uint64 extras; // number of chunks with grainsize+1 iterations

  kmp_int64 last_chunk; // reduction of grainsize for the last task

  kmp_uint64 tc; // iterations count of the sub-range

  kmp_uint64 num_t_min; // threshold to launch tasks recursively

#if OMPT_SUPPORT

  void *codeptr_ra; // return address for OMPT events

#endif

} __taskloop_params_t;


// Forward declaration: __kmp_taskloop_task (defined next) calls
// __kmp_taskloop_recur, whose definition follows it.
// Parameter order: loc, gtid, task, lb, ub, st, ub_glob, num_tasks, grainsize,
// extras, last_chunk, tc, num_t_min, [codeptr_ra when OMPT_SUPPORT,] task_dup.
void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,

                          kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,

                          kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,

                          kmp_uint64,

#if OMPT_SUPPORT

                          void *,

#endif

                          void *);


// Execute part of the taskloop submitted as a task.


int __kmp_taskloop_task(int gtid, void *ptask) {
  // Entry routine of the auxiliary task scheduled by __kmp_taskloop_recur.
  // gtid:    Global thread ID of the executing thread
  // ptask:   The auxiliary kmp_task_t; its shareds hold a __taskloop_params_t
  // returns: always 0

  // Unpack the parameters stored in the shareds area of the auxiliary task.
  __taskloop_params_t *p =
      (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
  kmp_task_t *task = p->task;
  kmp_uint64 *lb = p->lb;
  kmp_uint64 *ub = p->ub;
  void *task_dup = p->task_dup;
  kmp_int64 st = p->st;
  kmp_uint64 ub_glob = p->ub_glob;
  kmp_uint64 num_tasks = p->num_tasks;
  kmp_uint64 grainsize = p->grainsize;
  kmp_uint64 extras = p->extras;
  kmp_int64 last_chunk = p->last_chunk;
  kmp_uint64 tc = p->tc;
  kmp_uint64 num_t_min = p->num_t_min;
#if OMPT_SUPPORT
  void *codeptr_ra = p->codeptr_ra;
#endif
#if KMP_DEBUG
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  // st is a 64-bit value, so it must be printed with %lld (was %d).
  KA_TRACE(20,
           ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
            " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%lld), dup %p\n",
            gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
            st, task_dup));
#endif
  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
  // Keep splitting while the sub-range is above the threshold; otherwise
  // spawn the remaining tasks one by one.
  if (num_tasks > num_t_min)
    __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
                         grainsize, extras, last_chunk, tc, num_t_min,
#if OMPT_SUPPORT
                         codeptr_ra,
#endif
                         task_dup);
  else
    __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, last_chunk, tc,
#if OMPT_SUPPORT
                          codeptr_ra,
#endif
                          task_dup);

  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
  return 0;
}


// Schedule part of the taskloop as a task,

// execute the rest of the taskloop.

//

// loc        Source location information

// gtid       Global thread ID

// task       Pattern task, exposes the loop iteration range

// lb         Pointer to loop lower bound in task structure

// ub         Pointer to loop upper bound in task structure

// st         Loop stride

// ub_glob    Global upper bound (used for lastprivate check)

// num_tasks  Number of tasks to execute

// grainsize  Number of loop iterations per task

// extras     Number of chunks with grainsize+1 iterations

// last_chunk Reduction of grainsize for last task

// tc         Iterations count

// num_t_min  Threshold to launch tasks recursively

// task_dup   Tasks duplication routine

// codeptr_ra Return address for OMPT events


void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
                          kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                          kmp_uint64 ub_glob, kmp_uint64 num_tasks,
                          kmp_uint64 grainsize, kmp_uint64 extras,
                          kmp_int64 last_chunk, kmp_uint64 tc,
                          kmp_uint64 num_t_min,
#if OMPT_SUPPORT
                          void *codeptr_ra,
#endif
                          void *task_dup) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
  // st is a 64-bit value, so it must be printed with %lld (was %d).
  KA_TRACE(20,
           ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
            " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%lld), dup %p\n",
            gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
            st, task_dup));
  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
  kmp_uint64 lower = *lb;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_t *next_task;
  // Remember where lb/ub live inside the task structure so the same fields
  // can be located in the duplicated task.
  size_t lower_offset = (char *)lb - (char *)task;
  size_t upper_offset = (char *)ub - (char *)task;

  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
                             (last_chunk < 0 ? last_chunk : extras));
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);

  // split the loop in two halves
  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
  kmp_uint64 gr_size0 = grainsize;
  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
  if (last_chunk < 0) {
    // strict mode: no extras; the shortened last chunk goes to the 2nd half
    ext0 = ext1 = 0;
    last_chunk1 = last_chunk;
    tc0 = grainsize * n_tsk0;
    tc1 = tc - tc0;
  } else if (n_tsk0 <= extras) {
    gr_size0++; // integrate extras into grainsize
    ext0 = 0; // no extra iters in 1st half
    ext1 = extras - n_tsk0; // remaining extras
    tc0 = gr_size0 * n_tsk0;
    tc1 = tc - tc0;
  } else { // n_tsk0 > extras
    ext1 = 0; // no extra iters in 2nd half
    ext0 = extras;
    tc1 = grainsize * n_tsk1;
    tc0 = tc - tc1;
  }
  ub0 = lower + st * (tc0 - 1);
  lb1 = ub0 + st;

  // create pattern task for 2nd half of the loop
#if OMPX_TASKGRAPH
  next_task = __kmp_task_dup_alloc(thread, task,
                                   /* taskloop_recur */ 1);
#else
  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
#endif
  // adjust lower bound (upper bound is not changed) for the 2nd half
  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
  if (ptask_dup != NULL) // construct firstprivates, etc.
    ptask_dup(next_task, task, 0);
  *ub = ub0; // adjust upper bound for the 1st half

  // create auxiliary task for 2nd half of the loop
  // make sure new task has same parent task as the pattern task
  kmp_taskdata_t *current_task = thread->th.th_current_task;
  thread->th.th_current_task = taskdata->td_parent;
  kmp_task_t *new_task =
      __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
                            sizeof(__taskloop_params_t), &__kmp_taskloop_task);
  // restore current task
  thread->th.th_current_task = current_task;

  // hand the 2nd-half parameters over through the shareds of the new task
  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
  p->task = next_task;
  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
  p->task_dup = task_dup;
  p->st = st;
  p->ub_glob = ub_glob;
  p->num_tasks = n_tsk1;
  p->grainsize = grainsize;
  p->extras = ext1;
  p->last_chunk = last_chunk1;
  p->tc = tc1;
  p->num_t_min = num_t_min;
#if OMPT_SUPPORT
  p->codeptr_ra = codeptr_ra;
#endif

#if OMPX_TASKGRAPH
  kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
  new_task_data->tdg = taskdata->tdg;
  new_task_data->is_taskgraph = 0;
#endif

#if OMPT_SUPPORT
  // schedule new task with correct return address for OMPT events
  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
#else
  __kmp_omp_task(gtid, new_task, true); // schedule new task
#endif

  // execute the 1st half of current subrange
  if (n_tsk0 > num_t_min)
    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
                         ext0, last_chunk0, tc0, num_t_min,
#if OMPT_SUPPORT
                         codeptr_ra,
#endif
                         task_dup);
  else
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
                          gr_size0, ext0, last_chunk0, tc0,
#if OMPT_SUPPORT
                          codeptr_ra,
#endif
                          task_dup);

  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
}


// Common implementation behind __kmpc_taskloop and __kmpc_taskloop_5.
//
// loc        Source location information
// gtid       Global thread ID
// task       Pattern task, exposes the loop iteration range
// if_val     Value of the if clause (0 forces serial, linear spawning)
// lb         Pointer to loop lower bound in task structure
// ub         Pointer to loop upper bound in task structure
// st         Loop stride
// nogroup    0 to wrap the construct in an implicit taskgroup
// sched      0/1/2 for none/grainsize/num_tasks
// grainsize  Schedule value if specified
// modifier   Non-zero selects strict grainsize handling (sched == 1)
// task_dup   Tasks duplication routine
static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                           int nogroup, int sched, kmp_uint64 grainsize,
                           int modifier, void *task_dup) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  KMP_DEBUG_ASSERT(task != NULL);
  if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
    __kmpc_taskgroup(loc, gtid);
  }

#if OMPX_TASKGRAPH
  KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
#endif
  // =========================================================================
  // calculate loop parameters
  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
  kmp_uint64 tc;
  // compiler provides global bounds here
  kmp_uint64 lower = task_bounds.get_lb();
  kmp_uint64 upper = task_bounds.get_ub();
  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
  kmp_uint64 num_tasks = 0, extras = 0;
  kmp_int64 last_chunk =
      0; // reduce grainsize of last task by last_chunk in strict mode
  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *current_task = thread->th.th_current_task;

  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
                "grain %llu(%d, %d), dup %p\n",
                gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
                task_dup));

  // compute trip count
  if (st == 1) { // most common case
    tc = upper - lower + 1;
  } else if (st < 0) {
    tc = (lower - upper) / (-st) + 1;
  } else { // st > 0
    tc = (upper - lower) / st + 1;
  }
  if (tc == 0) {
    KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
    // free the pattern task and exit
    __kmp_task_start(gtid, task, current_task);
    // do not execute anything for zero-trip loop
    __kmp_task_finish<false>(gtid, task, current_task);
    // Close the implicit taskgroup opened at the top of this function;
    // returning without this would leave it open on the zero-trip path.
    if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
      OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
      __kmpc_end_taskgroup(loc, gtid);
    }
    return;
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (num_tasks_min == 0)
    // TODO: can we choose better default heuristic?
    num_tasks_min =
        KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);

  // compute num_tasks/grainsize based on the input provided
  switch (sched) {
  case 0: // no schedule clause specified, we can choose the default
    // let's try to schedule (team_size*10) tasks
    grainsize = thread->th.th_team_nproc * static_cast<kmp_uint64>(10);
    KMP_FALLTHROUGH();
  case 2: // num_tasks provided
    if (grainsize > tc) {
      num_tasks = tc; // too big num_tasks requested, adjust values
      grainsize = 1;
      extras = 0;
    } else {
      num_tasks = grainsize;
      grainsize = tc / num_tasks;
      extras = tc % num_tasks;
    }
    break;
  case 1: // grainsize provided
    if (grainsize > tc) {
      num_tasks = 1;
      grainsize = tc; // too big grainsize requested, adjust values
      extras = 0;
    } else {
      if (modifier) {
        // strict: every task gets exactly grainsize iterations except the
        // last one, which is shortened by -last_chunk
        num_tasks = (tc + grainsize - 1) / grainsize;
        last_chunk = tc - (num_tasks * grainsize);
        extras = 0;
      } else {
        num_tasks = tc / grainsize;
        // adjust grainsize for balanced distribution of iterations
        grainsize = tc / num_tasks;
        extras = tc % num_tasks;
      }
    }
    break;
  default:
    KMP_ASSERT2(0, "unknown scheduling of taskloop");
  }

  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
                             (last_chunk < 0 ? last_chunk : extras));
  KMP_DEBUG_ASSERT(num_tasks > extras);
  KMP_DEBUG_ASSERT(num_tasks > 0);
  // =========================================================================

  // check if clause value first
  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
  if (if_val == 0) { // if(0) specified, mark task as serial
    taskdata->td_flags.task_serial = 1;
    taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
    // always start serial tasks linearly
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, last_chunk, tc,
#if OMPT_SUPPORT
                          OMPT_GET_RETURN_ADDRESS(0),
#endif
                          task_dup);
    // !taskdata->td_flags.native => currently force linear spawning of tasks
    // for GOMP_taskloop
  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
    KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
                  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
                  last_chunk));
    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                         grainsize, extras, last_chunk, tc, num_tasks_min,
#if OMPT_SUPPORT
                         OMPT_GET_RETURN_ADDRESS(0),
#endif
                         task_dup);
  } else {
    KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
                  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
                  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
                  last_chunk));
    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
                          grainsize, extras, last_chunk, tc,
#if OMPT_SUPPORT
                          OMPT_GET_RETURN_ADDRESS(0),
#endif
                          task_dup);
  }

#if OMPT_SUPPORT && OMPT_OPTIONAL
  if (ompt_enabled.ompt_callback_work) {
    ompt_callbacks.ompt_callback(ompt_callback_work)(
        ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
        &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
  }
#endif

  if (nogroup == 0) {
#if OMPT_SUPPORT && OMPT_OPTIONAL
    OMPT_STORE_RETURN_ADDRESS(gtid);
#endif
    __kmpc_end_taskgroup(loc, gtid);
  }
  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
}


/*!

@ingroup TASKING

@param loc       Source location information

@param gtid      Global thread ID

@param task      Task structure

@param if_val    Value of the if clause

@param lb        Pointer to loop lower bound in task structure

@param ub        Pointer to loop upper bound in task structure

@param st        Loop stride

@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise

@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks

@param grainsize Schedule value if specified

@param task_dup  Tasks duplication routine


Execute the taskloop construct.

*/


void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
                     int sched, kmp_uint64 grainsize, void *task_dup) {
  // This entry point takes no modifier argument, so the shared
  // implementation is always called with modifier 0.
  const int no_modifier = 0;

  __kmp_assert_valid_gtid(gtid);
  KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
                 no_modifier, task_dup);
  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
}


/*!

@ingroup TASKING

@param loc       Source location information

@param gtid      Global thread ID

@param task      Task structure

@param if_val    Value of the if clause

@param lb        Pointer to loop lower bound in task structure

@param ub        Pointer to loop upper bound in task structure

@param st        Loop stride

@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise

@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks

@param grainsize Schedule value if specified

@param modifier  Modifier 'strict' for sched, 1 if present, 0 otherwise

@param task_dup  Tasks duplication routine


Execute the taskloop construct.

*/


void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                       kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                       int nogroup, int sched, kmp_uint64 grainsize,
                       int modifier, void *task_dup) {
  // Validate the caller's gtid before anything else touches thread state.
  __kmp_assert_valid_gtid(gtid);

  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));

  // Same as __kmpc_taskloop, except that the caller-supplied modifier is
  // forwarded to the shared implementation.
  const int sched_modifier = modifier;
  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
                 sched_modifier, task_dup);

  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
}


/*!

@ingroup TASKING

@param gtid Global Thread ID of current thread

@return Returns a pointer to the thread's current task async handle. If no task

is present or gtid is invalid, returns NULL.


Acquires a pointer to the target async handle from the current task.

*/


void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid) {
  // NULL is returned unless a current task can be found for this gtid.
  void **handle_ptr = NULL;

  if (gtid != KMP_GTID_DNE) {
    kmp_taskdata_t *curr_task =
        __kmp_thread_from_gtid(gtid)->th.th_current_task;
    if (curr_task)
      handle_ptr = &curr_task->td_target_data.async_handle;
  }
  return handle_ptr;
}


/*!

@ingroup TASKING

@param gtid Global Thread ID of current thread

@return Returns TRUE if the current task being executed of the given thread has

a task team allocated to it. Otherwise, returns FALSE.


Checks if the current thread has a task team.

*/


bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
  if (gtid != KMP_GTID_DNE) {
    kmp_taskdata_t *curr_task =
        __kmp_thread_from_gtid(gtid)->th.th_current_task;
    // Only a thread that has a current task can report a task team.
    if (curr_task)
      return curr_task->td_task_team != NULL;
  }
  // gtid does not exist, or the thread has no current task
  return FALSE;
}


#if OMPX_TASKGRAPH

// __kmp_find_tdg: identify a TDG through its ID

// tdg_id: ID of the TDG

// returns: If a TDG corresponding to this ID is found and is not in

// its initial state, return the pointer to it, otherwise nullptr

static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
  // Taskgraph recording is disabled: there is nothing to look up.
  if (__kmp_max_tdgs == 0)
    return nullptr;

  // The global table is created lazily on the first lookup. This is done
  // before the range check so the table exists after any call.
  if (__kmp_global_tdgs == NULL)
    __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
        sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);

  // The table has __kmp_max_tdgs slots; an ID outside of it cannot name a
  // recorded TDG and must not be used as an index.
  if (tdg_id < 0 || tdg_id >= __kmp_max_tdgs)
    return nullptr;

  kmp_tdg_info_t *entry = __kmp_global_tdgs[tdg_id];
  if (entry && entry->tdg_status != KMP_TDG_NONE)
    return entry;
  return nullptr;
}


// __kmp_print_tdg_dot: prints the TDG to a dot file

// tdg:    Pointer to the TDG

// gtid:   Global Thread ID

void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg, kmp_int32 gtid) {
  kmp_int32 tdg_id = tdg->tdg_id;
  KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id));

  // Output goes to "tdg_<id>.dot". The bounded formatter keeps the write
  // inside file_name whatever value tdg_id has.
  char file_name[20];
  snprintf(file_name, sizeof(file_name), "tdg_%d.dot", tdg_id);
  kmp_safe_raii_file_t tdg_file(file_name, "w");

  kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);

  // Header plus one bold node per recorded task, grouped in a cluster.
  fprintf(tdg_file,
          "digraph TDG {\n"
          "   compound=true\n"
          "   subgraph cluster {\n"
          "      label=TDG_%d\n",
          tdg_id);
  for (kmp_int32 i = 0; i < num_tasks; i++) {
    fprintf(tdg_file, "      %d[style=bold]\n", i);
  }
  fprintf(tdg_file, "   }\n");

  // One edge per (task, successor) pair; a task without successors
  // contributes no edges.
  for (kmp_int32 i = 0; i < num_tasks; i++) {
    kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
    kmp_int32 *successors = tdg->record_map[i].successors;
    for (kmp_int32 j = 0; j < nsuccessors; j++)
      fprintf(tdg_file, "   %d -> %d \n", i, successors[j]);
  }
  fprintf(tdg_file, "}");
  KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id));
}


// __kmp_exec_tdg: launch the execution of a previously

// recorded TDG

// gtid:   Global Thread ID

// tdg:    Pointer to the TDG

void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
  KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
                tdg->tdg_id, tdg->num_roots));

  // Snapshot the recorded graph description.
  kmp_node_info_t *nodes = tdg->record_map;
  kmp_int32 *root_ids = tdg->root_tasks;
  kmp_int32 root_count = tdg->num_roots;
  kmp_int32 task_count = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);

  // Every replayed task becomes a child of the encountering thread's
  // current task.
  kmp_taskdata_t *parent_task = __kmp_threads[gtid]->th.th_current_task;

  if (tdg->rec_taskred_data) {
    __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
  }

  // Pass 1: re-attach each recorded task to the new parent and reset its
  // predecessor counter.
  for (kmp_int32 idx = 0; idx < task_count; idx++) {
    kmp_node_info_t *node = &nodes[idx];
    kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(node->task);

    td->td_parent = parent_task;
    node->parent_task = parent_task;

    kmp_taskgroup_t *parent_taskgroup = parent_task->td_taskgroup;

    KMP_ATOMIC_ST_RLX(&node->npredecessors_counter, node->npredecessors);
    KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);

    if (parent_taskgroup) {
      KMP_ATOMIC_INC(&parent_taskgroup->count);
      // The taskgroup is different so we must update it
      td->td_taskgroup = parent_taskgroup;
    } else if (td->td_taskgroup != nullptr) {
      // If the parent doesnt have a taskgroup, remove it from the task
      td->td_taskgroup = nullptr;
    }

    if (parent_task->td_flags.tasktype == TASK_EXPLICIT)
      KMP_ATOMIC_INC(&parent_task->td_allocated_child_tasks);
  }

  // Pass 2: schedule the roots (the tasks recorded without predecessors).
  for (kmp_int32 r = 0; r < root_count; ++r) {
    __kmp_omp_task(gtid, nodes[root_ids[r]].task, true);
  }
  KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
                tdg->tdg_id, tdg->num_roots));
}


// __kmp_start_record: set up a TDG structure and turn the

// recording flag to true

// gtid:        Global Thread ID of the encountering thread

// flags:       Flags associated with the TDG

// tdg_id:      ID of the TDG to record

static inline void __kmp_start_record(kmp_int32 gtid,
                                      kmp_taskgraph_flags_t *flags,
                                      kmp_int32 tdg_id) {
  // Allocate the TDG descriptor and store it in the slot selected by
  // __kmp_curr_tdg_idx.
  kmp_tdg_info_t *new_tdg =
      (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
  __kmp_global_tdgs[__kmp_curr_tdg_idx] = new_tdg;

  // Descriptor starts out empty and in the recording state.
  new_tdg->tdg_id = tdg_id;
  new_tdg->map_size = INIT_MAPSIZE;
  new_tdg->num_roots = -1;
  new_tdg->root_tasks = nullptr;
  new_tdg->tdg_status = KMP_TDG_RECORDING;
  new_tdg->rec_num_taskred = 0;
  new_tdg->rec_taskred_data = nullptr;
  KMP_ATOMIC_ST_RLX(&new_tdg->num_tasks, 0);

  // Node table: INIT_MAPSIZE entries, each with its own successor array of
  // __kmp_successors_size slots.
  kmp_node_info_t *nodes =
      (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
  for (kmp_int32 idx = 0; idx < INIT_MAPSIZE; ++idx) {
    kmp_node_info_t *node = &nodes[idx];
    kmp_int32 *succ_storage =
        (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
    node->task = nullptr;
    node->successors = succ_storage;
    node->nsuccessors = 0;
    node->npredecessors = 0;
    node->successors_size = __kmp_successors_size;
    KMP_ATOMIC_ST_RLX(&node->npredecessors_counter, 0);
  }

  // Attach the node table through the global slot, as the last step.
  __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = nodes;
}


// __kmpc_start_record_task: Wrapper around __kmp_start_record to mark

// the beginning of the record process of a task region

// loc_ref:     Location of TDG, not used yet

// gtid:        Global Thread ID of the encountering thread

// input_flags: Flags associated with the TDG

// tdg_id:      ID of the TDG to record, for now, incremental integer

// returns:     1 if we record, otherwise, 0

kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
                                   kmp_int32 input_flags, kmp_int32 tdg_id) {
  // The integer flag word is reinterpreted as the taskgraph flag structure.
  kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
  KA_TRACE(10,
           ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
            gtid, loc_ref, input_flags, tdg_id));

  // With no TDG slots configured nothing is recorded or replayed; 1 is
  // returned to the caller.
  if (__kmp_max_tdgs == 0) {
    KA_TRACE(
        10,
        ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
         "__kmp_max_tdgs = 0\n",
         gtid, loc_ref, input_flags, tdg_id));
    return 1;
  }

  __kmpc_taskgroup(loc_ref, gtid);

  kmp_tdg_info_t *known_tdg = __kmp_find_tdg(tdg_id);
  kmp_int32 res = (known_tdg != nullptr) ? 0 : 1;
  if (known_tdg != nullptr) {
    // Already recorded: replay it.
    // TODO: use re_record flag
    __kmp_exec_tdg(gtid, known_tdg);
  } else {
    // First encounter: start recording into the slot named by tdg_id.
    __kmp_curr_tdg_idx = tdg_id;
    KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
    __kmp_start_record(gtid, flags, tdg_id);
    __kmp_num_tdg++;
  }
  KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
                gtid, tdg_id, res ? "record" : "execute"));
  return res;
}


// __kmp_end_record: set up a TDG after recording it

// gtid:   Global thread ID

// tdg:    Pointer to the TDG

void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
  kmp_node_info_t *nodes = tdg->record_map;
  kmp_int32 task_count = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
  // Sized for the case where every recorded task is a root.
  kmp_int32 *root_ids =
      (kmp_int32 *)__kmp_allocate(task_count * sizeof(kmp_int32));
  kmp_int32 map_size = tdg->map_size;
  kmp_int32 root_count = 0;
  kmp_info_t *thread = __kmp_threads[gtid];

  // A root is a task that was recorded with no predecessors.
  for (kmp_int32 idx = 0; idx < task_count; ++idx) {
    if (nodes[idx].npredecessors != 0)
      continue;
    root_ids[root_count] = idx;
    ++root_count;
  }

  // Update with roots info and mapsize
  tdg->map_size = map_size;
  tdg->num_roots = root_count;
  tdg->root_tasks = root_ids;
  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
  tdg->tdg_status = KMP_TDG_READY;

  // Free the dependence hash of the current task, if it has one.
  if (thread->th.th_current_task->td_dephash) {
    __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
    thread->th.th_current_task->td_dephash = NULL;
  }

  // Re-arm every node: its counter restarts at the recorded predecessor count.
  for (kmp_int32 idx = 0; idx < task_count; ++idx) {
    kmp_node_info_t *node = &nodes[idx];
    KMP_ATOMIC_ST_RLX(&node->npredecessors_counter, node->npredecessors);
  }
  KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);

  // Optional dump of the graph to a dot file.
  if (__kmp_tdg_dot)
    __kmp_print_tdg_dot(tdg, gtid);
}


// __kmpc_end_record_task: wrapper around __kmp_end_record to mark

// the end of recording phase

//

// loc_ref:      Source location information

// gtid:         Global thread ID

// input_flags:  Flags attached to the graph

// tdg_id:       ID of the TDG just finished recording

void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
                            kmp_int32 input_flags, kmp_int32 tdg_id) {
  // May be nullptr: __kmp_find_tdg returns nullptr when no usable TDG is
  // registered under tdg_id.
  kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);

  KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
                " tdg=%d with flags=%d\n",
                gtid, loc_ref, tdg_id, input_flags));
  if (__kmp_max_tdgs) {
    // TODO: use input_flags->nowait
    __kmpc_end_taskgroup(loc_ref, gtid);
    // Finalize only a TDG that exists and is still in its recording phase;
    // the null check avoids dereferencing a failed lookup.
    if (tdg != nullptr && __kmp_tdg_is_recording(tdg->tdg_status))
      __kmp_end_record(gtid, tdg);
  }
  KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
                " tdg=%d, its status is now READY\n",
                gtid, loc_ref, tdg_id));
}

#endif

target
void * target(void *task)
Definition bug_nested_proxy_task.c:99

kmp_uint8
uint8_t kmp_uint8
Definition bug_nested_proxy_task.c:26

task_entry
int task_entry(kmp_int32 gtid, kmp_task_t *task)
Definition bug_nested_proxy_task.c:109

ptask
struct task * ptask

result
int result[2]
Definition cancellation_for_sections.c:13

kmp_atomic_flag_64
Definition kmp_wait_release.h:898

kmp_flag_32
Definition kmp_wait_release.h:827

kmp_flag_32::execute_tasks
int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained)
Definition kmp_wait_release.h:840

kmp_flag_64
Definition kmp_wait_release.h:861

kmp_flag_oncore
Definition kmp_wait_release.h:935

kmp_safe_raii_file_t
This class safely opens and closes a C-style FILE* object using RAII semantics.
Definition kmp.h:4719

kmp_taskloop_bounds_t
Definition kmp_tasking.cpp:4519

kmp_taskloop_bounds_t::get_ub
kmp_uint64 get_ub() const
Definition kmp_tasking.cpp:4560

kmp_taskloop_bounds_t::set_ub
void set_ub(kmp_uint64 ub)
Definition kmp_tasking.cpp:4600

kmp_taskloop_bounds_t::get_lower_offset
size_t get_lower_offset() const
Definition kmp_tasking.cpp:4536

kmp_taskloop_bounds_t::set_lb
void set_lb(kmp_uint64 lb)
Definition kmp_tasking.cpp:4581

kmp_taskloop_bounds_t::get_upper_offset
size_t get_upper_offset() const
Definition kmp_tasking.cpp:4537

kmp_taskloop_bounds_t::kmp_taskloop_bounds_t
kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
Definition kmp_tasking.cpp:4533

kmp_taskloop_bounds_t::kmp_taskloop_bounds_t
kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
Definition kmp_tasking.cpp:4526

kmp_taskloop_bounds_t::get_lb
kmp_uint64 get_lb() const
Definition kmp_tasking.cpp:4538

kmp_int64
int64_t kmp_int64
Definition common.h:10

kmp_routine_entry_t
kmp_int32(* kmp_routine_entry_t)(kmp_int32, void *)
Definition kmp.h:2483

kmp_task_t
struct kmp_task kmp_task_t

kmp_taskred_data_t
struct kmp_taskred_data kmp_taskred_data_t
Internal struct for reduction data item related info saved by the library.

kmp_task_red_input_t
struct kmp_task_red_input kmp_task_red_input_t
Internal struct for reduction data item related info set up by compiler.

kmp_taskred_flags_t
struct kmp_taskred_flags kmp_taskred_flags_t
Flags for special info per task reduction item.

kmp_taskred_input_t
struct kmp_taskred_input kmp_taskred_input_t
Internal struct for reduction data item related info set up by compiler.

__kmpc_task_reduction_get_th_data
void * __kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data)
Definition kmp_tasking.cpp:2436

__kmpc_taskloop
void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int sched, kmp_uint64 grainsize, void *task_dup)
Definition kmp_tasking.cpp:5154

__kmpc_task_reduction_modifier_init
void * __kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, int num, void *data)
Definition kmp_tasking.cpp:2615

__kmpc_taskred_modifier_init
void * __kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num, void *data)
Definition kmp_tasking.cpp:2635

__kmpc_omp_has_task_team
bool __kmpc_omp_has_task_team(kmp_int32 gtid)
Definition kmp_tasking.cpp:5221

__kmpc_proxy_task_completed_ooo
void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask)
Definition kmp_tasking.cpp:4344

__kmpc_task_reduction_modifier_fini
void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws)
Definition kmp_tasking.cpp:2649

__kmpc_omp_reg_task_with_affinity
kmp_int32 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 naffins, kmp_task_affinity_info_t *affin_list)
Definition kmp_tasking.cpp:1505

__kmpc_taskloop_5
void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int sched, kmp_uint64 grainsize, int modifier, void *task_dup)
Definition kmp_tasking.cpp:5181

__kmpc_task_reduction_init
void * __kmpc_task_reduction_init(int gtid, int num, void *data)
Definition kmp_tasking.cpp:2366

__kmpc_proxy_task_completed
void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask)
Definition kmp_tasking.cpp:4280

__kmpc_taskred_init
void * __kmpc_taskred_init(int gtid, int num, void *data)
Definition kmp_tasking.cpp:2393

__kmpc_omp_get_target_async_handle_ptr
void ** __kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid)
Definition kmp_tasking.cpp:5200

void
void
Definition ittnotify.h:3324

data
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain __itt_id ITT_FORMAT p const __itt_domain __itt_id __itt_timestamp __itt_timestamp ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain ITT_FORMAT p const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_id __itt_string_handle __itt_metadata_type size_t void * data
Definition ittnotify_static.h:415

new_size
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t new_size
Definition ittnotify_static.h:322

instance
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance * instance
Definition ittnotify_static.h:217

count
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t count
Definition ittnotify_static.h:282

event
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain __itt_id ITT_FORMAT p const __itt_domain __itt_id __itt_timestamp __itt_timestamp ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain ITT_FORMAT p const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_id __itt_string_handle __itt_metadata_type size_t void ITT_FORMAT p const __itt_domain __itt_id __itt_string_handle const wchar_t size_t ITT_FORMAT lu const __itt_domain __itt_id __itt_relation __itt_id ITT_FORMAT p const wchar_t int ITT_FORMAT __itt_group_mark d __itt_event event
Definition ittnotify_static.h:462

parent
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id parent
Definition ittnotify_static.h:349

size
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t size
Definition ittnotify_static.h:198

p
void const char const char int ITT_FORMAT __itt_group_sync p
Definition ittnotify_static.h:154

tail
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain __itt_id ITT_FORMAT p const __itt_domain __itt_id __itt_timestamp __itt_timestamp ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain ITT_FORMAT p const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_id __itt_string_handle __itt_metadata_type size_t void ITT_FORMAT p const __itt_domain __itt_id __itt_string_handle const wchar_t size_t ITT_FORMAT lu const __itt_domain __itt_id __itt_relation __itt_id tail
Definition ittnotify_static.h:443

kmp.h

TASK_UNTIED
#define TASK_UNTIED
Definition kmp.h:40

TASK_NOT_PUSHED
#define TASK_NOT_PUSHED
Definition kmp.h:37

__kmp_free
#define __kmp_free(ptr)
Definition kmp.h:3759

__kmp_hidden_helper_main_thread
kmp_info_t * __kmp_hidden_helper_main_thread
Definition kmp_runtime.cpp:9223

TASK_PROXY
#define TASK_PROXY
Definition kmp.h:43

KMP_CPU_PAUSE
#define KMP_CPU_PAUSE()
Definition kmp.h:1597

__kmp_global
kmp_global_t __kmp_global
Definition kmp_global.cpp:464

TASK_EXPLICIT
#define TASK_EXPLICIT
Definition kmp.h:41

kmp_task_team_t
union kmp_task_team kmp_task_team_t
Definition kmp.h:256

__kmp_hidden_helper_worker_thread_signal
void __kmp_hidden_helper_worker_thread_signal()
Definition z_Linux_util.cpp:3065

KMP_YIELD_OVERSUB_ELSE_SPIN
#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time)
Definition kmp.h:1673

KMP_MAX_BLOCKTIME
#define KMP_MAX_BLOCKTIME
Definition kmp.h:1257

INITIAL_TASK_DEQUE_SIZE
#define INITIAL_TASK_DEQUE_SIZE
Definition kmp.h:2835

kmp_taskgroup_t
struct kmp_taskgroup kmp_taskgroup_t

KMP_TASKDATA_TO_TASK
#define KMP_TASKDATA_TO_TASK(taskdata)
Definition kmp.h:2470

KMP_NOT_SAFE_TO_REAP
#define KMP_NOT_SAFE_TO_REAP
Definition kmp.h:2151

TASK_DEQUE_MASK
#define TASK_DEQUE_MASK(td)
Definition kmp.h:2838

__kmp_get_random
unsigned short __kmp_get_random(kmp_info_t *thread)
Definition kmp_runtime.cpp:3642

TASK_FULL
#define TASK_FULL
Definition kmp.h:44

__kmp_tasking_mode
kmp_tasking_mode_t __kmp_tasking_mode
Definition kmp_global.cpp:292

__kmp_abort_thread
void __kmp_abort_thread(void)
Definition kmp_runtime.cpp:495

__kmp_dflt_blocktime
int __kmp_dflt_blocktime
Definition kmp_global.cpp:156

__kmp_thread_pool
volatile kmp_info_t * __kmp_thread_pool
Definition kmp_global.cpp:455

KMP_GEN_TASK_ID
#define KMP_GEN_TASK_ID()
Definition kmp.h:3683

__kmp_task_team_lock
kmp_bootstrap_lock_t __kmp_task_team_lock
Definition kmp_tasking.cpp:3546

__kmp_omp_cancellation
int __kmp_omp_cancellation
Definition kmp_global.cpp:213

TASK_DEQUE_SIZE
#define TASK_DEQUE_SIZE(td)
Definition kmp.h:2837

KMP_GTID_TO_SHADOW_GTID
#define KMP_GTID_TO_SHADOW_GTID(gtid)
Definition kmp.h:4612

__kmp_get_thread
#define __kmp_get_thread()
Definition kmp.h:3607

kmp_depnode_t
union kmp_depnode kmp_depnode_t
Definition kmp.h:2524

TASK_CURRENT_NOT_QUEUED
#define TASK_CURRENT_NOT_QUEUED
Definition kmp.h:34

__kmp_tid_from_gtid
static int __kmp_tid_from_gtid(int gtid)
Definition kmp.h:3622

KMP_MIN
#define KMP_MIN(x, y)
Definition kmp.h:303

__kmp_init_hidden_helper
volatile int __kmp_init_hidden_helper
Definition kmp_global.cpp:50

KMP_EVENT_UNINITIALIZED
@ KMP_EVENT_UNINITIALIZED
Definition kmp.h:2621

KMP_EVENT_ALLOW_COMPLETION
@ KMP_EVENT_ALLOW_COMPLETION
Definition kmp.h:2622

__kmp_init_middle
volatile int __kmp_init_middle
Definition kmp_global.cpp:48

TASK_DETACHABLE
#define TASK_DETACHABLE
Definition kmp.h:45

cancel_parallel
@ cancel_parallel
Definition kmp.h:984

cancel_taskgroup
@ cancel_taskgroup
Definition kmp.h:987

cancel_noreq
@ cancel_noreq
Definition kmp.h:983

KMP_CHECK_UPDATE
#define KMP_CHECK_UPDATE(a, b)
Definition kmp.h:2386

TASK_IMPLICIT
#define TASK_IMPLICIT
Definition kmp.h:42

KMP_TASK_TO_TASKDATA
#define KMP_TASK_TO_TASKDATA(task)
Definition kmp.h:2469

kmp_thread_data_t
union KMP_ALIGN_CACHE kmp_thread_data kmp_thread_data_t

TASK_SUCCESSFULLY_PUSHED
#define TASK_SUCCESSFULLY_PUSHED
Definition kmp.h:38

kmp_task_affinity_info_t
struct kmp_task_affinity_info kmp_task_affinity_info_t

TASK_TIED
#define TASK_TIED
Definition kmp.h:39

__kmp_thread_malloc
#define __kmp_thread_malloc(th, size)
Definition kmp.h:3779

__kmp_middle_initialize
void __kmp_middle_initialize(void)
Definition kmp_runtime.cpp:7479

copy_icvs
static void copy_icvs(kmp_internal_control_t *dst, kmp_internal_control_t *src)
Definition kmp.h:2217

KMP_TASKING_ENABLED
#define KMP_TASKING_ENABLED(task_team)
Definition kmp.h:2474

__kmp_threads
kmp_info_t ** __kmp_threads
Definition kmp_global.cpp:447

KMP_HIDDEN_HELPER_THREAD
#define KMP_HIDDEN_HELPER_THREAD(gtid)
Definition kmp.h:4598

__kmp_enable_task_throttling
int __kmp_enable_task_throttling
Definition kmp_global.cpp:353

__kmp_task_stealing_constraint
int __kmp_task_stealing_constraint
Definition kmp_global.cpp:352

kmp_team_t
union kmp_team kmp_team_t
Definition kmp.h:254

KMP_INIT_YIELD
#define KMP_INIT_YIELD(count)
Definition kmp.h:1600

KMP_INIT_BACKOFF
#define KMP_INIT_BACKOFF(time)
Definition kmp.h:1603

KMP_YIELD
#define KMP_YIELD(cond)
Definition kmp.h:1615

__kmp_init_parallel
volatile int __kmp_init_parallel
Definition kmp_global.cpp:49

__kmp_enable_hidden_helper
kmp_int32 __kmp_enable_hidden_helper
Definition kmp_runtime.cpp:9230

__kmp_allocate
#define __kmp_allocate(size)
Definition kmp.h:3757

TRUE
#define TRUE
Definition kmp.h:1353

__kmp_library
enum library_type __kmp_library
Definition kmp_global.cpp:141

FALSE
#define FALSE
Definition kmp.h:1352

kmp_tasking_flags_t
struct kmp_tasking_flags kmp_tasking_flags_t

tskm_extra_barrier
@ tskm_extra_barrier
Definition kmp.h:2451

tskm_task_teams
@ tskm_task_teams
Definition kmp.h:2452

tskm_immediate_exec
@ tskm_immediate_exec
Definition kmp.h:2450

kmp_task_pri_t
struct kmp_task_pri kmp_task_pri_t

UNLIKELY
#define UNLIKELY(x)
Definition kmp.h:153

__kmp_taskloop_min_tasks
kmp_uint64 __kmp_taskloop_min_tasks
Definition kmp_global.cpp:294

__kmp_unexecuted_hidden_helper_tasks
std::atomic< kmp_int32 > __kmp_unexecuted_hidden_helper_tasks
Definition kmp_runtime.cpp:9224

__kmp_hidden_helper_threads
kmp_info_t ** __kmp_hidden_helper_threads
Definition kmp_runtime.cpp:9222

__kmp_wpolicy_passive
bool __kmp_wpolicy_passive
Definition kmp_global.cpp:158

bs_forkjoin_barrier
@ bs_forkjoin_barrier
Definition kmp.h:2167

__kmp_hidden_helper_initialize
void __kmp_hidden_helper_initialize()
Definition kmp_runtime.cpp:7565

__kmp_get_gtid
#define __kmp_get_gtid()
Definition kmp.h:3603

__kmp_max_task_priority
kmp_int32 __kmp_max_task_priority
Definition kmp_global.cpp:293

__kmp_assert_valid_gtid
static void __kmp_assert_valid_gtid(kmp_int32 gtid)
Definition kmp.h:3647

__kmp_thread_from_gtid
static kmp_info_t * __kmp_thread_from_gtid(int gtid)
Definition kmp.h:3637

__kmp_gtid_from_thread
static int __kmp_gtid_from_thread(const kmp_info_t *thr)
Definition kmp.h:3632

library_throughput
@ library_throughput
Definition kmp.h:504

kmp_taskdata_t
struct kmp_taskdata kmp_taskdata_t
Definition kmp.h:255

KMP_GTID_DNE
#define KMP_GTID_DNE
Definition kmp.h:1012

kmp_info_t
union KMP_ALIGN_CACHE kmp_info kmp_info_t

__kmp_thread_free
#define __kmp_thread_free(th, ptr)
Definition kmp.h:3785

char
char
Definition kmp_atomic.cpp:1112

kmp_uint32
KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86<<, 2i, 1, KMP_ARCH_X86) ATOMIC_CMPXCHG(fixed2, shr, kmp_int16, 16, > KMP_ARCH_X86 KMP_ARCH_X86 kmp_uint32
Definition kmp_atomic.cpp:1031

KE_TRACE
#define KE_TRACE(d, x)
Definition kmp_debug.h:161

KA_TRACE
#define KA_TRACE(d, x)
Definition kmp_debug.h:157

KMP_DEBUG_USE_VAR
#define KMP_DEBUG_USE_VAR(x)
Definition kmp_debug.h:63

KMP_ASSERT
#define KMP_ASSERT(cond)
Definition kmp_debug.h:59

KMP_BUILD_ASSERT
#define KMP_BUILD_ASSERT(expr)
Definition kmp_debug.h:26

KF_TRACE
#define KF_TRACE(d, x)
Definition kmp_debug.h:162

KMP_DEBUG_ASSERT
#define KMP_DEBUG_ASSERT(cond)
Definition kmp_debug.h:61

KMP_ASSERT2
#define KMP_ASSERT2(cond, msg)
Definition kmp_debug.h:60

kmp_uint64
unsigned long long kmp_uint64
Definition kmp_detach_tasks_t1.c:12

status
static volatile kmp_i18n_cat_status_t status
Definition kmp_i18n.cpp:48

kmp_i18n.h

kmp_itt.h

KMP_FSYNC_RELEASING
#define KMP_FSYNC_RELEASING(obj)
Definition kmp_itt.h:335

KMP_FSYNC_ACQUIRED
#define KMP_FSYNC_ACQUIRED(obj)
Definition kmp_itt.h:334

KMP_FSYNC_SPIN_ACQUIRED
#define KMP_FSYNC_SPIN_ACQUIRED(obj)
Definition kmp_itt.h:339

KMP_FSYNC_CANCEL
#define KMP_FSYNC_CANCEL(obj)
Definition kmp_itt.h:333

KMP_FSYNC_SPIN_PREPARE
#define KMP_FSYNC_SPIN_PREPARE(obj)
Definition kmp_itt.h:338

USE_ITT_BUILD_ARG
#define USE_ITT_BUILD_ARG(x)
Definition kmp_itt.h:346

KMP_FSYNC_SPIN_INIT
#define KMP_FSYNC_SPIN_INIT(obj, spin)
Definition kmp_itt.h:337

__kmp_acquire_tas_lock
int __kmp_acquire_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid)
Definition kmp_lock.cpp:118

__kmp_init_tas_lock
void __kmp_init_tas_lock(kmp_tas_lock_t *lck)
Definition kmp_lock.cpp:186

__kmp_release_tas_lock
int __kmp_release_tas_lock(kmp_tas_lock_t *lck, kmp_int32 gtid)
Definition kmp_lock.cpp:157

__kmp_release_bootstrap_lock
static void __kmp_release_bootstrap_lock(kmp_bootstrap_lock_t *lck)
Definition kmp_lock.h:535

kmp_bootstrap_lock_t
kmp_ticket_lock_t kmp_bootstrap_lock_t
Definition kmp_lock.h:521

__kmp_test_lock
static int __kmp_test_lock(kmp_lock_t *lck, kmp_int32 gtid)
Definition kmp_lock.h:563

__kmp_acquire_bootstrap_lock
static int __kmp_acquire_bootstrap_lock(kmp_bootstrap_lock_t *lck)
Definition kmp_lock.h:527

__kmp_release_lock
static void __kmp_release_lock(kmp_lock_t *lck, kmp_int32 gtid)
Definition kmp_lock.h:567

__kmp_init_bootstrap_lock
static void __kmp_init_bootstrap_lock(kmp_bootstrap_lock_t *lck)
Definition kmp_lock.h:539

KMP_BOOTSTRAP_LOCK_INITIALIZER
#define KMP_BOOTSTRAP_LOCK_INITIALIZER(lock)
Definition kmp_lock.h:523

TCW_PTR
#define TCW_PTR(a, b)
Definition kmp_os.h:1167

KMP_ATOMIC_AND
#define KMP_ATOMIC_AND(p, v)
Definition kmp_os.h:1267

KMP_SIZE_T_MAX
#define KMP_SIZE_T_MAX
Definition kmp_os.h:195

kmp_uint
kmp_uint32 kmp_uint
Definition kmp_os.h:215

KMP_ATOMIC_ST_REL
#define KMP_ATOMIC_ST_REL(p, v)
Definition kmp_os.h:1261

__kmp_atomic_compare_store
bool __kmp_atomic_compare_store(std::atomic< T > *p, T expected, T desired)
Definition kmp_os.h:1276

TCR_PTR
#define TCR_PTR(a)
Definition kmp_os.h:1166

RCAST
#define RCAST(type, var)
Definition kmp_os.h:292

CACHE_LINE
#define CACHE_LINE
Definition kmp_os.h:340

KMP_ATOMIC_LD_ACQ
#define KMP_ATOMIC_LD_ACQ(p)
Definition kmp_os.h:1259

TCW_SYNC_4
#define TCW_SYNC_4(a, b)
Definition kmp_os.h:1146

KMP_ATOMIC_ST_RLX
#define KMP_ATOMIC_ST_RLX(p, v)
Definition kmp_os.h:1262

CCAST
#define CCAST(type, var)
Definition kmp_os.h:291

KMP_MB
#define KMP_MB()
Definition kmp_os.h:1066

kmp_int
kmp_int32 kmp_int
Definition kmp_os.h:214

TCR_4
#define TCR_4(a)
Definition kmp_os.h:1137

KMP_FALLTHROUGH
#define KMP_FALLTHROUGH()
Definition kmp_os.h:364

KMP_ATOMIC_DEC
#define KMP_ATOMIC_DEC(p)
Definition kmp_os.h:1270

KMP_ATOMIC_LD_RLX
#define KMP_ATOMIC_LD_RLX(p)
Definition kmp_os.h:1260

KMP_COMPARE_AND_STORE_ACQ32
#define KMP_COMPARE_AND_STORE_ACQ32(p, cv, sv)
Definition kmp_os.h:814

TCW_4
#define TCW_4(a, b)
Definition kmp_os.h:1138

kmp_uintptr_t
unsigned long kmp_uintptr_t
Definition kmp_os.h:205

KMP_DLSYM
#define KMP_DLSYM(name)
Definition kmp_os.h:1302

KMP_ATOMIC_OR
#define KMP_ATOMIC_OR(p, v)
Definition kmp_os.h:1268

KMP_ATOMIC_INC
#define KMP_ATOMIC_INC(p)
Definition kmp_os.h:1269

KMP_MEMCPY
#define KMP_MEMCPY
Definition kmp_safe_c_api.h:73

KMP_MEMCPY_S
#define KMP_MEMCPY_S(dst, bsz, src, cnt)
Definition kmp_safe_c_api.h:64

sched
sched
Definition kmp_sch_simd_guided.c:26

kmp_stats.h
Functions for collecting statistics.

KMP_PUSH_PARTITIONED_TIMER
#define KMP_PUSH_PARTITIONED_TIMER(name)
Definition kmp_stats.h:1014

KMP_GET_THREAD_STATE
#define KMP_GET_THREAD_STATE()
Definition kmp_stats.h:1017

KMP_POP_PARTITIONED_TIMER
#define KMP_POP_PARTITIONED_TIMER()
Definition kmp_stats.h:1015

KMP_SET_THREAD_STATE_BLOCK
#define KMP_SET_THREAD_STATE_BLOCK(state_name)
Definition kmp_stats.h:1018

KMP_TIME_PARTITIONED_BLOCK
#define KMP_TIME_PARTITIONED_BLOCK(name)
Definition kmp_stats.h:1013

KMP_COUNT_BLOCK
#define KMP_COUNT_BLOCK(n)
Definition kmp_stats.h:1001

i
#define i
Definition kmp_stub.cpp:87

kmp_taskdeps.h

__kmp_dephash_free
static void __kmp_dephash_free(kmp_info_t *thread, kmp_dephash_t *h)
Definition kmp_taskdeps.h:86

__kmp_dephash_free_entries
static void __kmp_dephash_free_entries(kmp_info_t *thread, kmp_dephash_t *h)
Definition kmp_taskdeps.h:59

__kmp_release_deps
static void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task)
Definition kmp_taskdeps.h:97

__kmp_free_task_team
void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team)
Definition kmp_tasking.cpp:3793

__kmp_atomic_execute_tasks_64< true, false >
template int __kmp_atomic_execute_tasks_64< true, false >(kmp_info_t *, kmp_int32, kmp_atomic_flag_64< true, false > *, int, int *USE_ITT_BUILD_ARG(void *), kmp_int32)

__kmp_call_init
void __kmp_call_init(kmp_taskred_data_t &item, size_t j)

__kmp_call_init< kmp_task_red_input_t >
void __kmp_call_init< kmp_task_red_input_t >(kmp_taskred_data_t &item, size_t offset)
Definition kmp_tasking.cpp:2286

PROXY_TASK_FLAG
#define PROXY_TASK_FLAG
Definition kmp_tasking.cpp:4205

p_task_dup_t
void(* p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32)
Definition kmp_tasking.cpp:4512

__kmp_task_reduction_modifier_init
void * __kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws, int num, T *data)
Definition kmp_tasking.cpp:2557

__kmp_task_reduction_clean
static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg)
Definition kmp_tasking.cpp:2550

__kmpc_omp_task_complete_if0_template
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task)
Definition kmp_tasking.cpp:1009

__kmpc_omp_task_alloc
kmp_task_t * __kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds, kmp_routine_entry_t task_entry)
Definition kmp_tasking.cpp:1449

__kmp_reap_task_teams
void __kmp_reap_task_teams(void)
Definition kmp_tasking.cpp:3812

__kmpc_omp_taskyield
kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part)
Definition kmp_tasking.cpp:2133

__kmpc_omp_task
kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task)
Definition kmp_tasking.cpp:1884

__kmp_task_start
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *current_task)
Definition kmp_tasking.cpp:455

__kmp_pop_task_team_node
void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team)
Definition kmp_tasking.cpp:3852

__kmp_execute_tasks_64
int __kmp_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64< C, S > *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained)
Definition kmp_tasking.cpp:3392

__kmp_wait_to_unref_task_teams
void __kmp_wait_to_unref_task_teams(void)
Definition kmp_tasking.cpp:3872

__kmp_allocate_task_team
static kmp_task_team_t * __kmp_allocate_task_team(kmp_info_t *thread, kmp_team_t *team)
Definition kmp_tasking.cpp:3737

__kmp_task_team_sync
void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team)
Definition kmp_tasking.cpp:4020

__kmp_task_reduction_init_copy
void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data, kmp_taskgroup_t *tg, void *reduce_data)
Definition kmp_tasking.cpp:2410

__kmpc_omp_task_complete_if0
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task)
Definition kmp_tasking.cpp:1046

__kmp_realloc_task_deque
static void __kmp_realloc_task_deque(kmp_info_t *thread, kmp_thread_data_t *thread_data)
Definition kmp_tasking.cpp:97

__kmp_bottom_half_finish_proxy
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask)
Definition kmp_tasking.cpp:4254

__kmpc_end_taskgroup
void __kmpc_end_taskgroup(ident_t *loc, int gtid)
Definition kmp_tasking.cpp:2688

__kmpc_omp_task_begin_if0
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task)
Definition kmp_tasking.cpp:611

__kmp_first_top_half_finish_proxy
static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata)
Definition kmp_tasking.cpp:4222

__kmp_execute_tasks_template
static int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained)
Definition kmp_tasking.cpp:3152

__kmp_free_task_and_ancestors
static void __kmp_free_task_and_ancestors(kmp_int32 gtid, kmp_taskdata_t *taskdata, kmp_info_t *thread)
Definition kmp_tasking.cpp:705

__kmp_task_finish
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *resumed_task)
Definition kmp_tasking.cpp:799

__kmp_invoke_task
static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t *current_task)
Definition kmp_tasking.cpp:1520

__kmp_get_priority_deque_data
static kmp_thread_data_t * __kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri)
Definition kmp_tasking.cpp:143

__kmp_execute_tasks_64< false, true >
template int __kmp_execute_tasks_64< false, true >(kmp_info_t *, kmp_int32, kmp_flag_64< false, true > *, int, int *USE_ITT_BUILD_ARG(void *), kmp_int32)

__kmp_finish_implicit_task
void __kmp_finish_implicit_task(kmp_info_t *thread)
Definition kmp_tasking.cpp:1150

__kmp_execute_tasks_32< false, false >
template int __kmp_execute_tasks_32< false, false >(kmp_info_t *, kmp_int32, kmp_flag_32< false, false > *, int, int *USE_ITT_BUILD_ARG(void *), kmp_int32)

__taskloop_params_t
struct __taskloop_params __taskloop_params_t

__kmpc_task_allow_completion_event
kmp_event_t * __kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid, kmp_task_t *task)
Definition kmp_tasking.cpp:4367

__kmp_free_task_threads_data
static void __kmp_free_task_threads_data(kmp_task_team_t *task_team)
Definition kmp_tasking.cpp:3688

__kmp_assign_orig
void __kmp_assign_orig(kmp_taskred_data_t &item, T &src)

__kmpc_omp_taskwait_template
static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid, void *frame_address, void *return_address)
Definition kmp_tasking.cpp:1991

__kmp_task_reduction_fini
static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg)
Definition kmp_tasking.cpp:2507

__kmp_free_task
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata, kmp_info_t *thread)
Definition kmp_tasking.cpp:649

__kmp_execute_tasks_64< true, false >
template int __kmp_execute_tasks_64< true, false >(kmp_info_t *, kmp_int32, kmp_flag_64< true, false > *, int, int *USE_ITT_BUILD_ARG(void *), kmp_int32)

__kmp_atomic_execute_tasks_64
int __kmp_atomic_execute_tasks_64(kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64< C, S > *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained)
Definition kmp_tasking.cpp:3402

__kmp_task_team_wait
void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait)
Definition kmp_tasking.cpp:4046

__kmpc_omp_task_parts
kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task)
Definition kmp_tasking.cpp:1737

__kmp_alloc_task_pri_list
static kmp_task_pri_t * __kmp_alloc_task_pri_list()
Definition kmp_tasking.cpp:123

__kmp_taskloop
static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int sched, kmp_uint64 grainsize, int modifier, void *task_dup)
Definition kmp_tasking.cpp:4970

__kmp_omp_taskloop_task
kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, void *codeptr_ra)
Definition kmp_tasking.cpp:1951

__kmp_init_implicit_task
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr, kmp_team_t *team, int tid, int set_curr_task)
Definition kmp_tasking.cpp:1085

__kmp_push_task
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task)
Definition kmp_tasking.cpp:262

__kmp_steal_task
static kmp_task_t * __kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid, kmp_task_team_t *task_team, std::atomic< kmp_int32 > *unfinished_threads, int *thread_finished, kmp_int32 is_constrained)
Definition kmp_tasking.cpp:3011

__kmp_task_reduction_init
void * __kmp_task_reduction_init(int gtid, int num, T *data)
Definition kmp_tasking.cpp:2298

__kmp_enable_tasking
static void __kmp_enable_tasking(kmp_task_team_t *task_team, kmp_info_t *this_thr)
Definition kmp_tasking.cpp:3448

__kmp_give_task
static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task, kmp_int32 pass)
Definition kmp_tasking.cpp:4125

__kmp_execute_tasks_32
int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32< C, S > *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained)
Definition kmp_tasking.cpp:3382

__kmp_get_priority_task
static kmp_task_t * __kmp_get_priority_task(kmp_int32 gtid, kmp_task_team_t *task_team, kmp_int32 is_constrained)
Definition kmp_tasking.cpp:2838

__kmp_tasking_barrier
void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid)
Definition kmp_tasking.cpp:4090

__kmp_free_implicit_task
void __kmp_free_implicit_task(kmp_info_t *thread)
Definition kmp_tasking.cpp:1186

__kmp_execute_tasks_oncore
int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj), kmp_int32 is_constrained)
Definition kmp_tasking.cpp:3411

__kmp_round_up_to_val
static size_t __kmp_round_up_to_val(size_t size, size_t val)
Definition kmp_tasking.cpp:1196

__kmpc_omp_target_task_alloc
kmp_task_t * __kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds, kmp_routine_entry_t task_entry, kmp_int64 device_id)
Definition kmp_tasking.cpp:1473

__kmp_fulfill_event
void __kmp_fulfill_event(kmp_event_t *event)
Definition kmp_tasking.cpp:4378

__kmp_assign_orig< kmp_taskred_input_t >
void __kmp_assign_orig< kmp_taskred_input_t >(kmp_taskred_data_t &item, kmp_taskred_input_t &src)
Definition kmp_tasking.cpp:2275

__kmpc_omp_task_begin_if0_template
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task, void *frame_address, void *return_address)
Definition kmp_tasking.cpp:537

__kmp_assign_orig< kmp_task_red_input_t >
void __kmp_assign_orig< kmp_task_red_input_t >(kmp_taskred_data_t &item, kmp_task_red_input_t &src)
Definition kmp_tasking.cpp:2270

__kmp_realloc_task_threads_data
static int __kmp_realloc_task_threads_data(kmp_info_t *thread, kmp_task_team_t *task_team)
Definition kmp_tasking.cpp:3598

__kmp_atomic_execute_tasks_64< false, true >
template int __kmp_atomic_execute_tasks_64< false, true >(kmp_info_t *, kmp_int32, kmp_atomic_flag_64< false, true > *, int, int *USE_ITT_BUILD_ARG(void *), kmp_int32)

__kmp_free_task_teams
static kmp_task_team_t * __kmp_free_task_teams
Definition kmp_tasking.cpp:3543

__kmp_task_dup_alloc
kmp_task_t * __kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src)
Definition kmp_tasking.cpp:4433

__kmp_task_alloc
kmp_task_t * __kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flags_t *flags, size_t sizeof_kmp_task_t, size_t sizeof_shareds, kmp_routine_entry_t task_entry)
Definition kmp_tasking.cpp:1218

__kmpc_omp_taskwait
kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid)
Definition kmp_tasking.cpp:2121

__kmp_taskloop_task
int __kmp_taskloop_task(int gtid, void *ptask)
Definition kmp_tasking.cpp:4774

__kmp_push_current_task_to_thread
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team, int tid)
Definition kmp_tasking.cpp:419

__kmp_push_task_team_node
void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team)
Definition kmp_tasking.cpp:3839

__kmp_second_top_half_finish_proxy
static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata)
Definition kmp_tasking.cpp:4241

__kmp_free_task_pri_list
static void __kmp_free_task_pri_list(kmp_task_team_t *task_team)
Definition kmp_tasking.cpp:3704

__kmp_task_is_allowed
static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained, const kmp_taskdata_t *tasknew, const kmp_taskdata_t *taskcurr)
Definition kmp_tasking.cpp:48

__kmp_remove_my_task
static kmp_task_t * __kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid, kmp_task_team_t *task_team, kmp_int32 is_constrained)
Definition kmp_tasking.cpp:2939

__kmpc_taskgroup
void __kmpc_taskgroup(ident_t *loc, int gtid)
Definition kmp_tasking.cpp:2654

__kmp_omp_task
kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task, bool serialize_immediate)
Definition kmp_tasking.cpp:1794

__kmp_task_team_init
static void __kmp_task_team_init(kmp_task_team_t *task_team, kmp_team_t *team)
Definition kmp_tasking.cpp:3719

__kmp_pop_current_task_from_thread
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr)
Definition kmp_tasking.cpp:397

__kmp_task_team_setup
void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team)
Definition kmp_tasking.cpp:3935

__kmp_alloc_task_deque
static void __kmp_alloc_task_deque(kmp_info_t *thread, kmp_thread_data_t *thread_data)
Definition kmp_tasking.cpp:3554

__kmp_free_task_deque
static void __kmp_free_task_deque(kmp_thread_data_t *thread_data)
Definition kmp_tasking.cpp:3581

__kmp_push_priority_task
static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread, kmp_taskdata_t *taskdata, kmp_task_team_t *task_team, kmp_int32 pri)
Definition kmp_tasking.cpp:187

__kmpc_give_task
void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start=0)
Definition kmp_tasking.cpp:4298

__kmp_call_init< kmp_taskred_input_t >
void __kmp_call_init< kmp_taskred_input_t >(kmp_taskred_data_t &item, size_t offset)
Definition kmp_tasking.cpp:2291

__kmp_taskloop_linear
void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, kmp_uint64 ub_glob, kmp_uint64 num_tasks, kmp_uint64 grainsize, kmp_uint64 extras, kmp_int64 last_chunk, kmp_uint64 tc, void *task_dup)
Definition kmp_tasking.cpp:4637

__kmp_taskloop_recur
void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *, kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64, kmp_uint64, void *)
Definition kmp_tasking.cpp:4840

__kmp_track_children_task
static bool __kmp_track_children_task(kmp_taskdata_t *taskdata)
Definition kmp_tasking.cpp:775

counter
int counter
Definition kmp_taskloop.c:13

kmp_wait_release.h

__kmp_null_resume_wrapper
static void __kmp_null_resume_wrapper(kmp_info_t *thr)
Definition kmp_wait_release.h:1027

kmp_int32
int32_t kmp_int32
Definition omp__kmpc_fork_call_if.c:6

arr
int arr[N][N][N]
Definition omp_for_collapse_non_rectangular.c:10

level
int level
Definition omp_foreign_thread_team_reuse.c:18

C
#define C
Definition omp_nonmonotonic_nowait.c:10

val
int val
Definition omp_record_replay_deps.cpp:9

__kmpc_start_record_task
int __kmpc_start_record_task(ident_t *, int, int, int)

__kmpc_end_record_task
void __kmpc_end_record_task(ident_t *, int, int, int)

j
int j
Definition omp_single_copyprivate.c:6

res
#define res
Definition omp_task_red_taskloop.c:54

ompt_enabled
ompt_callbacks_active_t ompt_enabled
Definition ompt-general.cpp:88

ret
return ret
Definition ompt-general.cpp:394

ompt_callbacks
ompt_callbacks_internal_t ompt_callbacks
Definition ompt-general.cpp:102

OMPT_NOINLINE
#define OMPT_NOINLINE
Definition ompt-internal.h:127

OMPT_GET_RETURN_ADDRESS
#define OMPT_GET_RETURN_ADDRESS(level)
Definition ompt-internal.h:112

TASK_TYPE_DETAILS_FORMAT
#define TASK_TYPE_DETAILS_FORMAT(info)
Definition ompt-internal.h:47

OMPT_FRAME_FLAGS_APP
#define OMPT_FRAME_FLAGS_APP
Definition ompt-internal.h:115

OMPT_GET_FRAME_ADDRESS
#define OMPT_GET_FRAME_ADDRESS(level)
Definition ompt-internal.h:114

OMPT_FRAME_FLAGS_RUNTIME
#define OMPT_FRAME_FLAGS_RUNTIME
Definition ompt-internal.h:116

__ompt_get_teaminfo
ompt_team_info_t * __ompt_get_teaminfo(int depth, int *size)
Definition ompt-specific.cpp:51

__ompt_get_task_info_internal
int __ompt_get_task_info_internal(int ancestor_level, int *type, ompt_data_t **task_data, ompt_frame_t **task_frame, ompt_data_t **parallel_data, int *thread_num)
Definition ompt-specific.cpp:359

__ompt_get_task_info_object
ompt_task_info_t * __ompt_get_task_info_object(int depth)
Definition ompt-specific.cpp:103

ompt-specific.h

loc
static id loc
Definition kmp_task_depend_all.c:170

flag
volatile int flag
Definition root-threads-affinity.c:24

__attribute__
__attribute__((noinline))
Definition simd_conservative_ordered.c:22

__taskloop_params
Definition kmp_tasking.cpp:4746

__taskloop_params::num_tasks
kmp_uint64 num_tasks
Definition kmp_tasking.cpp:4753

__taskloop_params::grainsize
kmp_uint64 grainsize
Definition kmp_tasking.cpp:4754

__taskloop_params::tc
kmp_uint64 tc
Definition kmp_tasking.cpp:4757

__taskloop_params::st
kmp_int64 st
Definition kmp_tasking.cpp:4751

__taskloop_params::last_chunk
kmp_int64 last_chunk
Definition kmp_tasking.cpp:4756

__taskloop_params::ub
kmp_uint64 * ub
Definition kmp_tasking.cpp:4749

__taskloop_params::num_t_min
kmp_uint64 num_t_min
Definition kmp_tasking.cpp:4758

__taskloop_params::task_dup
void * task_dup
Definition kmp_tasking.cpp:4750

__taskloop_params::task
kmp_task_t * task
Definition kmp_tasking.cpp:4747

__taskloop_params::extras
kmp_uint64 extras
Definition kmp_tasking.cpp:4755

__taskloop_params::ub_glob
kmp_uint64 ub_glob
Definition kmp_tasking.cpp:4752

__taskloop_params::lb
kmp_uint64 * lb
Definition kmp_tasking.cpp:4748

ident_t
Definition teams-no-par.c:23

kmp_base_depnode::mtx_locks
kmp_lock_t * mtx_locks[MAX_MTX_DEPS]
Definition kmp.h:2575

kmp_base_depnode::mtx_num_locks
kmp_int32 mtx_num_locks
Definition kmp.h:2576

kmp_base_task_team::tt_found_proxy_tasks
kmp_int32 tt_found_proxy_tasks
Definition kmp.h:2871

kmp_base_task_team::tt_unfinished_threads
KMP_ALIGN_CACHE std::atomic< kmp_int32 > tt_unfinished_threads
Definition kmp.h:2879

kmp_base_task_team::tt_max_threads
kmp_int32 tt_max_threads
Definition kmp.h:2870

kmp_base_task_team::tt_nproc
kmp_int32 tt_nproc
Definition kmp.h:2869

kmp_base_task_team::tt_task_pri_lock
kmp_bootstrap_lock_t tt_task_pri_lock
Definition kmp.h:2859

kmp_base_task_team::tt_num_task_pri
std::atomic< kmp_int32 > tt_num_task_pri
Definition kmp.h:2873

kmp_base_task_team::tt_threads_lock
kmp_bootstrap_lock_t tt_threads_lock
Definition kmp.h:2855

kmp_base_task_team::tt_untied_task_encountered
kmp_int32 tt_untied_task_encountered
Definition kmp.h:2872

kmp_base_task_team::tt_task_pri_list
kmp_task_pri_t * tt_task_pri_list
Definition kmp.h:2860

kmp_base_task_team::tt_hidden_helper_task_encountered
kmp_int32 tt_hidden_helper_task_encountered
Definition kmp.h:2876

kmp_base_task_team::tt_threads_data
kmp_thread_data_t * tt_threads_data
Definition kmp.h:2864

kmp_base_task_team::tt_active
KMP_ALIGN_CACHE volatile kmp_uint32 tt_active
Definition kmp.h:2883

kmp_base_task_team::tt_next
kmp_task_team_t * tt_next
Definition kmp.h:2862

kmp_base_task_team::tt_found_tasks
kmp_int32 tt_found_tasks
Definition kmp.h:2866

kmp_event_t
Definition kmp.h:2625

kmp_event_t::lock
kmp_tas_lock_t lock
Definition kmp.h:2627

kmp_event_t::ed
union kmp_event_t::@225247164214002037241165027111053205022334261172 ed

kmp_event_t::task
kmp_task_t * task
Definition kmp.h:2629

kmp_event_t::type
kmp_event_type_t type
Definition kmp.h:2626

kmp_target_data::async_handle
void * async_handle
Definition kmp.h:2762

kmp_task_pri::priority
kmp_int32 priority
Definition kmp.h:2848

kmp_task_pri::next
kmp_task_pri * next
Definition kmp.h:2849

kmp_task_pri::td
kmp_thread_data_t td
Definition kmp.h:2847

kmp_task_red_input
Internal struct for reduction data item related info set up by compiler.
Definition kmp_tasking.cpp:2223

kmp_task_red_input::reduce_shar
void * reduce_shar
item shared between tasks, to reduce into
Definition kmp_tasking.cpp:2224

kmp_task_red_input::reduce_fini
void * reduce_fini
data finalization routine
Definition kmp_tasking.cpp:2228

kmp_task_red_input::flags
kmp_taskred_flags_t flags
flags for additional info from compiler
Definition kmp_tasking.cpp:2230

kmp_task_red_input::reduce_size
size_t reduce_size
size of data item in bytes
Definition kmp_tasking.cpp:2225

kmp_task_red_input::reduce_init
void * reduce_init
data initialization routine (single parameter)
Definition kmp_tasking.cpp:2227

kmp_task_red_input::reduce_comb
void * reduce_comb
data combiner routine
Definition kmp_tasking.cpp:2229

kmp_task_team_list_t
Definition kmp.h:2892

kmp_task_team_list_t::next
kmp_task_team_list_t * next
Definition kmp.h:2894

kmp_task_team_list_t::task_team
kmp_task_team_t * task_team
Definition kmp.h:2893

kmp_task::shareds
void * shareds
pointer to block of pointers to shared vars
Definition kmp.h:2497

kmp_taskdata::td_taskwait_counter
kmp_uint32 td_taskwait_counter
Definition kmp.h:2777

kmp_taskdata::td_taskwait_ident
ident_t * td_taskwait_ident
Definition kmp.h:2776

kmp_taskdata::td_level
kmp_int32 td_level
Definition kmp.h:2772

kmp_taskdata::td_team
kmp_team_t * td_team
Definition kmp.h:2768

kmp_taskdata::td_task_team
kmp_task_team_t * td_task_team
Definition kmp.h:2792

kmp_taskdata::td_dephash
kmp_dephash_t * td_dephash
Definition kmp.h:2789

kmp_taskdata::td_parent
kmp_taskdata_t * td_parent
Definition kmp.h:2771

kmp_taskdata::td_incomplete_child_tasks
std::atomic< kmp_int32 > td_incomplete_child_tasks
Definition kmp.h:2785

kmp_taskdata::td_untied_count
std::atomic< kmp_int32 > td_untied_count
Definition kmp.h:2773

kmp_taskdata::td_taskgroup
kmp_taskgroup_t * td_taskgroup
Definition kmp.h:2787

kmp_taskdata::td_task_id
kmp_int32 td_task_id
Definition kmp.h:2766

kmp_taskdata::td_alloc_thread
kmp_info_p * td_alloc_thread
Definition kmp.h:2769

kmp_taskdata::td_ident
ident_t * td_ident
Definition kmp.h:2774

kmp_taskdata::td_depnode
kmp_depnode_t * td_depnode
Definition kmp.h:2791

kmp_taskdata::td_taskwait_thread
kmp_int32 td_taskwait_thread
Definition kmp.h:2778

kmp_taskdata::td_flags
kmp_tasking_flags_t td_flags
Definition kmp.h:2767

kmp_taskdata::td_last_tied
kmp_taskdata_t * td_last_tied
Definition kmp.h:2798

kmp_taskdata::td_icvs
KMP_ALIGN_CACHE kmp_internal_control_t td_icvs
Definition kmp.h:2780

kmp_taskdata::td_allow_completion_event
kmp_event_t td_allow_completion_event
Definition kmp.h:2803

kmp_taskdata::td_size_alloc
size_t td_size_alloc
Definition kmp.h:2793

kmp_taskdata::td_target_data
kmp_target_data_t td_target_data
Definition kmp.h:2812

kmp_taskdata::td_allocated_child_tasks
KMP_ALIGN_CACHE std::atomic< kmp_int32 > td_allocated_child_tasks
Definition kmp.h:2782

kmp_taskgroup::cancel_request
std::atomic< kmp_int32 > cancel_request
Definition kmp.h:2515

kmp_taskgroup::gomp_data
uintptr_t * gomp_data
Definition kmp.h:2520

kmp_taskgroup::count
std::atomic< kmp_int32 > count
Definition kmp.h:2513

kmp_taskgroup::reduce_data
void * reduce_data
Definition kmp.h:2518

kmp_taskgroup::parent
struct kmp_taskgroup * parent
Definition kmp.h:2516

kmp_taskgroup::reduce_num_data
kmp_int32 reduce_num_data
Definition kmp.h:2519

kmp_tasking_flags::target
unsigned target
Definition kmp.h:2750

kmp_tasking_flags::priority_specified
unsigned priority_specified
Definition kmp.h:2729

kmp_tasking_flags::detachable
unsigned detachable
Definition kmp.h:2731

kmp_tasking_flags::task_serial
unsigned task_serial
Definition kmp.h:2738

kmp_tasking_flags::merged_if0
unsigned merged_if0
Definition kmp.h:2723

kmp_tasking_flags::complete
unsigned complete
Definition kmp.h:2747

kmp_tasking_flags::freed
unsigned freed
Definition kmp.h:2748

kmp_tasking_flags::executing
unsigned executing
Definition kmp.h:2746

kmp_tasking_flags::tasking_ser
unsigned tasking_ser
Definition kmp.h:2739

kmp_tasking_flags::team_serial
unsigned team_serial
Definition kmp.h:2741

kmp_tasking_flags::native
unsigned native
Definition kmp.h:2749

kmp_tasking_flags::tiedness
unsigned tiedness
Definition kmp.h:2721

kmp_tasking_flags::started
unsigned started
Definition kmp.h:2745

kmp_tasking_flags::destructors_thunk
unsigned destructors_thunk
Definition kmp.h:2725

kmp_tasking_flags::proxy
unsigned proxy
Definition kmp.h:2727

kmp_tasking_flags::tasktype
unsigned tasktype
Definition kmp.h:2737

kmp_tasking_flags::final
unsigned final
Definition kmp.h:2722

kmp_tasking_flags::hidden_helper
unsigned hidden_helper
Definition kmp.h:2751

kmp_taskred_data
Internal struct for reduction data item related info saved by the library.
Definition kmp_tasking.cpp:2236

kmp_taskred_data::reduce_init
void * reduce_init
data initialization routine (two parameters)
Definition kmp_tasking.cpp:2244

kmp_taskred_data::reduce_priv
void * reduce_priv
array of thread specific items
Definition kmp_tasking.cpp:2240

kmp_taskred_data::reduce_pend
void * reduce_pend
end of private data for faster comparison op
Definition kmp_tasking.cpp:2241

kmp_taskred_data::reduce_comb
void * reduce_comb
data combiner routine
Definition kmp_tasking.cpp:2243

kmp_taskred_data::flags
kmp_taskred_flags_t flags
flags for additional info from compiler
Definition kmp_tasking.cpp:2239

kmp_taskred_data::reduce_fini
void * reduce_fini
data finalization routine
Definition kmp_tasking.cpp:2245

kmp_taskred_data::reduce_size
size_t reduce_size
size of data item
Definition kmp_tasking.cpp:2238

kmp_taskred_data::reduce_shar
void * reduce_shar
item shared between tasks, to reduce into
Definition kmp_tasking.cpp:2237

kmp_taskred_data::reduce_orig
void * reduce_orig
original item (can be used in UDR initializer)
Definition kmp_tasking.cpp:2246

kmp_taskred_flags
Flags for special info per task reduction item.
Definition kmp_tasking.cpp:2214

kmp_taskred_flags::reserved31
unsigned reserved31
Definition kmp_tasking.cpp:2217

kmp_taskred_flags::lazy_priv
unsigned lazy_priv
1 - use lazy alloc/init (e.g. big objects, #tasks < #threads)
Definition kmp_tasking.cpp:2216

kmp_taskred_input
Internal struct for reduction data item related info set up by compiler.
Definition kmp_tasking.cpp:2254

kmp_taskred_input::reduce_shar
void * reduce_shar
item shared between tasks, to reduce into
Definition kmp_tasking.cpp:2255

kmp_taskred_input::reduce_fini
void * reduce_fini
data finalization routine
Definition kmp_tasking.cpp:2260

kmp_taskred_input::reduce_init
void * reduce_init
data initialization routine (two parameters)
Definition kmp_tasking.cpp:2259

kmp_taskred_input::reduce_comb
void * reduce_comb
data combiner routine
Definition kmp_tasking.cpp:2261

kmp_taskred_input::reduce_size
size_t reduce_size
size of data item
Definition kmp_tasking.cpp:2257

kmp_taskred_input::reduce_orig
void * reduce_orig
original reduction item used for initialization
Definition kmp_tasking.cpp:2256

kmp_taskred_input::flags
kmp_taskred_flags_t flags
flags for additional info from compiler
Definition kmp_tasking.cpp:2262

ompt_task_info_t
Definition ompt-internal.h:59

ompt_task_info_t::task_data
ompt_data_t task_data
Definition ompt-internal.h:61

ompt_task_info_t::frame
ompt_frame_t frame
Definition ompt-internal.h:60

ompt_team_info_t
Definition ompt-internal.h:67

ompt_team_info_t::parallel_data
ompt_data_t parallel_data
Definition ompt-internal.h:68

ompt_thread_info_t
Definition ompt-internal.h:79

ompt_thread_info_t::wait_id
ompt_wait_id_t wait_id
Definition ompt-internal.h:86

task
Definition kmp_task_depend_all.c:99

task::routine
int(* routine)(int, struct task *)
Definition bug_taskwait_detach.cpp:46

task::part_id
int part_id
Definition kmp_task_depend_all.c:102

task::th
int th
Definition kmp_taskloop.c:38

task::shareds
void ** shareds
Definition kmp_task_depend_all.c:100

task::priority
int priority
Definition kmp_task_depend_all.c:104

kmp_depnode::dn
kmp_base_depnode_t dn
Definition kmp.h:2588

kmp_task_team::tt
kmp_base_task_team_t tt
Definition kmp.h:2887

kmp_team::t
kmp_base_team_t t
Definition kmp.h:3237

__kmp_is_thread_alive
int __kmp_is_thread_alive(kmp_info_t *th, DWORD *exit_val)
Definition z_Windows_NT_util.cpp:1291