doxygen/kmp__dispatch_8cpp_source.html

/*

 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.

 */


//===----------------------------------------------------------------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//


/* Dynamic scheduling initialization and dispatch.

 *

 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however

 *       it may change values between parallel regions.  __kmp_max_nth

 *       is the largest value __kmp_nth may take, 1 is the smallest.

 */


#include "kmp.h"

#include "kmp_error.h"

#include "kmp_i18n.h"

#include "kmp_itt.h"

#include "kmp_stats.h"

#include "kmp_str.h"

#if KMP_USE_X87CONTROL

#include <float.h>

#endif

#include "kmp_lock.h"

#include "kmp_dispatch.h"

#if KMP_USE_HIER_SCHED

#include "kmp_dispatch_hier.h"

#endif


#if OMPT_SUPPORT

#include "ompt-specific.h"

#endif


/* ------------------------------------------------------------------------ */

/* ------------------------------------------------------------------------ */


void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {

  kmp_info_t *th;


  KMP_DEBUG_ASSERT(gtid_ref);


  if (__kmp_env_consistency_check) {

    th = __kmp_threads[*gtid_ref];

    if (th->th.th_root->r.r_active &&

        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {

#if KMP_USE_DYNAMIC_LOCK

      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);

#else

      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);

#endif

    }

  }

}


void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {

  kmp_info_t *th;


  if (__kmp_env_consistency_check) {

    th = __kmp_threads[*gtid_ref];

    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {

      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);

    }

  }

}


// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC

static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,

                                         bool use_hier = false) {

  // Pick up the nonmonotonic/monotonic bits from the scheduling type

  // Nonmonotonic as default for dynamic schedule when no modifier is specified

  int monotonicity = SCHEDULE_NONMONOTONIC;


  // Let default be monotonic for executables

  // compiled with OpenMP* 4.5 or less compilers

  if (loc != NULL && loc->get_openmp_version() < 50)

    monotonicity = SCHEDULE_MONOTONIC;


  if (use_hier || __kmp_force_monotonic)

    monotonicity = SCHEDULE_MONOTONIC;

  else if (SCHEDULE_HAS_NONMONOTONIC(schedule))

    monotonicity = SCHEDULE_NONMONOTONIC;

  else if (SCHEDULE_HAS_MONOTONIC(schedule))

    monotonicity = SCHEDULE_MONOTONIC;


  return monotonicity;

}


#if KMP_WEIGHTED_ITERATIONS_SUPPORTED

// Return floating point number rounded to two decimal points

static inline float __kmp_round_2decimal_val(float num) {

  return (float)(static_cast<int>(num * 100 + 0.5)) / 100;

}

static inline int __kmp_get_round_val(float num) {

  return static_cast<int>(num < 0 ? num - 0.5 : num + 0.5);

}

#endif


template <typename T>

inline void

__kmp_initialize_self_buffer(kmp_team_t *team, T id,

                             dispatch_private_info_template<T> *pr,

                             typename traits_t<T>::unsigned_t nchunks, T nproc,

                             typename traits_t<T>::unsigned_t &init,

                             T &small_chunk, T &extras, T &p_extra) {


#if KMP_WEIGHTED_ITERATIONS_SUPPORTED

  if (pr->flags.use_hybrid) {

    kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];

    kmp_hw_core_type_t type =

        (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;

    T pchunks = pr->u.p.pchunks;

    T echunks = nchunks - pchunks;

    T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;

    T num_procs_with_ecore = nproc - num_procs_with_pcore;

    T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;

    T big_chunk =

        pchunks / num_procs_with_pcore; // chunks per thread with p-core

    small_chunk =

        echunks / num_procs_with_ecore; // chunks per thread with e-core


    extras =

        (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);


    p_extra = (big_chunk - small_chunk);


    if (type == KMP_HW_CORE_TYPE_CORE) {

      if (id < first_thread_with_ecore) {

        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);

      } else {

        init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +

               (id < extras ? id : extras);

      }

    } else {

      if (id == first_thread_with_ecore) {

        init = id * small_chunk + id * p_extra + (id < extras ? id : extras);

      } else {

        init = id * small_chunk + first_thread_with_ecore * p_extra +

               (id < extras ? id : extras);

      }

    }

    p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;

    return;

  }

#endif


  small_chunk = nchunks / nproc; // chunks per thread

  extras = nchunks % nproc;

  p_extra = 0;

  init = id * small_chunk + (id < extras ? id : extras);

}


#if KMP_STATIC_STEAL_ENABLED

enum { // values for steal_flag (possible states of private per-loop buffer)

  UNUSED = 0,

  CLAIMED = 1, // owner thread started initialization

  READY = 2, // available for stealing

  THIEF = 3 // finished by owner, or claimed by thief

  // possible state changes:

  // 0 -> 1 owner only, sync

  // 0 -> 3 thief only, sync

  // 1 -> 2 owner only, async

  // 2 -> 3 owner only, async

  // 3 -> 2 owner only, async

  // 3 -> 0 last thread finishing the loop, async

};

#endif


// Initialize a dispatch_private_info_template<T> buffer for a particular

// type of schedule,chunk.  The loop description is found in lb (lower bound),

// ub (upper bound), and st (stride).  nproc is the number of threads relevant

// to the scheduling (often the number of threads in a team, but not always if

// hierarchical scheduling is used).  tid is the id of the thread calling

// the function within the group of nproc threads.  It will have a value

// between 0 and nproc - 1.  This is often just the thread id within a team, but

// is not necessarily the case when using hierarchical scheduling.

// loc is the source file location of the corresponding loop

// gtid is the global thread id

template <typename T>

void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,

                                   dispatch_private_info_template<T> *pr,

                                   enum sched_type schedule, T lb, T ub,

                                   typename traits_t<T>::signed_t st,

#if USE_ITT_BUILD

                                   kmp_uint64 *cur_chunk,

#endif

                                   typename traits_t<T>::signed_t chunk,

                                   T nproc, T tid) {

  typedef typename traits_t<T>::unsigned_t UT;

  typedef typename traits_t<T>::floating_t DBL;


  int active;

  T tc;

  kmp_info_t *th;

  kmp_team_t *team;

  int monotonicity;

  bool use_hier;


#ifdef KMP_DEBUG

  typedef typename traits_t<T>::signed_t ST;

  {

    char *buff;

    // create format specifiers before the debug output

    buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "

                            "pr:%%p lb:%%%s ub:%%%s st:%%%s "

                            "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",

                            traits_t<T>::spec, traits_t<T>::spec,

                            traits_t<ST>::spec, traits_t<ST>::spec,

                            traits_t<T>::spec, traits_t<T>::spec);

    KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));

    __kmp_str_free(&buff);

  }

#endif

  /* setup data */

  th = __kmp_threads[gtid];

  team = th->th.th_team;

  active = !team->t.t_serialized;


#if USE_ITT_BUILD

  int itt_need_metadata_reporting =

      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&

      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&

      team->t.t_active_level == 1;

#endif


#if KMP_USE_HIER_SCHED

  use_hier = pr->flags.use_hier;

#else

  use_hier = false;

#endif


  /* Pick up the nonmonotonic/monotonic bits from the scheduling type */

  monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);

  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);


  /* Pick up the nomerge/ordered bits from the scheduling type */

  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {

    pr->flags.nomerge = TRUE;

    schedule =

        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));

  } else {

    pr->flags.nomerge = FALSE;

  }

  pr->type_size = traits_t<T>::type_size; // remember the size of variables

  if (kmp_ord_lower & schedule) {

    pr->flags.ordered = TRUE;

    schedule =

        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));

  } else {

    pr->flags.ordered = FALSE;

  }

  // Ordered overrides nonmonotonic

  if (pr->flags.ordered) {

    monotonicity = SCHEDULE_MONOTONIC;

  }


  if (schedule == kmp_sch_static) {

    schedule = __kmp_static;

  } else {

    if (schedule == kmp_sch_runtime) {

      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if

      // not specified)

      schedule = team->t.t_sched.r_sched_type;

      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);

      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

      if (pr->flags.ordered) // correct monotonicity for ordered loop if needed

        monotonicity = SCHEDULE_MONOTONIC;

      // Detail the schedule if needed (global controls are differentiated

      // appropriately)

      if (schedule == kmp_sch_guided_chunked) {

        schedule = __kmp_guided;

      } else if (schedule == kmp_sch_static) {

        schedule = __kmp_static;

      }

      // Use the chunk size specified by OMP_SCHEDULE (or default if not

      // specified)

      chunk = team->t.t_sched.chunk;

#if USE_ITT_BUILD

      if (cur_chunk)

        *cur_chunk = chunk;

#endif

#ifdef KMP_DEBUG

      {

        char *buff;

        // create format specifiers before the debug output

        buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "

                                "schedule:%%d chunk:%%%s\n",

                                traits_t<ST>::spec);

        KD_TRACE(10, (buff, gtid, schedule, chunk));

        __kmp_str_free(&buff);

      }

#endif

    } else {

      if (schedule == kmp_sch_guided_chunked) {

        schedule = __kmp_guided;

      }

      if (chunk <= 0) {

        chunk = KMP_DEFAULT_CHUNK;

      }

    }


    if (schedule == kmp_sch_auto) {

      // mapping and differentiation: in the __kmp_do_serial_initialize()

      schedule = __kmp_auto;

#ifdef KMP_DEBUG

      {

        char *buff;

        // create format specifiers before the debug output

        buff = __kmp_str_format(

            "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "

            "schedule:%%d chunk:%%%s\n",

            traits_t<ST>::spec);

        KD_TRACE(10, (buff, gtid, schedule, chunk));

        __kmp_str_free(&buff);

      }

#endif

    }

#if KMP_STATIC_STEAL_ENABLED

    // map nonmonotonic:dynamic to static steal

    if (schedule == kmp_sch_dynamic_chunked) {

      if (monotonicity == SCHEDULE_NONMONOTONIC)

        schedule = kmp_sch_static_steal;

    }

#endif

    /* guided analytical not safe for too many threads */

    if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {

      schedule = kmp_sch_guided_iterative_chunked;

      KMP_WARNING(DispatchManyThreads);

    }

    if (schedule == kmp_sch_runtime_simd) {

      // compiler provides simd_width in the chunk parameter

      schedule = team->t.t_sched.r_sched_type;

      monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);

      schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

      // Detail the schedule if needed (global controls are differentiated

      // appropriately)

      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||

          schedule == __kmp_static) {

        schedule = kmp_sch_static_balanced_chunked;

      } else {

        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {

          schedule = kmp_sch_guided_simd;

        }

        chunk = team->t.t_sched.chunk * chunk;

      }

#if USE_ITT_BUILD

      if (cur_chunk)

        *cur_chunk = chunk;

#endif

#ifdef KMP_DEBUG

      {

        char *buff;

        // create format specifiers before the debug output

        buff = __kmp_str_format(

            "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"

            " chunk:%%%s\n",

            traits_t<ST>::spec);

        KD_TRACE(10, (buff, gtid, schedule, chunk));

        __kmp_str_free(&buff);

      }

#endif

    }

    pr->u.p.parm1 = chunk;

  }

  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),

              "unknown scheduling type");


  pr->u.p.count = 0;


  if (__kmp_env_consistency_check) {

    if (st == 0) {

      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,

                            (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);

    }

  }

  // compute trip count

  if (st == 1) { // most common case

    if (ub >= lb) {

      tc = ub - lb + 1;

    } else { // ub < lb

      tc = 0; // zero-trip

    }

  } else if (st < 0) {

    if (lb >= ub) {

      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),

      // where the division needs to be unsigned regardless of the result type

      tc = (UT)(lb - ub) / (-st) + 1;

    } else { // lb < ub

      tc = 0; // zero-trip

    }

  } else { // st > 0

    if (ub >= lb) {

      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),

      // where the division needs to be unsigned regardless of the result type

      tc = (UT)(ub - lb) / st + 1;

    } else { // ub < lb

      tc = 0; // zero-trip

    }

  }


#if KMP_STATS_ENABLED

  if (KMP_MASTER_GTID(gtid)) {

    KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);

  }

#endif


  pr->u.p.lb = lb;

  pr->u.p.ub = ub;

  pr->u.p.st = st;

  pr->u.p.tc = tc;


#if KMP_OS_WINDOWS

  pr->u.p.last_upper = ub + st;

#endif /* KMP_OS_WINDOWS */


  /* NOTE: only the active parallel region(s) has active ordered sections */


  if (active) {

    if (pr->flags.ordered) {

      pr->ordered_bumped = 0;

      pr->u.p.ordered_lower = 1;

      pr->u.p.ordered_upper = 0;

    }

  }


  switch (schedule) {

#if KMP_STATIC_STEAL_ENABLED

  case kmp_sch_static_steal: {

    T ntc, init = 0;


    KD_TRACE(100,

             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",

              gtid));


    ntc = (tc % chunk ? 1 : 0) + tc / chunk;

    if (nproc > 1 && ntc >= nproc) {

      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);

      T id = tid;

      T small_chunk, extras, p_extra = 0;

      kmp_uint32 old = UNUSED;

      int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);

      if (traits_t<T>::type_size > 4) {

        // AC: TODO: check if 16-byte CAS available and use it to

        // improve performance (probably wait for explicit request

        // before spending time on this).

        // For now use dynamically allocated per-private-buffer lock,

        // free memory in __kmp_dispatch_next when status==0.

        pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));

        __kmp_init_lock(pr->u.p.steal_lock);

      }


#if KMP_WEIGHTED_ITERATIONS_SUPPORTED

      // Iterations are divided in a 60/40 skewed distribution among CORE and

      // ATOM processors for hybrid systems

      bool use_hybrid = false;

      kmp_hw_core_type_t core_type = KMP_HW_CORE_TYPE_UNKNOWN;

      T first_thread_with_ecore = 0;

      T num_procs_with_pcore = 0;

      T num_procs_with_ecore = 0;

      T p_ntc = 0, e_ntc = 0;

      if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&

          __kmp_affinity.type != affinity_explicit) {

        use_hybrid = true;

        core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;

        if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&

            __kmp_first_osid_with_ecore > -1) {

          for (int i = 0; i < team->t.t_nproc; ++i) {

            kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]

                                          ->th.th_topology_attrs.core_type;

            int id = team->t.t_threads[i]->th.th_topology_ids.os_id;

            if (id == __kmp_first_osid_with_ecore) {

              first_thread_with_ecore =

                  team->t.t_threads[i]->th.th_info.ds.ds_tid;

            }

            if (type == KMP_HW_CORE_TYPE_CORE) {

              num_procs_with_pcore++;

            } else if (type == KMP_HW_CORE_TYPE_ATOM) {

              num_procs_with_ecore++;

            } else {

              use_hybrid = false;

              break;

            }

          }

        }

        if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {

          float multiplier = 60.0 / 40.0;

          float p_ratio = (float)num_procs_with_pcore / nproc;

          float e_ratio = (float)num_procs_with_ecore / nproc;

          float e_multiplier =

              (float)1 /

              (((multiplier * num_procs_with_pcore) / nproc) + e_ratio);

          float p_multiplier = multiplier * e_multiplier;

          p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);

          if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))

            e_ntc =

                (int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));

          else

            e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);

          KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);


          // Use regular static steal if not enough chunks for skewed

          // distribution

          use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&

                                       e_ntc >= num_procs_with_ecore)

                            ? true

                            : false);

        } else {

          use_hybrid = false;

        }

      }

      pr->flags.use_hybrid = use_hybrid;

      pr->u.p.pchunks = p_ntc;

      pr->u.p.num_procs_with_pcore = num_procs_with_pcore;

      pr->u.p.first_thread_with_ecore = first_thread_with_ecore;


      if (use_hybrid) {

        KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);

        T big_chunk = p_ntc / num_procs_with_pcore;

        small_chunk = e_ntc / num_procs_with_ecore;


        extras =

            (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);


        p_extra = (big_chunk - small_chunk);


        if (core_type == KMP_HW_CORE_TYPE_CORE) {

          if (id < first_thread_with_ecore) {

            init =

                id * small_chunk + id * p_extra + (id < extras ? id : extras);

          } else {

            init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +

                   (id < extras ? id : extras);

          }

        } else {

          if (id == first_thread_with_ecore) {

            init =

                id * small_chunk + id * p_extra + (id < extras ? id : extras);

          } else {

            init = id * small_chunk + first_thread_with_ecore * p_extra +

                   (id < extras ? id : extras);

          }

        }

        p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;

      } else

#endif

      {

        small_chunk = ntc / nproc;

        extras = ntc % nproc;

        init = id * small_chunk + (id < extras ? id : extras);

        p_extra = 0;

      }

      pr->u.p.count = init;

      if (claimed) { // are we succeeded in claiming own buffer?

        pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);

        // Other threads will inspect steal_flag when searching for a victim.

        // READY means other threads may steal from this thread from now on.

        KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);

      } else {

        // other thread has stolen whole our range

        KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);

        pr->u.p.ub = init; // mark there is no iterations to work on

      }

      pr->u.p.parm2 = ntc; // save number of chunks

      // parm3 is the number of times to attempt stealing which is

      // nproc (just a heuristics, could be optimized later on).

      pr->u.p.parm3 = nproc;

      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid

      break;

    } else {

      /* too few chunks: switching to kmp_sch_dynamic_chunked */

      schedule = kmp_sch_dynamic_chunked;

      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "

                     "kmp_sch_dynamic_chunked\n",

                     gtid));

      goto dynamic_init;

      break;

    } // if

  } // case

#endif

  case kmp_sch_static_balanced: {

    T init, limit;


    KD_TRACE(

        100,

        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",

         gtid));


    if (nproc > 1) {

      T id = tid;


      if (tc < nproc) {

        if (id < tc) {

          init = id;

          limit = id;

          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */

        } else {

          pr->u.p.count = 1; /* means no more chunks to execute */

          pr->u.p.parm1 = FALSE;

          break;

        }

      } else {

        T small_chunk = tc / nproc;

        T extras = tc % nproc;

        init = id * small_chunk + (id < extras ? id : extras);

        limit = init + small_chunk - (id < extras ? 0 : 1);

        pr->u.p.parm1 = (id == nproc - 1);

      }

    } else {

      if (tc > 0) {

        init = 0;

        limit = tc - 1;

        pr->u.p.parm1 = TRUE;

      } else {

        // zero trip count

        pr->u.p.count = 1; /* means no more chunks to execute */

        pr->u.p.parm1 = FALSE;

        break;

      }

    }

#if USE_ITT_BUILD

    // Calculate chunk for metadata report

    if (itt_need_metadata_reporting)

      if (cur_chunk)

        *cur_chunk = limit - init + 1;

#endif

    if (st == 1) {

      pr->u.p.lb = lb + init;

      pr->u.p.ub = lb + limit;

    } else {

      // calculated upper bound, "ub" is user-defined upper bound

      T ub_tmp = lb + limit * st;

      pr->u.p.lb = lb + init * st;

      // adjust upper bound to "ub" if needed, so that MS lastprivate will match

      // it exactly

      if (st > 0) {

        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);

      } else {

        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);

      }

    }

    if (pr->flags.ordered) {

      pr->u.p.ordered_lower = init;

      pr->u.p.ordered_upper = limit;

    }

    break;

  } // case

  case kmp_sch_static_balanced_chunked: {

    // similar to balanced, but chunk adjusted to multiple of simd width

    T nth = nproc;

    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"

                   " -> falling-through to static_greedy\n",

                   gtid));

    schedule = kmp_sch_static_greedy;

    if (nth > 1)

      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);

    else

      pr->u.p.parm1 = tc;

    break;

  } // case

  case kmp_sch_guided_simd:

  case kmp_sch_guided_iterative_chunked: {

    KD_TRACE(

        100,

        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"

         " case\n",

         gtid));


    if (nproc > 1) {

      if ((2L * chunk + 1) * nproc >= tc) {

        /* chunk size too large, switch to dynamic */

        schedule = kmp_sch_dynamic_chunked;

        goto dynamic_init;

      } else {

        // when remaining iters become less than parm2 - switch to dynamic

        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);

        *(double *)&pr->u.p.parm3 =

            guided_flt_param / (double)nproc; // may occupy parm3 and parm4

      }

    } else {

      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "

                     "kmp_sch_static_greedy\n",

                     gtid));

      schedule = kmp_sch_static_greedy;

      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */

      KD_TRACE(

          100,

          ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",

           gtid));

      pr->u.p.parm1 = tc;

    } // if

  } // case

  break;

  case kmp_sch_guided_analytical_chunked: {

    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "

                   "kmp_sch_guided_analytical_chunked case\n",

                   gtid));


    if (nproc > 1) {

      if ((2L * chunk + 1) * nproc >= tc) {

        /* chunk size too large, switch to dynamic */

        schedule = kmp_sch_dynamic_chunked;

        goto dynamic_init;

      } else {

        /* commonly used term: (2 nproc - 1)/(2 nproc) */

        DBL x;


#if KMP_USE_X87CONTROL

        /* Linux* OS already has 64-bit computation by default for long double,

           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On

           Windows* OS on IA-32 architecture, we need to set precision to 64-bit

           instead of the default 53-bit. Even though long double doesn't work

           on Windows* OS on Intel(R) 64, the resulting lack of precision is not

           expected to impact the correctness of the algorithm, but this has not

           been mathematically proven. */

        // save original FPCW and set precision to 64-bit, as

        // Windows* OS on IA-32 architecture defaults to 53-bit

        unsigned int oldFpcw = _control87(0, 0);

        _control87(_PC_64, _MCW_PC); // 0,0x30000

#endif

        /* value used for comparison in solver for cross-over point */

        KMP_ASSERT(tc > 0);

        long double target = ((long double)chunk * 2 + 1) * nproc / tc;


        /* crossover point--chunk indexes equal to or greater than

           this point switch to dynamic-style scheduling */

        UT cross;


        /* commonly used term: (2 nproc - 1)/(2 nproc) */

        x = 1.0 - 0.5 / (double)nproc;


#ifdef KMP_DEBUG

        { // test natural alignment

          struct _test_a {

            char a;

            union {

              char b;

              DBL d;

            };

          } t;

          ptrdiff_t natural_alignment =

              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;

          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long

          // long)natural_alignment );

          KMP_DEBUG_ASSERT(

              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);

        }

#endif // KMP_DEBUG


        /* save the term in thread private dispatch structure */

        *(DBL *)&pr->u.p.parm3 = x;


        /* solve for the crossover point to the nearest integer i for which C_i

           <= chunk */

        {

          UT left, right, mid;

          long double p;


          /* estimate initial upper and lower bound */


          /* doesn't matter what value right is as long as it is positive, but

             it affects performance of the solver */

          right = 229;

          p = __kmp_pow<UT>(x, right);

          if (p > target) {

            do {

              p *= p;

              right <<= 1;

            } while (p > target && right < (1 << 27));

            /* lower bound is previous (failed) estimate of upper bound */

            left = right >> 1;

          } else {

            left = 0;

          }


          /* bisection root-finding method */

          while (left + 1 < right) {

            mid = (left + right) / 2;

            if (__kmp_pow<UT>(x, mid) > target) {

              left = mid;

            } else {

              right = mid;

            }

          } // while

          cross = right;

        }

        /* assert sanity of computed crossover point */

        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&

                   __kmp_pow<UT>(x, cross) <= target);


        /* save the crossover point in thread private dispatch structure */

        pr->u.p.parm2 = cross;


// C75803

#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))

#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)

#else

#define GUIDED_ANALYTICAL_WORKAROUND (x)

#endif

        /* dynamic-style scheduling offset */

        pr->u.p.count = tc -

                        __kmp_dispatch_guided_remaining(

                            tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -

                        cross * chunk;

#if KMP_USE_X87CONTROL

        // restore FPCW

        _control87(oldFpcw, _MCW_PC);

#endif

      } // if

    } else {

      KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "

                     "kmp_sch_static_greedy\n",

                     gtid));

      schedule = kmp_sch_static_greedy;

      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */

      pr->u.p.parm1 = tc;

    } // if

  } // case

  break;

  case kmp_sch_static_greedy:

    KD_TRACE(

        100,

        ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",

         gtid));

    pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;

    break;

  case kmp_sch_static_chunked:

  case kmp_sch_dynamic_chunked:

  dynamic_init:

    if (tc == 0)

      break;

    if (pr->u.p.parm1 <= 0)

      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;

    else if (pr->u.p.parm1 > tc)

      pr->u.p.parm1 = tc;

    // Store the total number of chunks to prevent integer overflow during

    // bounds calculations in the get next chunk routine.

    pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);

    KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "

                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",

                   gtid));

    break;

  case kmp_sch_trapezoidal: {

    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */


    T parm1, parm2, parm3, parm4;

    KD_TRACE(100,

             ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",

              gtid));


    parm1 = chunk;


    /* F : size of the first cycle */

    parm2 = (tc / (2 * nproc));


    if (parm2 < 1) {

      parm2 = 1;

    }


    /* L : size of the last cycle.  Make sure the last cycle is not larger

       than the first cycle. */

    if (parm1 < 1) {

      parm1 = 1;

    } else if (parm1 > parm2) {

      parm1 = parm2;

    }


    /* N : number of cycles */

    parm3 = (parm2 + parm1);

    parm3 = (2 * tc + parm3 - 1) / parm3;


    if (parm3 < 2) {

      parm3 = 2;

    }


    /* sigma : decreasing incr of the trapezoid */

    parm4 = (parm3 - 1);

    parm4 = (parm2 - parm1) / parm4;


    // pointless check, because parm4 >= 0 always

    // if ( parm4 < 0 ) {

    //    parm4 = 0;

    //}


    pr->u.p.parm1 = parm1;

    pr->u.p.parm2 = parm2;

    pr->u.p.parm3 = parm3;

    pr->u.p.parm4 = parm4;

  } // case

  break;


  default: {

    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message

                KMP_HNT(GetNewerLibrary), // Hint

                __kmp_msg_null // Variadic argument list terminator

    );

  } break;

  } // switch

  pr->schedule = schedule;

}


#if KMP_USE_HIER_SCHED

template <typename T>

inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,

                                             typename traits_t<T>::signed_t st);

template <>

inline void

__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,

                                            kmp_int32 ub, kmp_int32 st) {

  __kmp_dispatch_init_hierarchy<kmp_int32>(

      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,

      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);

}

template <>

inline void

__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,

                                             kmp_uint32 ub, kmp_int32 st) {

  __kmp_dispatch_init_hierarchy<kmp_uint32>(

      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,

      __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);

}

template <>

inline void

__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,

                                            kmp_int64 ub, kmp_int64 st) {

  __kmp_dispatch_init_hierarchy<kmp_int64>(

      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,

      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);

}

template <>

inline void

__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,

                                             kmp_uint64 ub, kmp_int64 st) {

  __kmp_dispatch_init_hierarchy<kmp_uint64>(

      loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,

      __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);

}


// free all the hierarchy scheduling memory associated with the team

void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {

  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;

  for (int i = 0; i < num_disp_buff; ++i) {

    // type does not matter here so use kmp_int32

    auto sh =

        reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(

            &team->t.t_disp_buffer[i]);

    if (sh->hier) {

      sh->hier->deallocate();

      __kmp_free(sh->hier);

    }

  }

}

#endif


// UT - unsigned flavor of T, ST - signed flavor of T,

// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8

template <typename T>

static void

__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,

                    T ub, typename traits_t<T>::signed_t st,

                    typename traits_t<T>::signed_t chunk, int push_ws) {

  typedef typename traits_t<T>::unsigned_t UT;


  int active;

  kmp_info_t *th;

  kmp_team_t *team;

  kmp_uint32 my_buffer_index;

  dispatch_private_info_template<T> *pr;

  dispatch_shared_info_template<T> volatile *sh;


  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==

                   sizeof(dispatch_private_info));

  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==

                   sizeof(dispatch_shared_info));

  __kmp_assert_valid_gtid(gtid);


  if (!TCR_4(__kmp_init_parallel))

    __kmp_parallel_initialize();


  __kmp_resume_if_soft_paused();


#if INCLUDE_SSC_MARKS

  SSC_MARK_DISPATCH_INIT();

#endif

#ifdef KMP_DEBUG

  typedef typename traits_t<T>::signed_t ST;

  {

    char *buff;

    // create format specifiers before the debug output

    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "

                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",

                            traits_t<ST>::spec, traits_t<T>::spec,

                            traits_t<T>::spec, traits_t<ST>::spec);

    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));

    __kmp_str_free(&buff);

  }

#endif

  /* setup data */

  th = __kmp_threads[gtid];

  team = th->th.th_team;

  active = !team->t.t_serialized;

  th->th.th_ident = loc;


  // Any half-decent optimizer will remove this test when the blocks are empty

  // since the macros expand to nothing

  // when statistics are disabled.

  if (schedule == __kmp_static) {

    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);

  } else {

    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);

  }


#if KMP_USE_HIER_SCHED

  // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable

  // Hierarchical scheduling does not work with ordered, so if ordered is

  // detected, then revert back to threaded scheduling.

  bool ordered;

  enum sched_type my_sched = schedule;

  my_buffer_index = th->th.th_dispatch->th_disp_index;

  pr = reinterpret_cast<dispatch_private_info_template<T> *>(

      &th->th.th_dispatch

           ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);

  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);

  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))

    my_sched =

        (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));

  ordered = (kmp_ord_lower & my_sched);

  if (pr->flags.use_hier) {

    if (ordered) {

      KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "

                     "Disabling hierarchical scheduling.\n",

                     gtid));

      pr->flags.use_hier = FALSE;

    }

  }

  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {

    // Don't use hierarchical for ordered parallel loops and don't

    // use the runtime hierarchy if one was specified in the program

    if (!ordered && !pr->flags.use_hier)

      __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);

  }

#endif // KMP_USE_HIER_SCHED


#if USE_ITT_BUILD

  kmp_uint64 cur_chunk = chunk;

  int itt_need_metadata_reporting =

      __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&

      KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&

      team->t.t_active_level == 1;

#endif

  if (!active) {

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(

        th->th.th_dispatch->th_disp_buffer); /* top of the stack */

  } else {

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==

                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);


    my_buffer_index = th->th.th_dispatch->th_disp_index++;


    /* What happens when number of threads changes, need to resize buffer? */

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(

        &th->th.th_dispatch

             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);

    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(

        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);

    KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,

                  my_buffer_index));

    if (sh->buffer_index != my_buffer_index) { // too many loops in progress?

      KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"

                     " sh->buffer_index:%d\n",

                     gtid, my_buffer_index, sh->buffer_index));

      __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,

                             __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));

      // Note: KMP_WAIT() cannot be used there: buffer index and

      // my_buffer_index are *always* 32-bit integers.

      KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "

                     "sh->buffer_index:%d\n",

                     gtid, my_buffer_index, sh->buffer_index));

    }

  }


  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,

#if USE_ITT_BUILD

                                &cur_chunk,

#endif

                                chunk, (T)th->th.th_team_nproc,

                                (T)th->th.th_info.ds.ds_tid);

  if (active) {

    if (pr->flags.ordered == 0) {

      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;

      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;

    } else {

      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;

      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;

    }

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;

    th->th.th_dispatch->th_dispatch_sh_current =

        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);

#if USE_ITT_BUILD

    if (pr->flags.ordered) {

      __kmp_itt_ordered_init(gtid);

    }

    // Report loop metadata

    if (itt_need_metadata_reporting) {

      // Only report metadata by primary thread of active team at level 1

      kmp_uint64 schedtype = 0;

      switch (schedule) {

      case kmp_sch_static_chunked:

      case kmp_sch_static_balanced: // Chunk is calculated in the switch above

        break;

      case kmp_sch_static_greedy:

        cur_chunk = pr->u.p.parm1;

        break;

      case kmp_sch_dynamic_chunked:

        schedtype = 1;

        break;

      case kmp_sch_guided_iterative_chunked:

      case kmp_sch_guided_analytical_chunked:

      case kmp_sch_guided_simd:

        schedtype = 2;

        break;

      default:

        // Should we put this case under "static"?

        // case kmp_sch_static_steal:

        schedtype = 3;

        break;

      }

      __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);

    }

#if KMP_USE_HIER_SCHED

    if (pr->flags.use_hier) {

      pr->u.p.count = 0;

      pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;

    }

#endif // KMP_USER_HIER_SCHED

#endif /* USE_ITT_BUILD */

  }


#ifdef KMP_DEBUG

  {

    char *buff;

    // create format specifiers before the debug output

    buff = __kmp_str_format(

        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "

        "lb:%%%s ub:%%%s"

        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"

        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",

        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,

        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,

        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,

        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);

    KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,

                  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,

                  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,

                  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));

    __kmp_str_free(&buff);

  }

#endif

#if OMPT_SUPPORT && OMPT_OPTIONAL

  if (ompt_enabled.ompt_callback_work) {

    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);

    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);

    ompt_callbacks.ompt_callback(ompt_callback_work)(

        ompt_get_work_schedule(pr->schedule), ompt_scope_begin,

        &(team_info->parallel_data), &(task_info->task_data), pr->u.p.tc,

        OMPT_LOAD_RETURN_ADDRESS(gtid));

  }

#endif

  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);

}


/* For ordered loops, either __kmp_dispatch_finish() should be called after

 * every iteration, or __kmp_dispatch_finish_chunk() should be called after

 * every chunk of iterations.  If the ordered section(s) were not executed

 * for this iteration (or every iteration in this chunk), we need to set the

 * ordered iteration counters so that the next thread can proceed. */

template <typename UT>

static void __kmp_dispatch_finish(int gtid, ident_t *loc) {

  typedef typename traits_t<UT>::signed_t ST;

  __kmp_assert_valid_gtid(gtid);

  kmp_info_t *th = __kmp_threads[gtid];


  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));

  if (!th->th.th_team->t.t_serialized) {


    dispatch_private_info_template<UT> *pr =

        reinterpret_cast<dispatch_private_info_template<UT> *>(

            th->th.th_dispatch->th_dispatch_pr_current);

    dispatch_shared_info_template<UT> volatile *sh =

        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(

            th->th.th_dispatch->th_dispatch_sh_current);

    KMP_DEBUG_ASSERT(pr);

    KMP_DEBUG_ASSERT(sh);

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==

                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);


    if (pr->ordered_bumped) {

      KD_TRACE(

          1000,

          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",

           gtid));

      pr->ordered_bumped = 0;

    } else {

      UT lower = pr->u.p.ordered_lower;


#ifdef KMP_DEBUG

      {

        char *buff;

        // create format specifiers before the debug output

        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "

                                "ordered_iteration:%%%s lower:%%%s\n",

                                traits_t<UT>::spec, traits_t<UT>::spec);

        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));

        __kmp_str_free(&buff);

      }

#endif


      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,

                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */

#ifdef KMP_DEBUG

      {

        char *buff;

        // create format specifiers before the debug output

        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "

                                "ordered_iteration:%%%s lower:%%%s\n",

                                traits_t<UT>::spec, traits_t<UT>::spec);

        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));

        __kmp_str_free(&buff);

      }

#endif


      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    } // if

  } // if

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));

}


#ifdef KMP_GOMP_COMPAT


template <typename UT>

static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {

  typedef typename traits_t<UT>::signed_t ST;

  __kmp_assert_valid_gtid(gtid);

  kmp_info_t *th = __kmp_threads[gtid];


  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));

  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =

        reinterpret_cast<dispatch_private_info_template<UT> *>(

            th->th.th_dispatch->th_dispatch_pr_current);

    dispatch_shared_info_template<UT> volatile *sh =

        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(

            th->th.th_dispatch->th_dispatch_sh_current);

    KMP_DEBUG_ASSERT(pr);

    KMP_DEBUG_ASSERT(sh);

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==

                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);


    UT lower = pr->u.p.ordered_lower;

    UT upper = pr->u.p.ordered_upper;

    UT inc = upper - lower + 1;


    if (pr->ordered_bumped == inc) {

      KD_TRACE(

          1000,

          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",

           gtid));

      pr->ordered_bumped = 0;

    } else {

      inc -= pr->ordered_bumped;


#ifdef KMP_DEBUG

      {

        char *buff;

        // create format specifiers before the debug output

        buff = __kmp_str_format(

            "__kmp_dispatch_finish_chunk: T#%%d before wait: "

            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",

            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);

        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));

        __kmp_str_free(&buff);

      }

#endif


      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,

                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));


      KMP_MB(); /* is this necessary? */

      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "

                      "ordered_bumped to zero\n",

                      gtid));

      pr->ordered_bumped = 0;

//!!!!! TODO check if the inc should be unsigned, or signed???

#ifdef KMP_DEBUG

      {

        char *buff;

        // create format specifiers before the debug output

        buff = __kmp_str_format(

            "__kmp_dispatch_finish_chunk: T#%%d after wait: "

            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",

            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,

            traits_t<UT>::spec);

        KD_TRACE(1000,

                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));

        __kmp_str_free(&buff);

      }

#endif


      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);

    }

    //        }

  }

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));

}


#endif /* KMP_GOMP_COMPAT */


template <typename T>

int __kmp_dispatch_next_algorithm(int gtid,

                                  dispatch_private_info_template<T> *pr,

                                  dispatch_shared_info_template<T> volatile *sh,

                                  kmp_int32 *p_last, T *p_lb, T *p_ub,

                                  typename traits_t<T>::signed_t *p_st, T nproc,

                                  T tid) {

  typedef typename traits_t<T>::unsigned_t UT;

  typedef typename traits_t<T>::signed_t ST;

  typedef typename traits_t<T>::floating_t DBL;

  int status = 0;

  bool last = false;

  T start;

  ST incr;

  UT limit, trip, init;

  kmp_info_t *th = __kmp_threads[gtid];

  kmp_team_t *team = th->th.th_team;


  KMP_DEBUG_ASSERT(th->th.th_dispatch ==

                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

  KMP_DEBUG_ASSERT(pr);

  KMP_DEBUG_ASSERT(sh);

  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);

#ifdef KMP_DEBUG

  {

    char *buff;

    // create format specifiers before the debug output

    buff =

        __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "

                         "sh:%%p nproc:%%%s tid:%%%s\n",

                         traits_t<T>::spec, traits_t<T>::spec);

    KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));

    __kmp_str_free(&buff);

  }

#endif


  // zero trip count

  if (pr->u.p.tc == 0) {

    KD_TRACE(10,

             ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "

              "zero status:%d\n",

              gtid, status));

    return 0;

  }


  switch (pr->schedule) {

#if KMP_STATIC_STEAL_ENABLED

  case kmp_sch_static_steal: {

    T chunk = pr->u.p.parm1;

    UT nchunks = pr->u.p.parm2;

    KD_TRACE(100,

             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",

              gtid));


    trip = pr->u.p.tc - 1;


    if (traits_t<T>::type_size > 4) {

      // use lock for 8-byte induction variable.

      // TODO (optional): check presence and use 16-byte CAS

      kmp_lock_t *lck = pr->u.p.steal_lock;

      KMP_DEBUG_ASSERT(lck != NULL);

      if (pr->u.p.count < (UT)pr->u.p.ub) {

        KMP_DEBUG_ASSERT(pr->steal_flag == READY);

        __kmp_acquire_lock(lck, gtid);

        // try to get own chunk of iterations

        init = (pr->u.p.count)++;

        status = (init < (UT)pr->u.p.ub);

        __kmp_release_lock(lck, gtid);

      } else {

        status = 0; // no own chunks

      }

      if (!status) { // try to steal

        kmp_lock_t *lckv; // victim buffer's lock

        T while_limit = pr->u.p.parm3;

        T while_index = 0;

        int idx = (th->th.th_dispatch->th_disp_index - 1) %

                  __kmp_dispatch_num_buffers; // current loop index

        // note: victim thread can potentially execute another loop

        KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive

        while ((!status) && (while_limit != ++while_index)) {

          dispatch_private_info_template<T> *v;

          T remaining;

          T victimId = pr->u.p.parm4;

          T oldVictimId = victimId ? victimId - 1 : nproc - 1;

          v = reinterpret_cast<dispatch_private_info_template<T> *>(

              &team->t.t_dispatch[victimId].th_disp_buffer[idx]);

          KMP_DEBUG_ASSERT(v);

          while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&

                 oldVictimId != victimId) {

            victimId = (victimId + 1) % nproc;

            v = reinterpret_cast<dispatch_private_info_template<T> *>(

                &team->t.t_dispatch[victimId].th_disp_buffer[idx]);

            KMP_DEBUG_ASSERT(v);

          }

          if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {

            continue; // try once more (nproc attempts in total)

          }

          if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {

            kmp_uint32 old = UNUSED;

            // try to steal whole range from inactive victim

            status = v->steal_flag.compare_exchange_strong(old, THIEF);

            if (status) {

              // initialize self buffer with victim's whole range of chunks

              T id = victimId;

              T small_chunk = 0, extras = 0, p_extra = 0;

              __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,

                                              init, small_chunk, extras,

                                              p_extra);

              __kmp_acquire_lock(lck, gtid);

              pr->u.p.count = init + 1; // exclude one we execute immediately

              pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);

              __kmp_release_lock(lck, gtid);

              pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid

              // no need to reinitialize other thread invariants: lb, st, etc.

#ifdef KMP_DEBUG

              {

                char *buff;

                // create format specifiers before the debug output

                buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "

                                        "stolen chunks from T#%%d, "

                                        "count:%%%s ub:%%%s\n",

                                        traits_t<UT>::spec, traits_t<T>::spec);

                KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));

                __kmp_str_free(&buff);

              }

#endif

              // activate non-empty buffer and let others steal from us

              if (pr->u.p.count < (UT)pr->u.p.ub)

                KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);

              break;

            }

          }

          if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||

              v->u.p.count >= (UT)v->u.p.ub) {

            pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid

            continue; // no chunks to steal, try next victim

          }

          lckv = v->u.p.steal_lock;

          KMP_ASSERT(lckv != NULL);

          __kmp_acquire_lock(lckv, gtid);

          limit = v->u.p.ub; // keep initial ub

          if (v->u.p.count >= limit) {

            __kmp_release_lock(lckv, gtid);

            pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid

            continue; // no chunks to steal, try next victim

          }


          // stealing succeded, reduce victim's ub by 1/4 of undone chunks

          // TODO: is this heuristics good enough??

          remaining = limit - v->u.p.count;

          if (remaining > 7) {

            // steal 1/4 of remaining

            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);

            init = (v->u.p.ub -= (remaining >> 2));

          } else {

            // steal 1 chunk of 1..7 remaining

            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);

            init = (v->u.p.ub -= 1);

          }

          __kmp_release_lock(lckv, gtid);

#ifdef KMP_DEBUG

          {

            char *buff;

            // create format specifiers before the debug output

            buff = __kmp_str_format(

                "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "

                "count:%%%s ub:%%%s\n",

                traits_t<UT>::spec, traits_t<UT>::spec);

            KD_TRACE(10, (buff, gtid, victimId, init, limit));

            __kmp_str_free(&buff);

          }

#endif

          KMP_DEBUG_ASSERT(init + 1 <= limit);

          pr->u.p.parm4 = victimId; // remember victim to steal from

          status = 1;

          // now update own count and ub with stolen range excluding init chunk

          __kmp_acquire_lock(lck, gtid);

          pr->u.p.count = init + 1;

          pr->u.p.ub = limit;

          __kmp_release_lock(lck, gtid);

          // activate non-empty buffer and let others steal from us

          if (init + 1 < limit)

            KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);

        } // while (search for victim)

      } // if (try to find victim and steal)

    } else {

      // 4-byte induction variable, use 8-byte CAS for pair (count, ub)

      // as all operations on pair (count, ub) must be done atomically

      typedef union {

        struct {

          UT count;

          T ub;

        } p;

        kmp_int64 b;

      } union_i4;

      union_i4 vold, vnew;

      if (pr->u.p.count < (UT)pr->u.p.ub) {

        KMP_DEBUG_ASSERT(pr->steal_flag == READY);

        vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);

        vnew.b = vold.b;

        vnew.p.count++; // get chunk from head of self range

        while (!KMP_COMPARE_AND_STORE_REL64(

            (volatile kmp_int64 *)&pr->u.p.count,

            *VOLATILE_CAST(kmp_int64 *) & vold.b,

            *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {

          KMP_CPU_PAUSE();

          vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);

          vnew.b = vold.b;

          vnew.p.count++;

        }

        init = vold.p.count;

        status = (init < (UT)vold.p.ub);

      } else {

        status = 0; // no own chunks

      }

      if (!status) { // try to steal

        T while_limit = pr->u.p.parm3;

        T while_index = 0;

        int idx = (th->th.th_dispatch->th_disp_index - 1) %

                  __kmp_dispatch_num_buffers; // current loop index

        // note: victim thread can potentially execute another loop

        KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive

        while ((!status) && (while_limit != ++while_index)) {

          dispatch_private_info_template<T> *v;

          T remaining;

          T victimId = pr->u.p.parm4;

          T oldVictimId = victimId ? victimId - 1 : nproc - 1;

          v = reinterpret_cast<dispatch_private_info_template<T> *>(

              &team->t.t_dispatch[victimId].th_disp_buffer[idx]);

          KMP_DEBUG_ASSERT(v);

          while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&

                 oldVictimId != victimId) {

            victimId = (victimId + 1) % nproc;

            v = reinterpret_cast<dispatch_private_info_template<T> *>(

                &team->t.t_dispatch[victimId].th_disp_buffer[idx]);

            KMP_DEBUG_ASSERT(v);

          }

          if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {

            continue; // try once more (nproc attempts in total)

          }

          if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {

            kmp_uint32 old = UNUSED;

            // try to steal whole range from inactive victim

            status = v->steal_flag.compare_exchange_strong(old, THIEF);

            if (status) {

              // initialize self buffer with victim's whole range of chunks

              T id = victimId;

              T small_chunk = 0, extras = 0, p_extra = 0;

              __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,

                                              init, small_chunk, extras,

                                              p_extra);

              vnew.p.count = init + 1;

              vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);

              // write pair (count, ub) at once atomically

#if KMP_ARCH_X86

              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);

#else

              *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;

#endif

              pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid

              // no need to initialize other thread invariants: lb, st, etc.

#ifdef KMP_DEBUG

              {

                char *buff;

                // create format specifiers before the debug output

                buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "

                                        "stolen chunks from T#%%d, "

                                        "count:%%%s ub:%%%s\n",

                                        traits_t<UT>::spec, traits_t<T>::spec);

                KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));

                __kmp_str_free(&buff);

              }

#endif

              // activate non-empty buffer and let others steal from us

              if (pr->u.p.count < (UT)pr->u.p.ub)

                KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);

              break;

            }

          }

          while (1) { // CAS loop with check if victim still has enough chunks

            // many threads may be stealing concurrently from same victim

            vold.b = *(volatile kmp_int64 *)(&v->u.p.count);

            if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||

                vold.p.count >= (UT)vold.p.ub) {

              pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id

              break; // no chunks to steal, try next victim

            }

            vnew.b = vold.b;

            remaining = vold.p.ub - vold.p.count;

            // try to steal 1/4 of remaining

            // TODO: is this heuristics good enough??

            if (remaining > 7) {

              vnew.p.ub -= remaining >> 2; // steal from tail of victim's range

            } else {

              vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining

            }

            KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);

            if (KMP_COMPARE_AND_STORE_REL64(

                    (volatile kmp_int64 *)&v->u.p.count,

                    *VOLATILE_CAST(kmp_int64 *) & vold.b,

                    *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {

              // stealing succedded

#ifdef KMP_DEBUG

              {

                char *buff;

                // create format specifiers before the debug output

                buff = __kmp_str_format(

                    "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "

                    "count:%%%s ub:%%%s\n",

                    traits_t<T>::spec, traits_t<T>::spec);

                KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));

                __kmp_str_free(&buff);

              }

#endif

              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,

                                        vold.p.ub - vnew.p.ub);

              status = 1;

              pr->u.p.parm4 = victimId; // keep victim id

              // now update own count and ub

              init = vnew.p.ub;

              vold.p.count = init + 1;

#if KMP_ARCH_X86

              KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);

#else

              *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;

#endif

              // activate non-empty buffer and let others steal from us

              if (vold.p.count < (UT)vold.p.ub)

                KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);

              break;

            } // if (check CAS result)

            KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt

          } // while (try to steal from particular victim)

        } // while (search for victim)

      } // if (try to find victim and steal)

    } // if (4-byte induction variable)

    if (!status) {

      *p_lb = 0;

      *p_ub = 0;

      if (p_st != NULL)

        *p_st = 0;

    } else {

      start = pr->u.p.lb;

      init *= chunk;

      limit = chunk + init - 1;

      incr = pr->u.p.st;

      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);


      KMP_DEBUG_ASSERT(init <= trip);

      // keep track of done chunks for possible early exit from stealing

      // TODO: count executed chunks locally with rare update of shared location

      // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);

      if ((last = (limit >= trip)) != 0)

        limit = trip;

      if (p_st != NULL)

        *p_st = incr;


      if (incr == 1) {

        *p_lb = start + init;

        *p_ub = start + limit;

      } else {

        *p_lb = start + init * incr;

        *p_ub = start + limit * incr;

      }

    } // if

    break;

  } // case

#endif // KMP_STATIC_STEAL_ENABLED

  case kmp_sch_static_balanced: {

    KD_TRACE(

        10,

        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",

         gtid));

    /* check if thread has any iteration to do */

    if ((status = !pr->u.p.count) != 0) {

      pr->u.p.count = 1;

      *p_lb = pr->u.p.lb;

      *p_ub = pr->u.p.ub;

      last = (pr->u.p.parm1 != 0);

      if (p_st != NULL)

        *p_st = pr->u.p.st;

    } else { /* no iterations to do */

      pr->u.p.lb = pr->u.p.ub + pr->u.p.st;

    }

  } // case

  break;

  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was

                                 merged here */

  case kmp_sch_static_chunked: {

    T parm1;


    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "

                   "kmp_sch_static_[affinity|chunked] case\n",

                   gtid));

    parm1 = pr->u.p.parm1;


    trip = pr->u.p.tc - 1;

    init = parm1 * (pr->u.p.count + tid);


    if ((status = (init <= trip)) != 0) {

      start = pr->u.p.lb;

      incr = pr->u.p.st;

      limit = parm1 + init - 1;


      if ((last = (limit >= trip)) != 0)

        limit = trip;


      if (p_st != NULL)

        *p_st = incr;


      pr->u.p.count += nproc;


      if (incr == 1) {

        *p_lb = start + init;

        *p_ub = start + limit;

      } else {

        *p_lb = start + init * incr;

        *p_ub = start + limit * incr;

      }


      if (pr->flags.ordered) {

        pr->u.p.ordered_lower = init;

        pr->u.p.ordered_upper = limit;

      } // if

    } // if

  } // case

  break;


  case kmp_sch_dynamic_chunked: {

    UT chunk_number;

    UT chunk_size = pr->u.p.parm1;

    UT nchunks = pr->u.p.parm2;


    KD_TRACE(

        100,

        ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",

         gtid));


    chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);

    status = (chunk_number < nchunks);

    if (!status) {

      *p_lb = 0;

      *p_ub = 0;

      if (p_st != NULL)

        *p_st = 0;

    } else {

      init = chunk_size * chunk_number;

      trip = pr->u.p.tc - 1;

      start = pr->u.p.lb;

      incr = pr->u.p.st;


      if ((last = (trip - init < (UT)chunk_size)))

        limit = trip;

      else

        limit = chunk_size + init - 1;


      if (p_st != NULL)

        *p_st = incr;


      if (incr == 1) {

        *p_lb = start + init;

        *p_ub = start + limit;

      } else {

        *p_lb = start + init * incr;

        *p_ub = start + limit * incr;

      }


      if (pr->flags.ordered) {

        pr->u.p.ordered_lower = init;

        pr->u.p.ordered_upper = limit;

      } // if

    } // if

  } // case

  break;


  case kmp_sch_guided_iterative_chunked: {

    T chunkspec = pr->u.p.parm1;

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "

                   "iterative case\n",

                   gtid));

    trip = pr->u.p.tc;

    // Start atomic part of calculations

    while (1) {

      ST remaining; // signed, because can be < 0

      init = sh->u.s.iteration; // shared value

      remaining = trip - init;

      if (remaining <= 0) { // AC: need to compare with 0 first

        // nothing to do, don't try atomic op

        status = 0;

        break;

      }

      if ((T)remaining <

          pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default

        // use dynamic-style schedule

        // atomically increment iterations, get old value

        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),

                                 (ST)chunkspec);

        remaining = trip - init;

        if (remaining <= 0) {

          status = 0; // all iterations got by other threads

        } else {

          // got some iterations to work on

          status = 1;

          if ((T)remaining > chunkspec) {

            limit = init + chunkspec - 1;

          } else {

            last = true; // the last chunk

            limit = init + remaining - 1;

          } // if

        } // if

        break;

      } // if

      limit = init + (UT)((double)remaining *

                          *(double *)&pr->u.p.parm3); // divide by K*nproc

      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),

                               (ST)init, (ST)limit)) {

        // CAS was successful, chunk obtained

        status = 1;

        --limit;

        break;

      } // if

    } // while

    if (status != 0) {

      start = pr->u.p.lb;

      incr = pr->u.p.st;

      if (p_st != NULL)

        *p_st = incr;

      *p_lb = start + init * incr;

      *p_ub = start + limit * incr;

      if (pr->flags.ordered) {

        pr->u.p.ordered_lower = init;

        pr->u.p.ordered_upper = limit;

      } // if

    } else {

      *p_lb = 0;

      *p_ub = 0;

      if (p_st != NULL)

        *p_st = 0;

    } // if

  } // case

  break;


  case kmp_sch_guided_simd: {

    // same as iterative but curr-chunk adjusted to be multiple of given

    // chunk

    T chunk = pr->u.p.parm1;

    KD_TRACE(100,

             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",

              gtid));

    trip = pr->u.p.tc;

    // Start atomic part of calculations

    while (1) {

      ST remaining; // signed, because can be < 0

      init = sh->u.s.iteration; // shared value

      remaining = trip - init;

      if (remaining <= 0) { // AC: need to compare with 0 first

        status = 0; // nothing to do, don't try atomic op

        break;

      }

      KMP_DEBUG_ASSERT(chunk && init % chunk == 0);

      // compare with K*nproc*(chunk+1), K=2 by default

      if ((T)remaining < pr->u.p.parm2) {

        // use dynamic-style schedule

        // atomically increment iterations, get old value

        init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),

                                 (ST)chunk);

        remaining = trip - init;

        if (remaining <= 0) {

          status = 0; // all iterations got by other threads

        } else {

          // got some iterations to work on

          status = 1;

          if ((T)remaining > chunk) {

            limit = init + chunk - 1;

          } else {

            last = true; // the last chunk

            limit = init + remaining - 1;

          } // if

        } // if

        break;

      } // if

      // divide by K*nproc

      UT span;

      __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),

                         &span);

      UT rem = span % chunk;

      if (rem) // adjust so that span%chunk == 0

        span += chunk - rem;

      limit = init + span;

      if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),

                               (ST)init, (ST)limit)) {

        // CAS was successful, chunk obtained

        status = 1;

        --limit;

        break;

      } // if

    } // while

    if (status != 0) {

      start = pr->u.p.lb;

      incr = pr->u.p.st;

      if (p_st != NULL)

        *p_st = incr;

      *p_lb = start + init * incr;

      *p_ub = start + limit * incr;

      if (pr->flags.ordered) {

        pr->u.p.ordered_lower = init;

        pr->u.p.ordered_upper = limit;

      } // if

    } else {

      *p_lb = 0;

      *p_ub = 0;

      if (p_st != NULL)

        *p_st = 0;

    } // if

  } // case

  break;


  case kmp_sch_guided_analytical_chunked: {

    T chunkspec = pr->u.p.parm1;

    UT chunkIdx;

#if KMP_USE_X87CONTROL

    /* for storing original FPCW value for Windows* OS on

       IA-32 architecture 8-byte version */

    unsigned int oldFpcw;

    unsigned int fpcwSet = 0;

#endif

    KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "

                   "kmp_sch_guided_analytical_chunked case\n",

                   gtid));


    trip = pr->u.p.tc;


    KMP_DEBUG_ASSERT(nproc > 1);

    KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);


    while (1) { /* this while loop is a safeguard against unexpected zero

                   chunk sizes */

      chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);

      if (chunkIdx >= (UT)pr->u.p.parm2) {

        --trip;

        /* use dynamic-style scheduling */

        init = chunkIdx * chunkspec + pr->u.p.count;

        /* need to verify init > 0 in case of overflow in the above

         * calculation */

        if ((status = (init > 0 && init <= trip)) != 0) {

          limit = init + chunkspec - 1;


          if ((last = (limit >= trip)) != 0)

            limit = trip;

        }

        break;

      } else {

/* use exponential-style scheduling */

/* The following check is to workaround the lack of long double precision on

   Windows* OS.

   This check works around the possible effect that init != 0 for chunkIdx == 0.

 */

#if KMP_USE_X87CONTROL

        /* If we haven't already done so, save original

           FPCW and set precision to 64-bit, as Windows* OS

           on IA-32 architecture defaults to 53-bit */

        if (!fpcwSet) {

          oldFpcw = _control87(0, 0);

          _control87(_PC_64, _MCW_PC);

          fpcwSet = 0x30000;

        }

#endif

        if (chunkIdx) {

          init = __kmp_dispatch_guided_remaining<T>(

              trip, *(DBL *)&pr->u.p.parm3, chunkIdx);

          KMP_DEBUG_ASSERT(init);

          init = trip - init;

        } else

          init = 0;

        limit = trip - __kmp_dispatch_guided_remaining<T>(

                           trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);

        KMP_ASSERT(init <= limit);

        if (init < limit) {

          KMP_DEBUG_ASSERT(limit <= trip);

          --limit;

          status = 1;

          break;

        } // if

      } // if

    } // while (1)

#if KMP_USE_X87CONTROL

    /* restore FPCW if necessary

       AC: check fpcwSet flag first because oldFpcw can be uninitialized here

    */

    if (fpcwSet && (oldFpcw & fpcwSet))

      _control87(oldFpcw, _MCW_PC);

#endif

    if (status != 0) {

      start = pr->u.p.lb;

      incr = pr->u.p.st;

      if (p_st != NULL)

        *p_st = incr;

      *p_lb = start + init * incr;

      *p_ub = start + limit * incr;

      if (pr->flags.ordered) {

        pr->u.p.ordered_lower = init;

        pr->u.p.ordered_upper = limit;

      }

    } else {

      *p_lb = 0;

      *p_ub = 0;

      if (p_st != NULL)

        *p_st = 0;

    }

  } // case

  break;


  case kmp_sch_trapezoidal: {

    UT index;

    T parm2 = pr->u.p.parm2;

    T parm3 = pr->u.p.parm3;

    T parm4 = pr->u.p.parm4;

    KD_TRACE(100,

             ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",

              gtid));


    index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);


    init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;

    trip = pr->u.p.tc - 1;


    if ((status = ((T)index < parm3 && init <= trip)) == 0) {

      *p_lb = 0;

      *p_ub = 0;

      if (p_st != NULL)

        *p_st = 0;

    } else {

      start = pr->u.p.lb;

      limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;

      incr = pr->u.p.st;


      if ((last = (limit >= trip)) != 0)

        limit = trip;


      if (p_st != NULL)

        *p_st = incr;


      if (incr == 1) {

        *p_lb = start + init;

        *p_ub = start + limit;

      } else {

        *p_lb = start + init * incr;

        *p_ub = start + limit * incr;

      }


      if (pr->flags.ordered) {

        pr->u.p.ordered_lower = init;

        pr->u.p.ordered_upper = limit;

      } // if

    } // if

  } // case

  break;

  default: {

    status = 0; // to avoid complaints on uninitialized variable use

    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message

                KMP_HNT(GetNewerLibrary), // Hint

                __kmp_msg_null // Variadic argument list terminator

    );

  } break;

  } // switch

  if (p_last)

    *p_last = last;

#ifdef KMP_DEBUG

  if (pr->flags.ordered) {

    char *buff;

    // create format specifiers before the debug output

    buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "

                            "ordered_lower:%%%s ordered_upper:%%%s\n",

                            traits_t<UT>::spec, traits_t<UT>::spec);

    KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));

    __kmp_str_free(&buff);

  }

  {

    char *buff;

    // create format specifiers before the debug output

    buff = __kmp_str_format(

        "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "

        "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",

        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);

    KMP_DEBUG_ASSERT(p_last);

    KMP_DEBUG_ASSERT(p_st);

    KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));

    __kmp_str_free(&buff);

  }

#endif

  return status;

}


/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more

   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()

   is not called. */

#if OMPT_SUPPORT && OMPT_OPTIONAL

#define OMPT_LOOP_END                                                          \

  if (status == 0) {                                                           \

    if (ompt_enabled.ompt_callback_work) {                                     \

      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \

      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \

      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \

          ompt_get_work_schedule(pr->schedule), ompt_scope_end,                \

          &(team_info->parallel_data), &(task_info->task_data), 0, codeptr);   \

    }                                                                          \

  }

#define OMPT_LOOP_DISPATCH(lb, ub, st, status)                                 \

  if (ompt_enabled.ompt_callback_dispatch && status) {                         \

    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);                \

    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);              \

    ompt_dispatch_chunk_t chunk;                                               \

    ompt_data_t instance = ompt_data_none;                                     \

    OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st);                                \

    instance.ptr = &chunk;                                                     \

    ompt_callbacks.ompt_callback(ompt_callback_dispatch)(                      \

        &(team_info->parallel_data), &(task_info->task_data),                  \

        ompt_dispatch_ws_loop_chunk, instance);                                \

  }

// TODO: implement count

#else

#define OMPT_LOOP_END // no-op

#define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op

#endif


#if KMP_STATS_ENABLED

#define KMP_STATS_LOOP_END                                                     \

  {                                                                            \

    kmp_int64 u, l, t, i;                                                      \

    l = (kmp_int64)(*p_lb);                                                    \

    u = (kmp_int64)(*p_ub);                                                    \

    i = (kmp_int64)(pr->u.p.st);                                               \

    if (status == 0) {                                                         \

      t = 0;                                                                   \

      KMP_POP_PARTITIONED_TIMER();                                             \

    } else if (i == 1) {                                                       \

      if (u >= l)                                                              \

        t = u - l + 1;                                                         \

      else                                                                     \

        t = 0;                                                                 \

    } else if (i < 0) {                                                        \

      if (l >= u)                                                              \

        t = (l - u) / (-i) + 1;                                                \

      else                                                                     \

        t = 0;                                                                 \

    } else {                                                                   \

      if (u >= l)                                                              \

        t = (u - l) / i + 1;                                                   \

      else                                                                     \

        t = 0;                                                                 \

    }                                                                          \

    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \

  }

#else

#define KMP_STATS_LOOP_END /* Nothing */

#endif


template <typename T>

static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,

                               T *p_lb, T *p_ub,

                               typename traits_t<T>::signed_t *p_st

#if OMPT_SUPPORT && OMPT_OPTIONAL

                               ,

                               void *codeptr

#endif

) {


  typedef typename traits_t<T>::unsigned_t UT;

  typedef typename traits_t<T>::signed_t ST;

  // This is potentially slightly misleading, schedule(runtime) will appear here

  // even if the actual runtime schedule is static. (Which points out a

  // disadvantage of schedule(runtime): even when static scheduling is used it

  // costs more than a compile time choice to use static scheduling would.)

  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);


  int status;

  dispatch_private_info_template<T> *pr;

  __kmp_assert_valid_gtid(gtid);

  kmp_info_t *th = __kmp_threads[gtid];

  kmp_team_t *team = th->th.th_team;


  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL

  KD_TRACE(

      1000,

      ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",

       gtid, p_lb, p_ub, p_st, p_last));


  if (team->t.t_serialized) {

    /* NOTE: serialize this dispatch because we are not at the active level */

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(

        th->th.th_dispatch->th_disp_buffer); /* top of the stack */

    KMP_DEBUG_ASSERT(pr);


    if ((status = (pr->u.p.tc != 0)) == 0) {

      *p_lb = 0;

      *p_ub = 0;

      //            if ( p_last != NULL )

      //                *p_last = 0;

      if (p_st != NULL)

        *p_st = 0;

      if (__kmp_env_consistency_check) {

        if (pr->pushed_ws != ct_none) {

          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);

        }

      }

    } else if (pr->flags.nomerge) {

      kmp_int32 last;

      T start;

      UT limit, trip, init;

      ST incr;

      T chunk = pr->u.p.parm1;


      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",

                     gtid));


      init = chunk * pr->u.p.count++;

      trip = pr->u.p.tc - 1;


      if ((status = (init <= trip)) == 0) {

        *p_lb = 0;

        *p_ub = 0;

        //                if ( p_last != NULL )

        //                    *p_last = 0;

        if (p_st != NULL)

          *p_st = 0;

        if (__kmp_env_consistency_check) {

          if (pr->pushed_ws != ct_none) {

            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);

          }

        }

      } else {

        start = pr->u.p.lb;

        limit = chunk + init - 1;

        incr = pr->u.p.st;


        if ((last = (limit >= trip)) != 0) {

          limit = trip;

#if KMP_OS_WINDOWS

          pr->u.p.last_upper = pr->u.p.ub;

#endif /* KMP_OS_WINDOWS */

        }

        if (p_last != NULL)

          *p_last = last;

        if (p_st != NULL)

          *p_st = incr;

        if (incr == 1) {

          *p_lb = start + init;

          *p_ub = start + limit;

        } else {

          *p_lb = start + init * incr;

          *p_ub = start + limit * incr;

        }


        if (pr->flags.ordered) {

          pr->u.p.ordered_lower = init;

          pr->u.p.ordered_upper = limit;

#ifdef KMP_DEBUG

          {

            char *buff;

            // create format specifiers before the debug output

            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "

                                    "ordered_lower:%%%s ordered_upper:%%%s\n",

                                    traits_t<UT>::spec, traits_t<UT>::spec);

            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,

                            pr->u.p.ordered_upper));

            __kmp_str_free(&buff);

          }

#endif

        } // if

      } // if

    } else {

      pr->u.p.tc = 0;

      *p_lb = pr->u.p.lb;

      *p_ub = pr->u.p.ub;

#if KMP_OS_WINDOWS

      pr->u.p.last_upper = *p_ub;

#endif /* KMP_OS_WINDOWS */

      if (p_last != NULL)

        *p_last = TRUE;

      if (p_st != NULL)

        *p_st = pr->u.p.st;

    } // if

#ifdef KMP_DEBUG

    {

      char *buff;

      // create format specifiers before the debug output

      buff = __kmp_str_format(

          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "

          "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",

          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);

      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,

                    (p_last ? *p_last : 0), status));

      __kmp_str_free(&buff);

    }

#endif

#if INCLUDE_SSC_MARKS

    SSC_MARK_DISPATCH_NEXT();

#endif

    OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);

    OMPT_LOOP_END;

    KMP_STATS_LOOP_END;

    return status;

  } else {

    kmp_int32 last = 0;

    dispatch_shared_info_template<T> volatile *sh;


    KMP_DEBUG_ASSERT(th->th.th_dispatch ==

                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);


    pr = reinterpret_cast<dispatch_private_info_template<T> *>(

        th->th.th_dispatch->th_dispatch_pr_current);

    KMP_DEBUG_ASSERT(pr);

    sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(

        th->th.th_dispatch->th_dispatch_sh_current);

    KMP_DEBUG_ASSERT(sh);


#if KMP_USE_HIER_SCHED

    if (pr->flags.use_hier)

      status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);

    else

#endif // KMP_USE_HIER_SCHED

      status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,

                                                p_st, th->th.th_team_nproc,

                                                th->th.th_info.ds.ds_tid);

    // status == 0: no more iterations to execute

    if (status == 0) {

      ST num_done;

      num_done = test_then_inc<ST>(&sh->u.s.num_done);

#ifdef KMP_DEBUG

      {

        char *buff;

        // create format specifiers before the debug output

        buff = __kmp_str_format(

            "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",

            traits_t<ST>::spec);

        KD_TRACE(10, (buff, gtid, sh->u.s.num_done));

        __kmp_str_free(&buff);

      }

#endif


#if KMP_USE_HIER_SCHED

      pr->flags.use_hier = FALSE;

#endif

      if (num_done == th->th.th_team_nproc - 1) {

#if KMP_STATIC_STEAL_ENABLED

        if (pr->schedule == kmp_sch_static_steal) {

          int i;

          int idx = (th->th.th_dispatch->th_disp_index - 1) %

                    __kmp_dispatch_num_buffers; // current loop index

          // loop complete, safe to destroy locks used for stealing

          for (i = 0; i < th->th.th_team_nproc; ++i) {

            dispatch_private_info_template<T> *buf =

                reinterpret_cast<dispatch_private_info_template<T> *>(

                    &team->t.t_dispatch[i].th_disp_buffer[idx]);

            KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive

            KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);

            if (traits_t<T>::type_size > 4) {

              // destroy locks used for stealing

              kmp_lock_t *lck = buf->u.p.steal_lock;

              KMP_ASSERT(lck != NULL);

              __kmp_destroy_lock(lck);

              __kmp_free(lck);

              buf->u.p.steal_lock = NULL;

            }

          }

        }

#endif

        /* NOTE: release shared buffer to be reused */


        KMP_MB(); /* Flush all pending memory write invalidates.  */


        sh->u.s.num_done = 0;

        sh->u.s.iteration = 0;


        /* TODO replace with general release procedure? */

        if (pr->flags.ordered) {

          sh->u.s.ordered_iteration = 0;

        }


        KMP_MB(); /* Flush all pending memory write invalidates.  */


        sh->buffer_index += __kmp_dispatch_num_buffers;

        KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",

                       gtid, sh->buffer_index));


        KMP_MB(); /* Flush all pending memory write invalidates.  */


      } // if

      if (__kmp_env_consistency_check) {

        if (pr->pushed_ws != ct_none) {

          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);

        }

      }


      th->th.th_dispatch->th_deo_fcn = NULL;

      th->th.th_dispatch->th_dxo_fcn = NULL;

      th->th.th_dispatch->th_dispatch_sh_current = NULL;

      th->th.th_dispatch->th_dispatch_pr_current = NULL;

    } // if (status == 0)

#if KMP_OS_WINDOWS

    else if (last) {

      pr->u.p.last_upper = pr->u.p.ub;

    }

#endif /* KMP_OS_WINDOWS */

    if (p_last != NULL && status != 0)

      *p_last = last;

  } // if


#ifdef KMP_DEBUG

  {

    char *buff;

    // create format specifiers before the debug output

    buff = __kmp_str_format(

        "__kmp_dispatch_next: T#%%d normal case: "

        "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",

        traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);

    KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,

                  (p_last ? *p_last : 0), status));

    __kmp_str_free(&buff);

  }

#endif

#if INCLUDE_SSC_MARKS

  SSC_MARK_DISPATCH_NEXT();

#endif

  OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);

  OMPT_LOOP_END;

  KMP_STATS_LOOP_END;

  return status;

}


/*!

@ingroup WORK_SHARING

@param loc  source location information

@param global_tid  global thread number

@return Zero if the parallel region is not active and this thread should execute

all sections, non-zero otherwise.


Beginning of sections construct.

There are no implicit barriers in the "sections" calls, rather the compiler

should introduce an explicit barrier if it is required.


This implementation is based on __kmp_dispatch_init, using same constructs for

shared data (we can't have sections nested directly in omp for loop, there

should be a parallel region in between)

*/

kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid) {


  int active;

  kmp_info_t *th;

  kmp_team_t *team;

  kmp_uint32 my_buffer_index;

  dispatch_shared_info_template<kmp_int32> volatile *sh;


  KMP_DEBUG_ASSERT(__kmp_init_serial);


  if (!TCR_4(__kmp_init_parallel))

    __kmp_parallel_initialize();

  __kmp_resume_if_soft_paused();


  /* setup data */

  th = __kmp_threads[gtid];

  team = th->th.th_team;

  active = !team->t.t_serialized;

  th->th.th_ident = loc;


  KMP_COUNT_BLOCK(OMP_SECTIONS);

  KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));


  if (active) {

    // Setup sections in the same way as dynamic scheduled loops.

    // We need one shared data: which section is to execute next.

    // (in case parallel is not active, all sections will be executed on the

    // same thread)

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==

                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);


    my_buffer_index = th->th.th_dispatch->th_disp_index++;


    // reuse shared data structures from dynamic sched loops:

    sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(

        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);

    KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,

                  my_buffer_index));


    th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;

    th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;


    KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "

                   "sh->buffer_index:%d\n",

                   gtid, my_buffer_index, sh->buffer_index));

    __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,

                           __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));

    // Note: KMP_WAIT() cannot be used there: buffer index and

    // my_buffer_index are *always* 32-bit integers.

    KMP_MB();

    KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "

                   "sh->buffer_index:%d\n",

                   gtid, my_buffer_index, sh->buffer_index));


    th->th.th_dispatch->th_dispatch_pr_current =

        nullptr; // sections construct doesn't need private data

    th->th.th_dispatch->th_dispatch_sh_current =

        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);

  }


#if OMPT_SUPPORT && OMPT_OPTIONAL

  if (ompt_enabled.ompt_callback_work) {

    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);

    ompt_task_info_t *task_info = __ompt_get_task_info_object(0);

    ompt_callbacks.ompt_callback(ompt_callback_work)(

        ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),

        &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));

  }

#endif

  KMP_PUSH_PARTITIONED_TIMER(OMP_sections);


  return active;

}


/*!

@ingroup WORK_SHARING

@param loc  source location information

@param global_tid  global thread number

@param numberOfSections  number of sections in the 'sections' construct

@return unsigned [from 0 to n) - number (id) of the section to execute next on

this thread. n (or any other number not in range) - nothing to execute on this

thread

*/


kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid,

                              kmp_int32 numberOfSections) {


  KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead);


  kmp_info_t *th = __kmp_threads[gtid];

#ifdef KMP_DEBUG

  kmp_team_t *team = th->th.th_team;

#endif


  KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,

                  numberOfSections));


  // For serialized case we should not call this function:

  KMP_DEBUG_ASSERT(!team->t.t_serialized);


  dispatch_shared_info_template<kmp_int32> volatile *sh;


  KMP_DEBUG_ASSERT(th->th.th_dispatch ==

                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);


  KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));

  sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(

      th->th.th_dispatch->th_dispatch_sh_current);

  KMP_DEBUG_ASSERT(sh);


  kmp_int32 sectionIndex = 0;

  bool moreSectionsToExecute = true;


  // Find section to execute:

  sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);

  if (sectionIndex >= numberOfSections) {

    moreSectionsToExecute = false;

  }


  // status == 0: no more sections to execute;

  // OMPTODO: __kmpc_end_sections could be bypassed?

  if (!moreSectionsToExecute) {

    kmp_int32 num_done;


    num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));


    if (num_done == th->th.th_team_nproc - 1) {

      /* NOTE: release this buffer to be reused */


      KMP_MB(); /* Flush all pending memory write invalidates.  */


      sh->u.s.num_done = 0;

      sh->u.s.iteration = 0;


      KMP_MB(); /* Flush all pending memory write invalidates.  */


      sh->buffer_index += __kmp_dispatch_num_buffers;

      KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,

                     sh->buffer_index));


      KMP_MB(); /* Flush all pending memory write invalidates.  */


    } // if


    th->th.th_dispatch->th_deo_fcn = NULL;

    th->th.th_dispatch->th_dxo_fcn = NULL;

    th->th.th_dispatch->th_dispatch_sh_current = NULL;

    th->th.th_dispatch->th_dispatch_pr_current = NULL;


#if OMPT_SUPPORT && OMPT_OPTIONAL

    if (ompt_enabled.ompt_callback_dispatch) {

      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);

      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);

      ompt_data_t instance = ompt_data_none;

      instance.ptr = OMPT_GET_RETURN_ADDRESS(0);

      ompt_callbacks.ompt_callback(ompt_callback_dispatch)(

          &(team_info->parallel_data), &(task_info->task_data),

          ompt_dispatch_section, instance);

    }

#endif

  }


  return sectionIndex;

}


/*!

@ingroup WORK_SHARING

@param loc  source location information

@param global_tid  global thread number


End of "sections" construct.

Don't need to wait here: barrier is added separately when needed.

*/

void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid) {


  kmp_info_t *th = __kmp_threads[gtid];

  int active = !th->th.th_team->t.t_serialized;


  KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));


  if (!active) {

    // In active case call finalization is done in __kmpc_next_section

#if OMPT_SUPPORT && OMPT_OPTIONAL

    if (ompt_enabled.ompt_callback_work) {

      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);

      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);

      ompt_callbacks.ompt_callback(ompt_callback_work)(

          ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),

          &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));

    }

#endif

  }


  KMP_POP_PARTITIONED_TIMER();

  KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));

}


template <typename T>

static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,

                                  kmp_int32 *plastiter, T *plower, T *pupper,

                                  typename traits_t<T>::signed_t incr) {

  typedef typename traits_t<T>::unsigned_t UT;

  kmp_uint32 team_id;

  kmp_uint32 nteams;

  UT trip_count;

  kmp_team_t *team;

  kmp_info_t *th;


  KMP_DEBUG_ASSERT(plastiter && plower && pupper);

  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));

#ifdef KMP_DEBUG

  typedef typename traits_t<T>::signed_t ST;

  {

    char *buff;

    // create format specifiers before the debug output

    buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "

                            "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",

                            traits_t<T>::spec, traits_t<T>::spec,

                            traits_t<ST>::spec, traits_t<T>::spec);

    KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));

    __kmp_str_free(&buff);

  }

#endif


  if (__kmp_env_consistency_check) {

    if (incr == 0) {

      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,

                            loc);

    }

    if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {

      // The loop is illegal.

      // Some zero-trip loops maintained by compiler, e.g.:

      //   for(i=10;i<0;++i) // lower >= upper - run-time check

      //   for(i=0;i>10;--i) // lower <= upper - run-time check

      //   for(i=0;i>10;++i) // incr > 0       - compile-time check

      //   for(i=10;i<0;--i) // incr < 0       - compile-time check

      // Compiler does not check the following illegal loops:

      //   for(i=0;i<10;i+=incr) // where incr<0

      //   for(i=10;i>0;i-=incr) // where incr<0

      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);

    }

  }

  __kmp_assert_valid_gtid(gtid);

  th = __kmp_threads[gtid];

  team = th->th.th_team;

  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct

  nteams = th->th.th_teams_size.nteams;

  team_id = team->t.t_master_tid;

  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);


  // compute global trip count

  if (incr == 1) {

    trip_count = *pupper - *plower + 1;

  } else if (incr == -1) {

    trip_count = *plower - *pupper + 1;

  } else if (incr > 0) {

    // upper-lower can exceed the limit of signed type

    trip_count = (UT)(*pupper - *plower) / incr + 1;

  } else {

    trip_count = (UT)(*plower - *pupper) / (-incr) + 1;

  }


  if (trip_count <= nteams) {

    KMP_DEBUG_ASSERT(

        __kmp_static == kmp_sch_static_greedy ||

        __kmp_static ==

            kmp_sch_static_balanced); // Unknown static scheduling type.

    // only some teams get single iteration, others get nothing

    if (team_id < trip_count) {

      *pupper = *plower = *plower + team_id * incr;

    } else {

      *plower = *pupper + incr; // zero-trip loop

    }

    if (plastiter != NULL)

      *plastiter = (team_id == trip_count - 1);

  } else {

    if (__kmp_static == kmp_sch_static_balanced) {

      UT chunk = trip_count / nteams;

      UT extras = trip_count % nteams;

      *plower +=

          incr * (team_id * chunk + (team_id < extras ? team_id : extras));

      *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);

      if (plastiter != NULL)

        *plastiter = (team_id == nteams - 1);

    } else {

      T chunk_inc_count =

          (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;

      T upper = *pupper;

      KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);

      // Unknown static scheduling type.

      *plower += team_id * chunk_inc_count;

      *pupper = *plower + chunk_inc_count - incr;

      // Check/correct bounds if needed

      if (incr > 0) {

        if (*pupper < *plower)

          *pupper = traits_t<T>::max_value;

        if (plastiter != NULL)

          *plastiter = *plower <= upper && *pupper > upper - incr;

        if (*pupper > upper)

          *pupper = upper; // tracker C73258

      } else {

        if (*pupper > *plower)

          *pupper = traits_t<T>::min_value;

        if (plastiter != NULL)

          *plastiter = *plower >= upper && *pupper < upper - incr;

        if (*pupper < upper)

          *pupper = upper; // tracker C73258

      }

    }

  }

}


//-----------------------------------------------------------------------------

// Dispatch routines

//    Transfer call to template< type T >

//    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,

//                         T lb, T ub, ST st, ST chunk )

extern "C" {


/*!

@ingroup WORK_SHARING

@{

@param loc Source location

@param gtid Global thread id

@param schedule Schedule type

@param lb  Lower bound

@param ub  Upper bound

@param st  Step (or increment if you prefer)

@param chunk The chunk size to block with


This function prepares the runtime to start a dynamically scheduled for loop,

saving the loop arguments.

These functions are all identical apart from the types of the arguments.

*/


void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,

                            enum sched_type schedule, kmp_int32 lb,

                            kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {

  KMP_DEBUG_ASSERT(__kmp_init_serial);

#if OMPT_SUPPORT && OMPT_OPTIONAL

  OMPT_STORE_RETURN_ADDRESS(gtid);

#endif

  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);

}

/*!

See @ref __kmpc_dispatch_init_4

*/

void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,

                             enum sched_type schedule, kmp_uint32 lb,

                             kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {

  KMP_DEBUG_ASSERT(__kmp_init_serial);

#if OMPT_SUPPORT && OMPT_OPTIONAL

  OMPT_STORE_RETURN_ADDRESS(gtid);

#endif

  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);

}


/*!

See @ref __kmpc_dispatch_init_4

*/

void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,

                            enum sched_type schedule, kmp_int64 lb,

                            kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {

  KMP_DEBUG_ASSERT(__kmp_init_serial);

#if OMPT_SUPPORT && OMPT_OPTIONAL

  OMPT_STORE_RETURN_ADDRESS(gtid);

#endif

  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);

}


/*!

See @ref __kmpc_dispatch_init_4

*/

void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,

                             enum sched_type schedule, kmp_uint64 lb,

                             kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {

  KMP_DEBUG_ASSERT(__kmp_init_serial);

#if OMPT_SUPPORT && OMPT_OPTIONAL

  OMPT_STORE_RETURN_ADDRESS(gtid);

#endif

  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);

}


/*!

See @ref __kmpc_dispatch_init_4


Difference from __kmpc_dispatch_init set of functions is these functions

are called for composite distribute parallel for construct. Thus before

regular iterations dispatching we need to calc per-team iteration space.


These functions are all identical apart from the types of the arguments.

*/

void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,

                                 enum sched_type schedule, kmp_int32 *p_last,

                                 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,

                                 kmp_int32 chunk) {

  KMP_DEBUG_ASSERT(__kmp_init_serial);

#if OMPT_SUPPORT && OMPT_OPTIONAL

  OMPT_STORE_RETURN_ADDRESS(gtid);

#endif

  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);

  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);

}


void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,

                                  enum sched_type schedule, kmp_int32 *p_last,

                                  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,

                                  kmp_int32 chunk) {

  KMP_DEBUG_ASSERT(__kmp_init_serial);

#if OMPT_SUPPORT && OMPT_OPTIONAL

  OMPT_STORE_RETURN_ADDRESS(gtid);

#endif

  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);

  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);

}


void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,

                                 enum sched_type schedule, kmp_int32 *p_last,

                                 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,

                                 kmp_int64 chunk) {

  KMP_DEBUG_ASSERT(__kmp_init_serial);

#if OMPT_SUPPORT && OMPT_OPTIONAL

  OMPT_STORE_RETURN_ADDRESS(gtid);

#endif

  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);

  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);

}


void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,

                                  enum sched_type schedule, kmp_int32 *p_last,

                                  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,

                                  kmp_int64 chunk) {

  KMP_DEBUG_ASSERT(__kmp_init_serial);

#if OMPT_SUPPORT && OMPT_OPTIONAL

  OMPT_STORE_RETURN_ADDRESS(gtid);

#endif

  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);

  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);

}


/*!

@param loc Source code location

@param gtid Global thread id

@param p_last Pointer to a flag set to one if this is the last chunk or zero

otherwise

@param p_lb   Pointer to the lower bound for the next chunk of work

@param p_ub   Pointer to the upper bound for the next chunk of work

@param p_st   Pointer to the stride for the next chunk of work

@return one if there is work to be done, zero otherwise


Get the next dynamically allocated chunk of work for this thread.

If there is no more work, then the lb,ub and stride need not be modified.

*/

int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,

                           kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {

#if OMPT_SUPPORT && OMPT_OPTIONAL

  OMPT_STORE_RETURN_ADDRESS(gtid);

#endif

  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st

#if OMPT_SUPPORT && OMPT_OPTIONAL

                                        ,

                                        OMPT_LOAD_RETURN_ADDRESS(gtid)

#endif

  );

}


/*!

See @ref __kmpc_dispatch_next_4

*/

int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,

                            kmp_uint32 *p_lb, kmp_uint32 *p_ub,

                            kmp_int32 *p_st) {

#if OMPT_SUPPORT && OMPT_OPTIONAL

  OMPT_STORE_RETURN_ADDRESS(gtid);

#endif

  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st

#if OMPT_SUPPORT && OMPT_OPTIONAL

                                         ,

                                         OMPT_LOAD_RETURN_ADDRESS(gtid)

#endif

  );

}


/*!

See @ref __kmpc_dispatch_next_4

*/

int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,

                           kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {

#if OMPT_SUPPORT && OMPT_OPTIONAL

  OMPT_STORE_RETURN_ADDRESS(gtid);

#endif

  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st

#if OMPT_SUPPORT && OMPT_OPTIONAL

                                        ,

                                        OMPT_LOAD_RETURN_ADDRESS(gtid)

#endif

  );

}


/*!

See @ref __kmpc_dispatch_next_4

*/

int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,

                            kmp_uint64 *p_lb, kmp_uint64 *p_ub,

                            kmp_int64 *p_st) {

#if OMPT_SUPPORT && OMPT_OPTIONAL

  OMPT_STORE_RETURN_ADDRESS(gtid);

#endif

  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st

#if OMPT_SUPPORT && OMPT_OPTIONAL

                                         ,

                                         OMPT_LOAD_RETURN_ADDRESS(gtid)

#endif

  );

}


/*!

@param loc Source code location

@param gtid Global thread id


Mark the end of a dynamic loop.

*/

void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {

  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);

}


/*!

See @ref __kmpc_dispatch_fini_4

*/

void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {

  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);

}


/*!

See @ref __kmpc_dispatch_fini_4

*/

void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {

  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);

}


/*!

See @ref __kmpc_dispatch_fini_4

*/

void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {

  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);

}


/*!

See @ref __kmpc_dispatch_deinit

*/

void __kmpc_dispatch_deinit(ident_t *loc, kmp_int32 gtid) {}

/*! @} */


//-----------------------------------------------------------------------------

// Non-template routines from kmp_dispatch.cpp used in other sources


kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {

  return value == checker;

}


kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {

  return value != checker;

}


kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {

  return value < checker;

}


kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {

  return value >= checker;

}


kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {

  return value <= checker;

}


kmp_uint32

__kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,

             kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),

             void *obj // Higher-level synchronization object, or NULL.

) {

  // note: we may not belong to a team at this point

  volatile kmp_uint32 *spin = spinner;

  kmp_uint32 check = checker;

  kmp_uint32 spins;

  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;

  kmp_uint32 r;

  kmp_uint64 time;


  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));

  KMP_INIT_YIELD(spins);

  KMP_INIT_BACKOFF(time);

  // main wait spin loop

  while (!f(r = TCR_4(*spin), check)) {

    KMP_FSYNC_SPIN_PREPARE(obj);

    /* GEH - remove this since it was accidentally introduced when kmp_wait was

       split. It causes problems with infinite recursion because of exit lock */

    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)

        __kmp_abort_thread(); */

    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);

  }

  KMP_FSYNC_SPIN_ACQUIRED(obj);

  return r;

}


void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,

                      kmp_uint32 (*pred)(void *, kmp_uint32),

                      void *obj // Higher-level synchronization object, or NULL.

) {

  // note: we may not belong to a team at this point

  void *spin = spinner;

  kmp_uint32 check = checker;

  kmp_uint32 spins;

  kmp_uint32 (*f)(void *, kmp_uint32) = pred;

  kmp_uint64 time;


  KMP_FSYNC_SPIN_INIT(obj, spin);

  KMP_INIT_YIELD(spins);

  KMP_INIT_BACKOFF(time);

  // main wait spin loop

  while (!f(spin, check)) {

    KMP_FSYNC_SPIN_PREPARE(obj);

    /* if we have waited a bit, or are noversubscribed, yield */

    /* pause is in the following code */

    KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);

  }

  KMP_FSYNC_SPIN_ACQUIRED(obj);

}


} // extern "C"


#ifdef KMP_GOMP_COMPAT


void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,

                               enum sched_type schedule, kmp_int32 lb,

                               kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,

                               int push_ws) {

  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,

                                 push_ws);

}


void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,

                                enum sched_type schedule, kmp_uint32 lb,

                                kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,

                                int push_ws) {

  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,

                                  push_ws);

}


void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,

                               enum sched_type schedule, kmp_int64 lb,

                               kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,

                               int push_ws) {

  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,

                                 push_ws);

}


void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,

                                enum sched_type schedule, kmp_uint64 lb,

                                kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,

                                int push_ws) {

  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,

                                  push_ws);

}


void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {

  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);

}


void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {

  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);

}


void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {

  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);

}


void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {

  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);

}


#endif /* KMP_GOMP_COMPAT */


/* ------------------------------------------------------------------------ */

buf
char buf[BUFFER_SIZE]
Definition: affinity_values.c:34

f
void f()
Definition: api_calls_from_other_thread.cpp:7

target
void * target(void *task)
Definition: bug_nested_proxy_task.c:99

kmp_uint32

kmp_int64
int64_t kmp_int64
Definition: common.h:10

b
int b
Definition: kmp_set_dispatch_buf.c:27

__kmpc_dist_dispatch_init_4u
void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk)
Definition: kmp_dispatch.cpp:2871

__kmpc_dispatch_next_4
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st)
Definition: kmp_dispatch.cpp:2920

sched_type
sched_type
Describes the loop schedule to be used for a parallel for loop.
Definition: kmp.h:370

__kmpc_dist_dispatch_init_8u
void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk)
Definition: kmp_dispatch.cpp:2895

__kmpc_dispatch_fini_4
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid)
Definition: kmp_dispatch.cpp:2989

__kmpc_dispatch_deinit
void __kmpc_dispatch_deinit(ident_t *loc, kmp_int32 gtid)
See __kmpc_dispatch_deinit.
Definition: kmp_dispatch.cpp:3017

__kmpc_next_section
kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid, kmp_int32 numberOfSections)
Definition: kmp_dispatch.cpp:2551

__kmpc_dist_dispatch_init_8
void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk)
Definition: kmp_dispatch.cpp:2883

__kmpc_dist_dispatch_init_4
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk)
See __kmpc_dispatch_init_4.
Definition: kmp_dispatch.cpp:2859

__kmpc_dispatch_next_4u
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st)
See __kmpc_dispatch_next_4.
Definition: kmp_dispatch.cpp:2936

__kmpc_end_sections
void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid)
Definition: kmp_dispatch.cpp:2640

__kmpc_dispatch_next_8u
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st)
See __kmpc_dispatch_next_4.
Definition: kmp_dispatch.cpp:2969

__kmpc_dispatch_fini_8
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid)
See __kmpc_dispatch_fini_4.
Definition: kmp_dispatch.cpp:2996

__kmpc_sections_init
kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid)
Definition: kmp_dispatch.cpp:2467

__kmpc_dispatch_next_8
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st)
See __kmpc_dispatch_next_4.
Definition: kmp_dispatch.cpp:2953

__kmpc_dispatch_fini_8u
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid)
See __kmpc_dispatch_fini_4.
Definition: kmp_dispatch.cpp:3010

__kmpc_dispatch_init_4
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk)
Definition: kmp_dispatch.cpp:2802

__kmpc_dispatch_init_4u
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk)
See __kmpc_dispatch_init_4.
Definition: kmp_dispatch.cpp:2814

__kmpc_dispatch_init_8u
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk)
See __kmpc_dispatch_init_4.
Definition: kmp_dispatch.cpp:2840

__kmpc_dispatch_fini_4u
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid)
See __kmpc_dispatch_fini_4.
Definition: kmp_dispatch.cpp:3003

__kmpc_dispatch_init_8
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk)
See __kmpc_dispatch_init_4.
Definition: kmp_dispatch.cpp:2827

kmp_sch_runtime_simd
@ kmp_sch_runtime_simd
runtime with chunk adjustment
Definition: kmp.h:392

kmp_sch_auto
@ kmp_sch_auto
auto
Definition: kmp.h:377

kmp_sch_static
@ kmp_sch_static
static unspecialized
Definition: kmp.h:373

kmp_sch_guided_simd
@ kmp_sch_guided_simd
guided with chunk adjustment
Definition: kmp.h:391

kmp_sch_runtime
@ kmp_sch_runtime
Definition: kmp.h:376

kmp_sch_guided_chunked
@ kmp_sch_guided_chunked
guided unspecialized
Definition: kmp.h:375

kmp_sch_dynamic_chunked
@ kmp_sch_dynamic_chunked
Definition: kmp.h:374

kmp_sch_guided_analytical_chunked
@ kmp_sch_guided_analytical_chunked
Definition: kmp.h:385

kmp_sch_static_balanced
@ kmp_sch_static_balanced
Definition: kmp.h:382

kmp_sch_static_greedy
@ kmp_sch_static_greedy
Definition: kmp.h:381

kmp_sch_lower
@ kmp_sch_lower
lower bound for unordered values
Definition: kmp.h:371

kmp_sch_static_chunked
@ kmp_sch_static_chunked
Definition: kmp.h:372

kmp_sch_trapezoidal
@ kmp_sch_trapezoidal
Definition: kmp.h:378

kmp_nm_upper
@ kmp_nm_upper
upper bound for nomerge values
Definition: kmp.h:442

kmp_ord_lower
@ kmp_ord_lower
lower bound for ordered values, must be power of 2
Definition: kmp.h:397

kmp_sch_guided_iterative_chunked
@ kmp_sch_guided_iterative_chunked
Definition: kmp.h:384

kmp_sch_static_balanced_chunked
@ kmp_sch_static_balanced_chunked
Definition: kmp.h:390

kmp_sch_upper
@ kmp_sch_upper
upper bound for unordered values
Definition: kmp.h:395

kmp_nm_lower
@ kmp_nm_lower
lower bound for nomerge values
Definition: kmp.h:415

kmp_sch_static_steal
@ kmp_sch_static_steal
Definition: kmp.h:387

d
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d
Definition: ittnotify_static.h:200

instance
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance * instance
Definition: ittnotify_static.h:217

count
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t count
Definition: ittnotify_static.h:282

value
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain __itt_id ITT_FORMAT p const __itt_domain __itt_id __itt_timestamp __itt_timestamp ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain ITT_FORMAT p const __itt_domain __itt_string_handle unsigned long long value
Definition: ittnotify_static.h:394

p
void const char const char int ITT_FORMAT __itt_group_sync p
Definition: ittnotify_static.h:154

int
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain __itt_id ITT_FORMAT p const __itt_domain __itt_id __itt_timestamp __itt_timestamp ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain ITT_FORMAT p const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_id __itt_string_handle __itt_metadata_type size_t void ITT_FORMAT p const __itt_domain __itt_id __itt_string_handle const wchar_t size_t ITT_FORMAT lu const __itt_domain __itt_id __itt_relation __itt_id ITT_FORMAT p const wchar_t int ITT_FORMAT __itt_group_mark d int
Definition: ittnotify_static.h:462

type
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain __itt_id ITT_FORMAT p const __itt_domain __itt_id __itt_timestamp __itt_timestamp ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain ITT_FORMAT p const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_id __itt_string_handle __itt_metadata_type type
Definition: ittnotify_static.h:415

kmp.h

__kmp_free
#define __kmp_free(ptr)
Definition: kmp.h:3765

KMP_CPU_PAUSE
#define KMP_CPU_PAUSE()
Definition: kmp.h:1573

KMP_DEFAULT_CHUNK
#define KMP_DEFAULT_CHUNK
Definition: kmp.h:1297

ct_ordered_in_pdo
@ ct_ordered_in_pdo
Definition: kmp.h:1679

ct_none
@ ct_none
Definition: kmp.h:1671

ct_pdo_ordered
@ ct_pdo_ordered
Definition: kmp.h:1674

ct_pdo
@ ct_pdo
Definition: kmp.h:1673

KMP_YIELD_OVERSUB_ELSE_SPIN
#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time)
Definition: kmp.h:1649

dispatch_private_info_t
struct KMP_ALIGN_CACHE dispatch_private_info dispatch_private_info_t

SCHEDULE_MONOTONIC
#define SCHEDULE_MONOTONIC
Definition: kmp.h:476

SCHEDULE_HAS_MONOTONIC
#define SCHEDULE_HAS_MONOTONIC(s)
Definition: kmp.h:466

__kmp_auto
enum sched_type __kmp_auto
Definition: kmp_global.cpp:150

__kmp_static
enum sched_type __kmp_static
Definition: kmp_global.cpp:146

__kmp_force_monotonic
int __kmp_force_monotonic
Definition: kmp_global.cpp:173

__kmp_threads
kmp_info_t ** __kmp_threads
Definition: kmp_global.cpp:450

__kmp_dispatch_num_buffers
int __kmp_dispatch_num_buffers
Definition: kmp_global.cpp:135

SCHEDULE_WITHOUT_MODIFIERS
#define SCHEDULE_WITHOUT_MODIFIERS(s)
Definition: kmp.h:463

KMP_INIT_YIELD
#define KMP_INIT_YIELD(count)
Definition: kmp.h:1576

KMP_MASTER_GTID
#define KMP_MASTER_GTID(gtid)
Definition: kmp.h:1327

__kmp_parallel_initialize
void __kmp_parallel_initialize(void)
Definition: kmp_runtime.cpp:7529

KMP_INIT_BACKOFF
#define KMP_INIT_BACKOFF(time)
Definition: kmp.h:1579

__kmp_init_parallel
volatile int __kmp_init_parallel
Definition: kmp_global.cpp:49

__kmp_allocate
#define __kmp_allocate(size)
Definition: kmp.h:3763

TRUE
#define TRUE
Definition: kmp.h:1333

FALSE
#define FALSE
Definition: kmp.h:1332

__kmp_is_hybrid_cpu
static bool __kmp_is_hybrid_cpu()
Definition: kmp.h:3368

__kmp_env_consistency_check
int __kmp_env_consistency_check
Definition: kmp_global.cpp:423

SCHEDULE_HAS_NONMONOTONIC
#define SCHEDULE_HAS_NONMONOTONIC(s)
Definition: kmp.h:467

__kmp_gtid_from_tid
static int __kmp_gtid_from_tid(int tid, const kmp_team_t *team)
Definition: kmp.h:3633

__kmp_guided
enum sched_type __kmp_guided
Definition: kmp_global.cpp:148

__kmp_resume_if_soft_paused
void __kmp_resume_if_soft_paused()
Definition: kmp_runtime.cpp:9085

__kmp_assert_valid_gtid
static void __kmp_assert_valid_gtid(kmp_int32 gtid)
Definition: kmp.h:3653

kmp_hw_core_type_t
kmp_hw_core_type_t
Definition: kmp.h:645

KMP_HW_CORE_TYPE_UNKNOWN
@ KMP_HW_CORE_TYPE_UNKNOWN
Definition: kmp.h:646

__kmp_init_serial
volatile int __kmp_init_serial
Definition: kmp_global.cpp:44

__kmp_type_convert
static void __kmp_type_convert(T1 src, T2 *dest)
Definition: kmp.h:4886

kmp_info_t
union KMP_ALIGN_CACHE kmp_info kmp_info_t

SCHEDULE_NONMONOTONIC
#define SCHEDULE_NONMONOTONIC
Definition: kmp.h:475

double
KMP_ARCH_X86 KMP_ARCH_X86 long double
Definition: kmp_atomic.cpp:2069

kmp_uint32
KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86<<, 2i, 1, KMP_ARCH_X86) ATOMIC_CMPXCHG(fixed2, shr, kmp_int16, 16, > KMP_ARCH_X86 KMP_ARCH_X86 kmp_uint32
Definition: kmp_atomic.cpp:1031

KE_TRACE
#define KE_TRACE(d, x)
Definition: kmp_debug.h:161

KMP_ASSERT
#define KMP_ASSERT(cond)
Definition: kmp_debug.h:59

KMP_BUILD_ASSERT
#define KMP_BUILD_ASSERT(expr)
Definition: kmp_debug.h:26

KD_TRACE
#define KD_TRACE(d, x)
Definition: kmp_debug.h:160

KMP_DEBUG_ASSERT
#define KMP_DEBUG_ASSERT(cond)
Definition: kmp_debug.h:61

KMP_ASSERT2
#define KMP_ASSERT2(cond, msg)
Definition: kmp_debug.h:60

kmp_uint64
unsigned long long kmp_uint64
Definition: kmp_detach_tasks_t1.c:12

__kmp_ge_4
kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker)
Definition: kmp_dispatch.cpp:3035

GUIDED_ANALYTICAL_WORKAROUND
#define GUIDED_ANALYTICAL_WORKAROUND

OMPT_LOOP_END
#define OMPT_LOOP_END
Definition: kmp_dispatch.cpp:2143

__kmp_dispatch_next_algorithm
int __kmp_dispatch_next_algorithm(int gtid, dispatch_private_info_template< T > *pr, dispatch_shared_info_template< T > volatile *sh, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st, T nproc, T tid)
Definition: kmp_dispatch.cpp:1323

__kmp_wait_4
kmp_uint32 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, kmp_uint32(*pred)(kmp_uint32, kmp_uint32), void *obj)
Definition: kmp_dispatch.cpp:3044

__kmp_wait_4_ptr
void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, kmp_uint32(*pred)(void *, kmp_uint32), void *obj)
Definition: kmp_dispatch.cpp:3072

KMP_STATS_LOOP_END
#define KMP_STATS_LOOP_END
Definition: kmp_dispatch.cpp:2176

__kmp_dispatch_next
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st)
Definition: kmp_dispatch.cpp:2180

__kmp_eq_4
kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker)
Definition: kmp_dispatch.cpp:3023

__kmp_initialize_self_buffer
void __kmp_initialize_self_buffer(kmp_team_t *team, T id, dispatch_private_info_template< T > *pr, typename traits_t< T >::unsigned_t nchunks, T nproc, typename traits_t< T >::unsigned_t &init, T &small_chunk, T &extras, T &p_extra)
Definition: kmp_dispatch.cpp:105

__kmp_dispatch_deo_error
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref)
Definition: kmp_dispatch.cpp:42

__kmp_lt_4
kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker)
Definition: kmp_dispatch.cpp:3031

__kmp_dispatch_finish
static void __kmp_dispatch_finish(int gtid, ident_t *loc)
Definition: kmp_dispatch.cpp:1181

__kmp_neq_4
kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker)
Definition: kmp_dispatch.cpp:3027

__kmp_dist_get_bounds
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, kmp_int32 *plastiter, T *plower, T *pupper, typename traits_t< T >::signed_t incr)
Definition: kmp_dispatch.cpp:2665

__kmp_dispatch_init_algorithm
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, dispatch_private_info_template< T > *pr, enum sched_type schedule, T lb, T ub, typename traits_t< T >::signed_t st, typename traits_t< T >::signed_t chunk, T nproc, T tid)
Definition: kmp_dispatch.cpp:184

__kmp_le_4
kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker)
Definition: kmp_dispatch.cpp:3039

__kmp_dispatch_dxo_error
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref)
Definition: kmp_dispatch.cpp:60

OMPT_LOOP_DISPATCH
#define OMPT_LOOP_DISPATCH(lb, ub, st, status)
Definition: kmp_dispatch.cpp:2144

__kmp_dispatch_init
static void __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, T ub, typename traits_t< T >::signed_t st, typename traits_t< T >::signed_t chunk, int push_ws)
Definition: kmp_dispatch.cpp:962

__kmp_get_monotonicity
static int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule, bool use_hier=false)
Definition: kmp_dispatch.cpp:72

kmp_dispatch.h

__kmp_dispatch_guided_remaining
static __inline traits_t< T >::unsigned_t __kmp_dispatch_guided_remaining(T tc, typename traits_t< T >::floating_t base, typename traits_t< T >::unsigned_t idx)
Definition: kmp_dispatch.h:487

guided_int_param
static const int guided_int_param
Definition: kmp_dispatch.h:511

test_then_inc< kmp_int32 >
__forceinline kmp_int32 test_then_inc< kmp_int32 >(volatile kmp_int32 *p)
Definition: kmp_dispatch.h:238

guided_flt_param
static const double guided_flt_param
Definition: kmp_dispatch.h:512

kmp_dispatch_hier.h

__kmp_hier_scheds
kmp_hier_sched_env_t __kmp_hier_scheds

__kmp_dispatch_free_hierarchies
void __kmp_dispatch_free_hierarchies(kmp_team_t *team)

__kmp_push_sync
void __kmp_push_sync(int gtid, enum cons_type ct, ident_t const *ident, kmp_user_lock_p lck)
Definition: kmp_error.cpp:338

__kmp_pop_workshare
enum cons_type __kmp_pop_workshare(int gtid, enum cons_type ct, ident_t const *ident)
Definition: kmp_error.cpp:383

__kmp_error_construct
void __kmp_error_construct(kmp_i18n_id_t id, enum cons_type ct, ident_t const *ident)
Definition: kmp_error.cpp:112

__kmp_pop_sync
void __kmp_pop_sync(int gtid, enum cons_type ct, ident_t const *ident)
Definition: kmp_error.cpp:411

kmp_error.h

status
static volatile kmp_i18n_cat_status_t status
Definition: kmp_i18n.cpp:48

__kmp_msg_null
kmp_msg_t __kmp_msg_null
Definition: kmp_i18n.cpp:36

__kmp_fatal
void __kmp_fatal(kmp_msg_t message,...)
Definition: kmp_i18n.cpp:864

kmp_i18n.h

KMP_WARNING
#define KMP_WARNING(...)
Definition: kmp_i18n.h:144

KMP_MSG
#define KMP_MSG(...)
Definition: kmp_i18n.h:121

KMP_HNT
#define KMP_HNT(...)
Definition: kmp_i18n.h:122

kmp_itt.h

KMP_FSYNC_SPIN_ACQUIRED
#define KMP_FSYNC_SPIN_ACQUIRED(obj)
Definition: kmp_itt.h:339

KMP_FSYNC_SPIN_PREPARE
#define KMP_FSYNC_SPIN_PREPARE(obj)
Definition: kmp_itt.h:338

USE_ITT_BUILD_ARG
#define USE_ITT_BUILD_ARG(x)
Definition: kmp_itt.h:346

KMP_FSYNC_SPIN_INIT
#define KMP_FSYNC_SPIN_INIT(obj, spin)
Definition: kmp_itt.h:337

kmp_lock.h

__kmp_acquire_lock
static int __kmp_acquire_lock(kmp_lock_t *lck, kmp_int32 gtid)
Definition: kmp_lock.h:559

__kmp_init_lock
static void __kmp_init_lock(kmp_lock_t *lck)
Definition: kmp_lock.h:571

__kmp_destroy_lock
static void __kmp_destroy_lock(kmp_lock_t *lck)
Definition: kmp_lock.h:575

__kmp_release_lock
static void __kmp_release_lock(kmp_lock_t *lck, kmp_int32 gtid)
Definition: kmp_lock.h:567

KMP_COMPARE_AND_STORE_REL64
#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv)
Definition: kmp_os.h:859

KMP_ATOMIC_ST_REL
#define KMP_ATOMIC_ST_REL(p, v)
Definition: kmp_os.h:1259

RCAST
#define RCAST(type, var)
Definition: kmp_os.h:291

KMP_XCHG_FIXED64
#define KMP_XCHG_FIXED64(p, v)
Definition: kmp_os.h:879

KMP_ATOMIC_LD_ACQ
#define KMP_ATOMIC_LD_ACQ(p)
Definition: kmp_os.h:1257

KMP_ATOMIC_ST_RLX
#define KMP_ATOMIC_ST_RLX(p, v)
Definition: kmp_os.h:1260

VOLATILE_CAST
#define VOLATILE_CAST(x)
Definition: kmp_os.h:1188

CCAST
#define CCAST(type, var)
Definition: kmp_os.h:290

KMP_MB
#define KMP_MB()
Definition: kmp_os.h:1064

TCR_4
#define TCR_4(a)
Definition: kmp_os.h:1135

KMP_ATOMIC_LD_RLX
#define KMP_ATOMIC_LD_RLX(p)
Definition: kmp_os.h:1258

kmp_stats.h
Functions for collecting statistics.

KMP_COUNT_VALUE
#define KMP_COUNT_VALUE(n, v)
Definition: kmp_stats.h:1000

KMP_PUSH_PARTITIONED_TIMER
#define KMP_PUSH_PARTITIONED_TIMER(name)
Definition: kmp_stats.h:1014

KMP_POP_PARTITIONED_TIMER
#define KMP_POP_PARTITIONED_TIMER()
Definition: kmp_stats.h:1015

KMP_COUNT_DEVELOPER_VALUE
#define KMP_COUNT_DEVELOPER_VALUE(n, v)
Definition: kmp_stats.h:1006

KMP_TIME_PARTITIONED_BLOCK
#define KMP_TIME_PARTITIONED_BLOCK(name)
Definition: kmp_stats.h:1013

KMP_COUNT_BLOCK
#define KMP_COUNT_BLOCK(n)
Definition: kmp_stats.h:1001

__kmp_str_format
char * __kmp_str_format(char const *format,...)
Definition: kmp_str.cpp:448

__kmp_str_free
void __kmp_str_free(char **str)
Definition: kmp_str.cpp:494

kmp_str.h

i
#define i
Definition: kmp_stub.cpp:87

ST
#define ST
Definition: kmp_taskloop_5.c:9

a
int a
Definition: llvm-issue-80664.c:20

check
Definition: check.py:1

kmp_int32
int32_t kmp_int32
Definition: omp__kmpc_fork_call_if.c:6

chunk_size
const int chunk_size
Definition: omp_for_schedule_dynamic.c:17

lck
omp_lock_t lck
Definition: omp_lock.c:7

r
int r
Definition: omp_task_red_taskloop.c:15

if
if(ret)
Definition: ompt-general.cpp:356

ompt_enabled
ompt_callbacks_active_t ompt_enabled
Definition: ompt-general.cpp:88

ompt_callbacks
ompt_callbacks_internal_t ompt_callbacks
Definition: ompt-general.cpp:102

OMPT_GET_RETURN_ADDRESS
#define OMPT_GET_RETURN_ADDRESS(level)
Definition: ompt-internal.h:112

__ompt_get_teaminfo
ompt_team_info_t * __ompt_get_teaminfo(int depth, int *size)
Definition: ompt-specific.cpp:51

__ompt_get_task_info_object
ompt_task_info_t * __ompt_get_task_info_object(int depth)
Definition: ompt-specific.cpp:103

ompt-specific.h

loc
static id loc
Definition: kmp_task_depend_all.c:168

checker
static int checker
Definition: kmp_task_depend_all.c:83

dispatch_private_info_template
Definition: kmp_dispatch.h:133

dispatch_private_info_template::type_size
kmp_uint32 type_size
Definition: kmp_dispatch.h:145

dispatch_private_info_template::steal_flag
std::atomic< kmp_uint32 > steal_flag
Definition: kmp_dispatch.h:142

dispatch_private_info_template::pushed_ws
enum cons_type pushed_ws
Definition: kmp_dispatch.h:153

dispatch_private_info_template::u
union KMP_ALIGN_CACHE dispatch_private_info_template::private_info_tmpl u

dispatch_private_info_template::ordered_bumped
kmp_uint32 ordered_bumped
Definition: kmp_dispatch.h:143

dispatch_private_info_template::flags
kmp_sched_flags_t flags
Definition: kmp_dispatch.h:141

dispatch_private_info_template::schedule
enum sched_type schedule
Definition: kmp_dispatch.h:140

dispatch_private_info
Definition: kmp.h:2018

dispatch_shared_infoXX_template::num_done
volatile ST num_done
Definition: kmp_dispatch.h:164

dispatch_shared_infoXX_template::iteration
volatile UT iteration
Definition: kmp_dispatch.h:163

dispatch_shared_infoXX_template::ordered_iteration
volatile UT ordered_iteration
Definition: kmp_dispatch.h:165

dispatch_shared_info_template
Definition: kmp_dispatch.h:171

dispatch_shared_info_template::u
union dispatch_shared_info_template::shared_info_tmpl u

dispatch_shared_info_template::buffer_index
volatile kmp_uint32 buffer_index
Definition: kmp_dispatch.h:178

dispatch_shared_info
Definition: kmp.h:2057

id
Definition: kmp_sch_simd_guided.c:34

ident_t
Definition: teams-no-par.c:23

kmp_hier_sched_env_t::size
int size
Definition: kmp_dispatch_hier.h:54

kmp_hier_sched_env_t::small_chunks
kmp_int32 * small_chunks
Definition: kmp_dispatch_hier.h:57

kmp_hier_sched_env_t::layers
kmp_hier_layer_e * layers
Definition: kmp_dispatch_hier.h:59

kmp_hier_sched_env_t::large_chunks
kmp_int64 * large_chunks
Definition: kmp_dispatch_hier.h:58

kmp_hier_sched_env_t::scheds
enum sched_type * scheds
Definition: kmp_dispatch_hier.h:56

kmp_sched_flags::ordered
unsigned ordered
Definition: kmp.h:1885

kmp_sched_flags::use_hier
unsigned use_hier
Definition: kmp.h:1888

kmp_sched_flags::use_hybrid
unsigned use_hybrid
Definition: kmp.h:1889

kmp_sched_flags::nomerge
unsigned nomerge
Definition: kmp.h:1886

ompt_task_info_t
Definition: ompt-internal.h:59

ompt_task_info_t::task_data
ompt_data_t task_data
Definition: ompt-internal.h:61

ompt_team_info_t
Definition: ompt-internal.h:67

ompt_team_info_t::parallel_data
ompt_data_t parallel_data
Definition: ompt-internal.h:68

dispatch_private_info_template::private_info_tmpl::p
dispatch_private_infoXX_template< T > p
Definition: kmp_dispatch.h:137

dispatch_shared_info_template::shared_info_tmpl::s
dispatch_shared_infoXX_template< UT > s
Definition: kmp_dispatch.h:175

kmp_team
Definition: kmp.h:3233

kmp_team::t
kmp_base_team_t t
Definition: kmp.h:3234

kmp_ticket_lock
Definition: kmp_lock.h:272