#ifndef KMP_DISPATCH_HIER_H
#define KMP_DISPATCH_HIER_H
// kmp_hier_sched_env_t::append() (fragment): on first use, allocate the
// per-layer arrays with room for every possible layer.
scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) *
                                           kmp_hier_layer_e::LAYER_LAST);
small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) *
                                           kmp_hier_layer_e::LAYER_LAST);
large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) *
                                           kmp_hier_layer_e::LAYER_LAST);
layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) *
                                            kmp_hier_layer_e::LAYER_LAST);
capacity = kmp_hier_layer_e::LAYER_LAST;
// ...
int current_size = size;
// ...
layers[current_size] = layer;
// kmp_hier_sched_env_t::sort() (fragment): selection sort by layer type,
// keeping the scheds and chunks arrays in step with the layers array.
for (int i = 0; i < size; ++i) {
  // ... find the minimum remaining layer, remembered in switch_index ...
  if (switch_index != i) {
    // ... swap entries i and switch_index in each array ...
    layers[switch_index] = temp1;
    scheds[switch_index] = temp2;
    // ...
  }
}
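// Illustrative sketch (not from the original source), assuming the usual
// kmp_hier_layer_e ordering (LAYER_THREAD < LAYER_L1 < LAYER_L2 < ...):
// appending layers out of order and then sorting yields finest-layer-first.
//
//   kmp_hier_sched_env_t env = {}; // hypothetical, zero-initialized
//   env.append(kmp_sch_static, 1, kmp_hier_layer_e::LAYER_L2);
//   env.append(kmp_sch_dynamic_chunked, 4, kmp_hier_layer_e::LAYER_L1);
//   env.sort(); // afterwards layers[0] == LAYER_L1, layers[1] == LAYER_L2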
// kmp_hier_shared_bdata_t<T> (fragments): double-buffered shared loop data;
// `index` selects the current buffer, `1 - index` the one being prepared.
typedef typename traits_t<T>::signed_t ST;
// ...
sh[0].u.s.iteration = sh[1].u.s.iteration = 0; // zero()
// ...
status[1 - index] = nstatus; // set_next_hand_thread()
// ...
status[1 - index] = nstatus; // set_next()
sh[1 - index].u.s.iteration = 0;
// ...
return &(sh[1 - index]); // get_next_sh()
// core_barrier_impl<T>: each of up to eight active threads owns one byte of
// a 64-bit word; the full-barrier value is one 0x01 byte per active thread.
static inline kmp_uint64 get_wait_val(int num_active) {
  kmp_uint64 wait_val = 0LL;
  switch (num_active) {
  // ... cases 1 and 2 elided ...
  case 3: wait_val = 0x010101LL; break;
  case 4: wait_val = 0x01010101LL; break;
  case 5: wait_val = 0x0101010101LL; break;
  case 6: wait_val = 0x010101010101LL; break;
  case 7: wait_val = 0x01010101010101LL; break;
  case 8: wait_val = 0x0101010101010101LL; break;
  // ...
  }
  return wait_val;
}
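// Illustrative sketch (not from the original source): the switch above
// hard-codes the pattern "one 0x01 byte per active thread"; for
// 1 <= num_active <= 8 it is equivalent to:
static inline kmp_uint64 get_wait_val_sketch(int num_active) {
  kmp_uint64 wait_val = 0LL;
  for (int i = 0; i < num_active; ++i)
    wait_val |= (kmp_uint64)0x01 << (8 * i); // thread i owns byte i
  return wait_val; // e.g. num_active == 3 yields 0x010101
}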
// core_barrier_impl<T> interface (fragments): reset_private(),
// reset_shared(), and barrier() are declared in the class and defined below.
// ...
template <typename T>
void core_barrier_impl<T>::reset_private(kmp_int32 num_active,
                                         kmp_hier_private_bdata_t *tdata) {
  tdata->num_active = num_active;
  // ...
  tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active);
}
// core_barrier_impl<T>::reset_shared() (fragment):
bdata->val[0] = bdata->val[1] = 0LL;
// core_barrier_impl<T>::barrier(): thread `id` writes its byte of the shared
// word, then spins until every active thread has written.
template <typename T>
void core_barrier_impl<T>::barrier(kmp_int32 id,
                                   kmp_hier_shared_bdata_t<T> *bdata,
                                   kmp_hier_private_bdata_t *tdata) {
  kmp_uint64 current_index = tdata->index;
  kmp_uint64 next_index = 1 - current_index;
  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
  kmp_uint64 next_wait_value =
      (current_wait_value ? 0 : get_wait_val(tdata->num_active));
  KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu "
                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
                __kmp_get_gtid(), current_index, next_index,
                current_wait_value, next_wait_value));
  char v = (current_wait_value ? '\1' : '\0');
  (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v;
  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
                         __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
  tdata->wait_val[current_index] = next_wait_value;
  tdata->index = next_index;
}
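// Worked example (not from the original source, assuming two active
// threads): the wait value on a given buffer alternates between the full
// pattern and zero, so the word never needs a separate reset step.
//   round 0, buffer 0: both threads write '\1'; spin until val[0] == 0x0101;
//                      wait_val[0] becomes 0 for the next use of buffer 0.
//   round 1, buffer 1: same as round 0, using wait_val[1] (still 0x0101).
//   round 2, buffer 0: both threads write '\0'; spin until val[0] == 0;
//                      wait_val[0] flips back to 0x0101.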
// counter_barrier_impl<T> (fragments): same interface as core_barrier_impl,
// but built on an atomically incremented counter instead of per-thread bytes.
// ...
template <typename T>
void counter_barrier_impl<T>::reset_private(kmp_int32 num_active,
                                            kmp_hier_private_bdata_t *tdata) {
  tdata->num_active = num_active;
  // ...
  tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active;
}
// counter_barrier_impl<T>::reset_shared() (fragment):
bdata->val[0] = bdata->val[1] = 0LL;
// counter_barrier_impl<T>::barrier(): each thread atomically increments the
// shared counter, then spins until it reaches this round's target value.
template <typename T>
void counter_barrier_impl<T>::barrier(kmp_int32 id,
                                      kmp_hier_shared_bdata_t<T> *bdata,
                                      kmp_hier_private_bdata_t *tdata) {
  kmp_uint64 current_index = tdata->index;
  kmp_uint64 next_index = 1 - current_index;
  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
  kmp_uint64 next_wait_value = current_wait_value + tdata->num_active;
  KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu "
                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
                __kmp_get_gtid(), current_index, next_index,
                current_wait_value, next_wait_value));
  KMP_TEST_THEN_INC64(RCAST(volatile kmp_int64 *, &(bdata->val[current_index])));
  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
                         __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
  tdata->wait_val[current_index] = next_wait_value;
  tdata->index = next_index;
}
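// Worked example (not from the original source, assuming four active
// threads): the counter is never reset, so each buffer's target simply
// advances by num_active on every use of that buffer: buffer 0 waits for
// 4, then 8, then 12, ... while buffer 1 independently does the same, the
// two buffers alternating round by round.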
// kmp_hier_top_unit_t<T> (fragments): one node of the scheduling hierarchy.
typedef typename traits_t<T>::signed_t ST;
typedef typename traits_t<T>::unsigned_t UT;
// ...
tdata->index = 1 - tdata->index; // flip to the other double-buffer half
// kmp_hier_top_unit_t<T>::print() (fragment): trace line
//   " kmp_hier_top_unit_t: active:%d pr:%p lb:%d ub:%d st:%d tc:%d\n"

// kmp_hier_layer_info_t<T> (fragment): per-layer schedule description.
typename traits_t<T>::signed_t chunk;
// kmp_hier_layer_info_t<T>::print() (fragment): trace line
//   " kmp_hier_layer_info_t: num_active:%d type:%s sched:%d chunk:%d " ...
// kmp_hier_t<T> (fragments):
typedef typename traits_t<T>::unsigned_t UT;
typedef typename traits_t<T>::signed_t ST;

// next_recurse() (fragments): called when a unit runs out of iterations at
// one level; the unit's primary thread climbs to the parent level for more.
// ...
kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]);
// ...
KD_TRACE(1,
         ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level));
// Only the primary thread of this unit does the real work; the rest wait at
// the barrier below.
if (previous_id == 0) {
  KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is primary of unit\n",
               gtid, hier_level));
  // ...
  if (last_layer) {
    // The last layer reads from the team-wide dispatch buffer.
    KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n",
                  gtid, hier_level));
    my_sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    // ...
  } else {
    // Middle layers read from the parent unit's current shared buffer.
    KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n",
                  gtid, hier_level));
    my_sh =
        parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index);
    nproc = (T)parent->get_num_active();
  }
  status = __kmp_dispatch_next_algorithm<T>(gtid, my_pr, my_sh,
                                            &contains_last, &my_lb, &my_ub,
                                            &my_st, nproc, hier_id);
  KD_TRACE(10,
           ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n",
            gtid, hier_level, status));
  // Out of iterations at this level: recurse into the next level up.
  if (status == 0 && !last_layer) {
    status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub,
                          &my_st, hid, hier_level + 1);
    KD_TRACE(10,
             ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n",
              gtid, hier_level, status));
    if (status == 1) {
      kmp_hier_private_bdata_t *upper_tdata =
          &(th->th.th_hier_bar_data[hier_level + 1]);
      my_sh = parent->get_curr_sh(upper_tdata->index);
      KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n",
                    gtid, hier_level));
      // Re-seed this level's algorithm with the parent's new bounds.
      __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule,
                                    parent->get_curr_lb(upper_tdata->index),
                                    parent->get_curr_ub(upper_tdata->index),
                                    parent->get_curr_st(upper_tdata->index),
                                    // ...
                                    chunk, nproc, hier_id);
      status = __kmp_dispatch_next_algorithm<T>(
          gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc,
          hier_id);
      if (!status) {
        KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 "
                      "setting to 2!\n",
                      gtid, hier_level));
        status = 2;
      }
    }
  }
  // Propagate the contains-last flag only where it is still accurate.
  if (last_layer || parent->hier_pr.flags.contains_last) {
    KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr "
                  "to contain last.\n",
                  gtid, hier_level));
    current->hier_pr.flags.contains_last = contains_last;
  }
  if (!current->hier_pr.flags.contains_last)
    contains_last = FALSE;
}
*p_last = contains_last;
// ...
KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n",
              gtid, hier_level));
current->barrier(previous_id, tdata);
// ...
KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n",
              gtid, hier_level, status));
// kmp_hier_t<T>::need_to_reallocate(): true when the requested hierarchy
// differs from the one already allocated.
bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers,
                        const enum sched_type *new_scheds,
                        const ST *new_chunks) const {
  // ...
  for (int i = 0; i < n; ++i) {
    // ... also compares info[i].type and info[i].sched ...
    if (info[i].chunk != new_chunks[i])
      return true;
  }
  return false;
}
// kmp_hier_t<T>::allocate_hier() (fragments):
void allocate_hier(int n, const kmp_hier_layer_e *new_layers,
                   const enum sched_type *new_scheds, const ST *new_chunks) {
  // ...
  if (!need_to_reallocate(n, new_layers, new_scheds, new_chunks)) {
    KD_TRACE(
        10,
        ("kmp_hier_t<T>::allocate_hier: T#0 do not need to reallocate\n"));
    for (int i = 0; i < n; ++i) {
      info[i].num_active = 0;
      // ...
    }
    return;
  }
  KD_TRACE(10, ("kmp_hier_t<T>::allocate_hier: T#0 full alloc\n"));
  // ...
  for (int i = 0; i < n; ++i) {
    // ...
    kmp_hier_layer_e layer = new_layers[i];
    info[i].num_active = 0;
    info[i].type = layer;
    info[i].sched = new_scheds[i];
    info[i].chunk = new_chunks[i];
    // ... max = number of units at this layer ...
    info[i].length = max;
    // ...
    for (int j = 0; j < max; ++j) {
      // ... initialize each unit of layer i ...
    }
  }
  // ...
}
// kmp_hier_t<T>::next() (fragments): the thread-level entry point; grabs the
// next chunk for this thread, refilling from the hierarchy when empty.
kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]);
auto parent = pr->get_parent();
// ...
T nproc = (T)parent->get_num_active();
T unit_id = (T)pr->get_hier_id();
KD_TRACE(10,
         ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%d unit_id:%d called\n",
          gtid, nproc, unit_id));
// Hand-threading path: only the unit's primary thread runs the scheduling
// algorithm; worker threads pick up its result after the barrier.
if (__kmp_dispatch_hand_threading) {
  KD_TRACE(10,
           ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n",
            gtid));
  if (unit_id == 0) {
    // The lowest-level sh buffer is only ever touched by the primary thread.
    auto sh = &(parent->hier_barrier.sh[0]);
    // ...
    status = __kmp_dispatch_next_algorithm<T>(
        gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
    // ... if status == 0, refill from the hierarchy via next_recurse(); on
    // success, re-seed this level and try again:
    __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
                                  parent->get_next_lb(tdata->index),
                                  parent->get_next_ub(tdata->index),
                                  parent->get_next_st(tdata->index),
                                  // ...
                                  pr->u.p.parm1, nproc, unit_id);
    sh->u.s.iteration = 0;
    status = __kmp_dispatch_next_algorithm<T>(
        gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
    if (!status) {
      KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
                    "trying again.\n",
                    gtid));
      // ...
    }
    // ...
    KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
                  "trying again.\n",
                  gtid));
    // ...
    parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index);
  }
  parent->barrier(pr->get_hier_id(), tdata);
  if (unit_id != 0) {
    *p_lb = parent->get_curr_lb(tdata->index);
    *p_ub = parent->get_curr_ub(tdata->index);
    *p_st = parent->get_curr_st(tdata->index);
    // ...
  }
} else {
  // Normal path: every thread runs the algorithm on the parent unit's
  // current shared buffer.
  auto sh = parent->get_curr_sh(tdata->index);
  // ...
  status = __kmp_dispatch_next_algorithm<T>(
      gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
  KD_TRACE(10,
           ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d "
            "contains_last:%d p_lb:%d p_ub:%d p_st:%d\n",
            gtid, status, contains_last, *p_lb, *p_ub, *p_st));
  // ... if status == 0, refill from the hierarchy via next_recurse():
  sh = parent->get_curr_sh(tdata->index);
  __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
                                parent->get_curr_lb(tdata->index),
                                parent->get_curr_ub(tdata->index),
                                parent->get_curr_st(tdata->index),
                                // ...
                                pr->u.p.parm1, nproc, unit_id);
  status = __kmp_dispatch_next_algorithm<T>(
      gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
  if (!status) {
    KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
                  "trying again.\n",
                  gtid));
    // ...
  }
  // ...
  KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
                "trying again.\n",
                gtid));
  // ...
  if (contains_last && !parent->hier_pr.flags.contains_last) {
    KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting "
                  "contains_last to FALSE\n",
                  gtid));
    contains_last = FALSE;
  }
}
*p_last = contains_last;
KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid,
              status));
// (fragment) iterate over every unit of layer i:
for (int j = 0; j < info[i].length; ++j) {
// __kmp_dispatch_init_hierarchy() (fragments): set up the scheduling
// hierarchy described by (new_layers, new_scheds, new_chunks) for the loop
// over [lb, ub] with stride st.
template <typename T>
void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
                                   kmp_hier_layer_e *new_layers,
                                   enum sched_type *new_scheds,
                                   typename traits_t<T>::signed_t *new_chunks,
                                   T lb, T ub,
                                   typename traits_t<T>::signed_t st) {
  int tid, gtid, num_hw_threads, num_threads_per_layer1, active;
  unsigned int my_buffer_index;
  // ...
  KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n",
                gtid, n));
  for (int i = 0; i < n; ++i) {
    const char *layer = __kmp_get_hier_str(new_layers[i]);
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, "
                  "new_scheds[%d] = %d, new_chunks[%d] = %u\n",
                  gtid, i, layer, i, (int)new_scheds[i], i, new_chunks[i]));
  }
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;
  // ...
  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  // Serialized regions fall back to the non-hierarchical dispatch.
  if (!active) {
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. "
                  "Using normal dispatch functions.\n",
                  gtid));
    // ...
  }
  KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating "
                "hierarchy\n",
                gtid, pr, sh));
  if (sh->hier == NULL) {
    // ...
    sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks);
    // ...
  }
  // ...
  if (!sh->hier->is_valid()) {
    // ...
  }
  // Threads allocate their private barrier data on first use.
  if (th->th.th_hier_bar_data == NULL) {
    th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate(
        sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST);
  }
  // Register this thread with each hierarchy unit it belongs to, bumping the
  // unit's active count (which serves as nthreads for that level).
  for (int i = 0; i < n; ++i) {
    // ... my_unit = the unit at layer i that contains this thread ...
    if (i == 0)
      pr->hier_parent = my_unit;
    // ...
    if (my_unit->is_active()) {
      KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
                    "is already active (%d)\n",
                    gtid, my_unit, my_unit->active));
      // ...
    }
    // ...
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
                  "incrementing num_active\n",
                  gtid, my_unit));
    // ...
  // The thread's id within its lowest-layer unit.
  num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2(
      kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0));
  pr->hier_id = tid % num_threads_per_layer1;
  // Oversubscribed threads (tid >= number of hardware threads) get shifted
  // ids so they stay unique within their unit.
  if (tid >= num_hw_threads)
    pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1);
  KD_TRACE(
      10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n",
           gtid, pr->hier_id));
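  // Worked example (hypothetical numbers, not from the original source): with
  // 8 hardware threads and 2 threads per layer-1 unit, tid 9 is
  // oversubscribed: 9 % 2 == 1, plus (9 / 8) * 2 == 2, so hier_id == 3, which
  // does not collide with ids 0..1 used by the unit's hardware threads.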
  // Primary threads initialize each unit's shared barrier data and, on the
  // last layer, seed the unit's loop information.
  int prev_id = pr->get_hier_id();
  for (int i = 0; i < n; ++i) {
    if (prev_id != 0)
      break;
    // ...
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n",
                  gtid, i));
    // ... on the last layer, initialize with the entire loop bounds:
    __kmp_dispatch_init_algorithm<T>(
        loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st,
        // ...
        hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id());
    // ...
  }
  // Initialize this thread's private barrier data for every layer above it.
  kmp_hier_top_unit_t<T> *unit = pr->hier_parent;
  for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) {
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]);
    unit->reset_private_barrier(tdata);
  }
  // Debug: report the final active count at each layer.
  for (int i = 0; i < n; ++i) {
    KD_TRACE(10,
             ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n",
              gtid, i, hier->get_num_active(i)));
  }
  // ...
}

#endif // KMP_DISPATCH_HIER_H