#ifndef KMP_DISPATCH_HIER_H
#define KMP_DISPATCH_HIER_H
// kmp_hier_sched_env_t::append() (fragment): on first use, allocate the
// per-layer arrays with room for every possible layer.
scheds = (enum sched_type *)__kmp_allocate(sizeof(enum sched_type) *
                                           kmp_hier_layer_e::LAYER_LAST);
small_chunks = (kmp_int32 *)__kmp_allocate(sizeof(kmp_int32) *
                                           kmp_hier_layer_e::LAYER_LAST);
large_chunks = (kmp_int64 *)__kmp_allocate(sizeof(kmp_int64) *
                                           kmp_hier_layer_e::LAYER_LAST);
layers = (kmp_hier_layer_e *)__kmp_allocate(sizeof(kmp_hier_layer_e) *
                                            kmp_hier_layer_e::LAYER_LAST);
capacity = kmp_hier_layer_e::LAYER_LAST;
// ...
int current_size = size;
// ...
layers[current_size] = layer;
// kmp_hier_sched_env_t::sort() (fragment): selection sort by layer type,
// keeping the scheds and chunks arrays in step with the layers array.
for (int i = 0; i < size; ++i) {
  // ... find the minimum remaining layer, remembered in switch_index ...
  if (switch_index != i) {
    // ... swap entries i and switch_index in each array ...
    layers[switch_index] = temp1;
    scheds[switch_index] = temp2;
    // ...
  }
}
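// Illustrative sketch (not from the original source), assuming the usual
// kmp_hier_layer_e ordering (LAYER_THREAD < LAYER_L1 < LAYER_L2 < ...):
// appending layers out of order and then sorting yields finest-layer-first.
//
//   kmp_hier_sched_env_t env = {}; // hypothetical, zero-initialized
//   env.append(kmp_sch_static, 1, kmp_hier_layer_e::LAYER_L2);
//   env.append(kmp_sch_dynamic_chunked, 4, kmp_hier_layer_e::LAYER_L1);
//   env.sort(); // afterwards layers[0] == LAYER_L1, layers[1] == LAYER_L2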
// kmp_hier_shared_bdata_t<T> (fragments): double-buffered shared loop data;
// `index` selects the current buffer, `1 - index` the one being prepared.
typedef typename traits_t<T>::signed_t ST;
// ...
sh[0].u.s.iteration = sh[1].u.s.iteration = 0; // zero()
// ...
status[1 - index] = nstatus; // set_next_hand_thread()
// ...
status[1 - index] = nstatus; // set_next()
sh[1 - index].u.s.iteration = 0;
// ...
return &(sh[1 - index]); // get_next_sh()
// core_barrier_impl<T>: each of up to eight active threads owns one byte of
// a 64-bit word; the full-barrier value is one 0x01 byte per active thread.
static inline kmp_uint64 get_wait_val(int num_active) {
  kmp_uint64 wait_val = 0LL;
  switch (num_active) {
  // ... cases 1 and 2 elided ...
  case 3: wait_val = 0x010101LL; break;
  case 4: wait_val = 0x01010101LL; break;
  case 5: wait_val = 0x0101010101LL; break;
  case 6: wait_val = 0x010101010101LL; break;
  case 7: wait_val = 0x01010101010101LL; break;
  case 8: wait_val = 0x0101010101010101LL; break;
  // ...
  }
  return wait_val;
}
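// Illustrative sketch (not from the original source): the switch above
// hard-codes the pattern "one 0x01 byte per active thread"; for
// 1 <= num_active <= 8 it is equivalent to:
static inline kmp_uint64 get_wait_val_sketch(int num_active) {
  kmp_uint64 wait_val = 0LL;
  for (int i = 0; i < num_active; ++i)
    wait_val |= (kmp_uint64)0x01 << (8 * i); // thread i owns byte i
  return wait_val; // e.g. num_active == 3 yields 0x010101
}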
// core_barrier_impl<T> interface (fragments): reset_private(),
// reset_shared(), and barrier() are declared in the class and defined below.
// ...
template <typename T>
void core_barrier_impl<T>::reset_private(kmp_int32 num_active,
                                         kmp_hier_private_bdata_t *tdata) {
  tdata->num_active = num_active;
  // ...
  tdata->wait_val[0] = tdata->wait_val[1] = get_wait_val(num_active);
}
// core_barrier_impl<T>::reset_shared() (fragment):
bdata->val[0] = bdata->val[1] = 0LL;
// core_barrier_impl<T>::barrier(): thread `id` writes its byte of the shared
// word, then spins until every active thread has written.
template <typename T>
void core_barrier_impl<T>::barrier(kmp_int32 id,
                                   kmp_hier_shared_bdata_t<T> *bdata,
                                   kmp_hier_private_bdata_t *tdata) {
  kmp_uint64 current_index = tdata->index;
  kmp_uint64 next_index = 1 - current_index;
  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
  kmp_uint64 next_wait_value =
      (current_wait_value ? 0 : get_wait_val(tdata->num_active));
  KD_TRACE(10, ("core_barrier_impl::barrier(): T#%d current_index:%llu "
                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
                __kmp_get_gtid(), current_index, next_index,
                current_wait_value, next_wait_value));
  char v = (current_wait_value ? '\1' : '\0');
  (RCAST(volatile char *, &(bdata->val[current_index])))[id] = v;
  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
                         __kmp_eq<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
  tdata->wait_val[current_index] = next_wait_value;
  tdata->index = next_index;
}
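// Worked example (not from the original source, assuming two active
// threads): the wait value on a given buffer alternates between the full
// pattern and zero, so the word never needs a separate reset step.
//   round 0, buffer 0: both threads write '\1'; spin until val[0] == 0x0101;
//                      wait_val[0] becomes 0 for the next use of buffer 0.
//   round 1, buffer 1: same as round 0, using wait_val[1] (still 0x0101).
//   round 2, buffer 0: both threads write '\0'; spin until val[0] == 0;
//                      wait_val[0] flips back to 0x0101.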
// counter_barrier_impl<T> (fragments): same interface as core_barrier_impl,
// but built on an atomically incremented counter instead of per-thread bytes.
// ...
template <typename T>
void counter_barrier_impl<T>::reset_private(kmp_int32 num_active,
                                            kmp_hier_private_bdata_t *tdata) {
  tdata->num_active = num_active;
  // ...
  tdata->wait_val[0] = tdata->wait_val[1] = (kmp_uint64)num_active;
}
// counter_barrier_impl<T>::reset_shared() (fragment):
bdata->val[0] = bdata->val[1] = 0LL;
// counter_barrier_impl<T>::barrier(): each thread atomically increments the
// shared counter, then spins until it reaches this round's target value.
template <typename T>
void counter_barrier_impl<T>::barrier(kmp_int32 id,
                                      kmp_hier_shared_bdata_t<T> *bdata,
                                      kmp_hier_private_bdata_t *tdata) {
  kmp_uint64 current_index = tdata->index;
  kmp_uint64 next_index = 1 - current_index;
  kmp_uint64 current_wait_value = tdata->wait_val[current_index];
  kmp_uint64 next_wait_value = current_wait_value + tdata->num_active;
  KD_TRACE(10, ("counter_barrier_impl::barrier(): T#%d current_index:%llu "
                "next_index:%llu curr_wait:%llu next_wait:%llu\n",
                __kmp_get_gtid(), current_index, next_index,
                current_wait_value, next_wait_value));
  KMP_TEST_THEN_INC64(RCAST(volatile kmp_int64 *, &(bdata->val[current_index])));
  __kmp_wait<kmp_uint64>(&(bdata->val[current_index]), current_wait_value,
                         __kmp_ge<kmp_uint64> USE_ITT_BUILD_ARG(NULL));
  tdata->wait_val[current_index] = next_wait_value;
  tdata->index = next_index;
}
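// Worked example (not from the original source, assuming four active
// threads): the counter is never reset, so each buffer's target simply
// advances by num_active on every use of that buffer: buffer 0 waits for
// 4, then 8, then 12, ... while buffer 1 independently does the same, the
// two buffers alternating round by round.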
// kmp_hier_top_unit_t<T> (fragments): one node of the scheduling hierarchy.
typedef typename traits_t<T>::signed_t ST;
typedef typename traits_t<T>::unsigned_t UT;
// ...
tdata->index = 1 - tdata->index; // flip to the other double-buffer half
// kmp_hier_top_unit_t<T>::print() (fragment): trace line
//   " kmp_hier_top_unit_t: active:%d pr:%p lb:%d ub:%d st:%d tc:%d\n"

// kmp_hier_layer_info_t<T> (fragment): per-layer schedule description.
typename traits_t<T>::signed_t chunk;
// kmp_hier_layer_info_t<T>::print() (fragment): trace line
//   " kmp_hier_layer_info_t: num_active:%d type:%s sched:%d chunk:%d " ...
// kmp_hier_t<T> (fragments):
typedef typename traits_t<T>::unsigned_t UT;
typedef typename traits_t<T>::signed_t ST;

// next_recurse() (fragments): called when a unit runs out of iterations at
// one level; the unit's primary thread climbs to the parent level for more.
// ...
kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[hier_level]);
// ...
KD_TRACE(1,
         ("kmp_hier_t.next_recurse(): T#%d (%d) called\n", gtid, hier_level));
// Only the primary thread of this unit does the real work; the rest wait at
// the barrier below.
if (previous_id == 0) {
  KD_TRACE(1, ("kmp_hier_t.next_recurse(): T#%d (%d) is primary of unit\n",
               gtid, hier_level));
  // ...
  if (last_layer) {
    // The last layer reads from the team-wide dispatch buffer.
    KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using top level sh\n",
                  gtid, hier_level));
    my_sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    // ...
  } else {
    // Middle layers read from the parent unit's current shared buffer.
    KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) using hier sh\n",
                  gtid, hier_level));
    my_sh =
        parent->get_curr_sh(th->th.th_hier_bar_data[hier_level + 1].index);
    nproc = (T)parent->get_num_active();
  }
  status = __kmp_dispatch_next_algorithm<T>(gtid, my_pr, my_sh,
                                            &contains_last, &my_lb, &my_ub,
                                            &my_st, nproc, hier_id);
  KD_TRACE(10,
           ("kmp_hier_t.next_recurse(): T#%d (%d) next_pr_sh() returned %d\n",
            gtid, hier_level, status));
  // Out of iterations at this level: recurse into the next level up.
  if (status == 0 && !last_layer) {
    status = next_recurse(loc, gtid, parent, &contains_last, &my_lb, &my_ub,
                          &my_st, hid, hier_level + 1);
    KD_TRACE(10,
             ("kmp_hier_t.next_recurse(): T#%d (%d) hier_next() returned %d\n",
              gtid, hier_level, status));
    if (status == 1) {
      kmp_hier_private_bdata_t *upper_tdata =
          &(th->th.th_hier_bar_data[hier_level + 1]);
      my_sh = parent->get_curr_sh(upper_tdata->index);
      KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) about to init\n",
                    gtid, hier_level));
      // Re-seed this level's algorithm with the parent's new bounds.
      __kmp_dispatch_init_algorithm(loc, gtid, my_pr, schedule,
                                    parent->get_curr_lb(upper_tdata->index),
                                    parent->get_curr_ub(upper_tdata->index),
                                    parent->get_curr_st(upper_tdata->index),
                                    // ...
                                    chunk, nproc, hier_id);
      status = __kmp_dispatch_next_algorithm<T>(
          gtid, my_pr, my_sh, &contains_last, &my_lb, &my_ub, &my_st, nproc,
          hier_id);
      if (!status) {
        KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) status not 1 "
                      "setting to 2!\n",
                      gtid, hier_level));
        status = 2;
      }
    }
  }
  // Propagate the contains-last flag only where it is still accurate.
  if (last_layer || parent->hier_pr.flags.contains_last) {
    KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) Setting this pr "
                  "to contain last.\n",
                  gtid, hier_level));
    current->hier_pr.flags.contains_last = contains_last;
  }
  if (!current->hier_pr.flags.contains_last)
    contains_last = FALSE;
}
*p_last = contains_last;
// ...
KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) going into barrier.\n",
              gtid, hier_level));
current->barrier(previous_id, tdata);
// ...
KD_TRACE(10, ("kmp_hier_t.next_recurse(): T#%d (%d) released and exit %d\n",
              gtid, hier_level, status));
// kmp_hier_t<T>::need_to_reallocate(): true when the requested hierarchy
// differs from the one already allocated.
bool need_to_reallocate(int n, const kmp_hier_layer_e *new_layers,
                        const enum sched_type *new_scheds,
                        const ST *new_chunks) const {
  // ...
  for (int i = 0; i < n; ++i) {
    // ... also compares info[i].type and info[i].sched ...
    if (info[i].chunk != new_chunks[i])
      return true;
  }
  return false;
}
// kmp_hier_t<T>::allocate_hier() (fragments):
void allocate_hier(int n, const kmp_hier_layer_e *new_layers,
                   const enum sched_type *new_scheds, const ST *new_chunks) {
  // ...
  if (!need_to_reallocate(n, new_layers, new_scheds, new_chunks)) {
    KD_TRACE(
        10,
        ("kmp_hier_t<T>::allocate_hier: T#0 do not need to reallocate\n"));
    for (int i = 0; i < n; ++i) {
      info[i].num_active = 0;
      // ...
    }
    return;
  }
  KD_TRACE(10, ("kmp_hier_t<T>::allocate_hier: T#0 full alloc\n"));
  // ...
  for (int i = 0; i < n; ++i) {
    // ...
    kmp_hier_layer_e layer = new_layers[i];
    info[i].num_active = 0;
    info[i].type = layer;
    info[i].sched = new_scheds[i];
    info[i].chunk = new_chunks[i];
    // ... max = number of units at this layer ...
    info[i].length = max;
    // ...
    for (int j = 0; j < max; ++j) {
      // ... initialize each unit of layer i ...
    }
  }
  // ...
}
// kmp_hier_t<T>::next() (fragments): the thread-level entry point; grabs the
// next chunk for this thread, refilling from the hierarchy when empty.
kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[0]);
auto parent = pr->get_parent();
// ...
T nproc = (T)parent->get_num_active();
T unit_id = (T)pr->get_hier_id();
KD_TRACE(10,
         ("kmp_hier_t.next(): T#%d THREAD LEVEL nproc:%d unit_id:%d called\n",
          gtid, nproc, unit_id));
// Hand-threading path: only the unit's primary thread runs the scheduling
// algorithm; worker threads pick up its result after the barrier.
if (__kmp_dispatch_hand_threading) {
  KD_TRACE(10,
           ("kmp_hier_t.next(): T#%d THREAD LEVEL using hand threading\n",
            gtid));
  if (unit_id == 0) {
    // The lowest-level sh buffer is only ever touched by the primary thread.
    auto sh = &(parent->hier_barrier.sh[0]);
    // ...
    status = __kmp_dispatch_next_algorithm<T>(
        gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
    // ... if status == 0, refill from the hierarchy via next_recurse(); on
    // success, re-seed this level and try again:
    __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
                                  parent->get_next_lb(tdata->index),
                                  parent->get_next_ub(tdata->index),
                                  parent->get_next_st(tdata->index),
                                  // ...
                                  pr->u.p.parm1, nproc, unit_id);
    sh->u.s.iteration = 0;
    status = __kmp_dispatch_next_algorithm<T>(
        gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
    if (!status) {
      KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
                    "trying again.\n",
                    gtid));
      // ...
    }
    // ...
    KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
                  "trying again.\n",
                  gtid));
    // ...
    parent->set_next_hand_thread(*p_lb, *p_ub, *p_st, status, tdata->index);
  }
  parent->barrier(pr->get_hier_id(), tdata);
  if (unit_id != 0) {
    *p_lb = parent->get_curr_lb(tdata->index);
    *p_ub = parent->get_curr_ub(tdata->index);
    *p_st = parent->get_curr_st(tdata->index);
    // ...
  }
} else {
  // Normal path: every thread runs the algorithm on the parent unit's
  // current shared buffer.
  auto sh = parent->get_curr_sh(tdata->index);
  // ...
  status = __kmp_dispatch_next_algorithm<T>(
      gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
  KD_TRACE(10,
           ("kmp_hier_t.next(): T#%d THREAD LEVEL next_algorithm status:%d "
            "contains_last:%d p_lb:%d p_ub:%d p_st:%d\n",
            gtid, status, contains_last, *p_lb, *p_ub, *p_st));
  // ... if status == 0, refill from the hierarchy via next_recurse():
  sh = parent->get_curr_sh(tdata->index);
  __kmp_dispatch_init_algorithm(loc, gtid, pr, pr->schedule,
                                parent->get_curr_lb(tdata->index),
                                parent->get_curr_ub(tdata->index),
                                parent->get_curr_st(tdata->index),
                                // ...
                                pr->u.p.parm1, nproc, unit_id);
  status = __kmp_dispatch_next_algorithm<T>(
      gtid, pr, sh, &contains_last, p_lb, p_ub, p_st, nproc, unit_id);
  if (!status) {
    KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 0 "
                  "trying again.\n",
                  gtid));
    // ...
  }
  // ...
  KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL status == 2 "
                "trying again.\n",
                gtid));
  // ...
  if (contains_last && !parent->hier_pr.flags.contains_last) {
    KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL resetting "
                  "contains_last to FALSE\n",
                  gtid));
    contains_last = FALSE;
  }
}
*p_last = contains_last;
KD_TRACE(10, ("kmp_hier_t.next(): T#%d THREAD LEVEL exit status %d\n", gtid,
              status));
// (fragment) iterate over every unit of layer i:
for (int j = 0; j < info[i].length; ++j) {
// __kmp_dispatch_init_hierarchy() (fragments): set up the scheduling
// hierarchy described by (new_layers, new_scheds, new_chunks) for the loop
// over [lb, ub] with stride st.
template <typename T>
void __kmp_dispatch_init_hierarchy(ident_t *loc, int n,
                                   kmp_hier_layer_e *new_layers,
                                   enum sched_type *new_scheds,
                                   typename traits_t<T>::signed_t *new_chunks,
                                   T lb, T ub,
                                   typename traits_t<T>::signed_t st) {
  int tid, gtid, num_hw_threads, num_threads_per_layer1, active;
  unsigned int my_buffer_index;
  // ...
  KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d called: %d layer(s)\n",
                gtid, n));
  for (int i = 0; i < n; ++i) {
    const char *layer = __kmp_get_hier_str(new_layers[i]);
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d: new_layers[%d] = %s, "
                  "new_scheds[%d] = %d, new_chunks[%d] = %u\n",
                  gtid, i, layer, i, (int)new_scheds[i], i, new_chunks[i]));
  }
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;
  // ...
  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                   &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
  my_buffer_index = th->th.th_dispatch->th_disp_index;
  // Serialized regions fall back to the non-hierarchical dispatch.
  if (!active) {
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d not active parallel. "
                  "Using normal dispatch functions.\n",
                  gtid));
    // ...
  }
  KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d pr:%p sh:%p allocating "
                "hierarchy\n",
                gtid, pr, sh));
  if (sh->hier == NULL) {
    // ...
    sh->hier->allocate_hier(n, new_layers, new_scheds, new_chunks);
    // ...
  }
  // ...
  if (!sh->hier->is_valid()) {
    // ...
  }
  // Threads allocate their private barrier data on first use.
  if (th->th.th_hier_bar_data == NULL) {
    th->th.th_hier_bar_data = (kmp_hier_private_bdata_t *)__kmp_allocate(
        sizeof(kmp_hier_private_bdata_t) * kmp_hier_layer_e::LAYER_LAST);
  }
  // Register this thread with each hierarchy unit it belongs to, bumping the
  // unit's active count (which serves as nthreads for that level).
  for (int i = 0; i < n; ++i) {
    // ... my_unit = the unit at layer i that contains this thread ...
    if (i == 0)
      pr->hier_parent = my_unit;
    // ...
    if (my_unit->is_active()) {
      KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
                    "is already active (%d)\n",
                    gtid, my_unit, my_unit->active));
      // ...
    }
    // ...
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d my_unit (%p) "
                  "incrementing num_active\n",
                  gtid, my_unit));
    // ...
  // The thread's id within its lowest-layer unit.
  num_threads_per_layer1 = __kmp_dispatch_get_t1_per_t2(
      kmp_hier_layer_e::LAYER_THREAD, hier->get_type(0));
  pr->hier_id = tid % num_threads_per_layer1;
  // Oversubscribed threads (tid >= number of hardware threads) get shifted
  // ids so they stay unique within their unit.
  if (tid >= num_hw_threads)
    pr->hier_id += ((tid / num_hw_threads) * num_threads_per_layer1);
  KD_TRACE(
      10, ("__kmp_dispatch_init_hierarchy: T#%d setting lowest hier_id to %d\n",
           gtid, pr->hier_id));
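  // Worked example (hypothetical numbers, not from the original source): with
  // 8 hardware threads and 2 threads per layer-1 unit, tid 9 is
  // oversubscribed: 9 % 2 == 1, plus (9 / 8) * 2 == 2, so hier_id == 3, which
  // does not collide with ids 0..1 used by the unit's hardware threads.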
  // Primary threads initialize each unit's shared barrier data and, on the
  // last layer, seed the unit's loop information.
  int prev_id = pr->get_hier_id();
  for (int i = 0; i < n; ++i) {
    if (prev_id != 0)
      break;
    // ...
    KD_TRACE(10, ("__kmp_dispatch_init_hierarchy: T#%d (%d) prev_id is 0\n",
                  gtid, i));
    // ... on the last layer, initialize with the entire loop bounds:
    __kmp_dispatch_init_algorithm<T>(
        loc, gtid, my_unit->get_my_pr(), hier->get_sched(i), lb, ub, st,
        // ...
        hier->get_chunk(i), hier->get_num_active(i), my_unit->get_hier_id());
    // ...
  }
  // Initialize this thread's private barrier data for every layer above it.
  kmp_hier_top_unit_t<T> *unit = pr->hier_parent;
  for (int i = 0; i < n && unit; ++i, unit = unit->get_parent()) {
    kmp_hier_private_bdata_t *tdata = &(th->th.th_hier_bar_data[i]);
    unit->reset_private_barrier(tdata);
  }
  // Debug: report the final active count at each layer.
  for (int i = 0; i < n; ++i) {
    KD_TRACE(10,
             ("__kmp_dispatch_init_hierarchy: T#%d active count[%d] = %d\n",
              gtid, i, hier->get_num_active(i)));
  }
  // ...
}

#endif // KMP_DISPATCH_HIER_H