1/*
2 * kmp_barrier.cpp
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp_wait_release.h"
14#include "kmp_barrier.h"
15#include "kmp_itt.h"
16#include "kmp_os.h"
17#include "kmp_stats.h"
18#include "ompt-specific.h"
19// for distributed barrier
20#include "kmp_affinity.h"
21
22#if KMP_MIC
23#include <immintrin.h>
24#define USE_NGO_STORES 1
25#endif // KMP_MIC
26
27#if KMP_MIC && USE_NGO_STORES
28// ICV copying
29#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33#else
34#define ngo_load(src) ((void)0)
35#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37#define ngo_sync() ((void)0)
38#endif /* KMP_MIC && USE_NGO_STORES */
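// NOTE (editorial, not in the upstream file): on KMP_MIC the ngo_* macros
// above copy a whole cache line of ICVs with 512-bit non-globally-ordered,
// no-read-hint stores (_mm512_storenrngo_pd), and ngo_sync() supplies the
// ordering fence those stores lack; on every other target they fall back to
// an ordinary copy_icvs()/KMP_MEMCPY and a no-op sync.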
39
40void __kmp_print_structure(void); // Forward declaration
41
42// ---------------------------- Barrier Algorithms ----------------------------
43// Distributed barrier
44
45// Compute how many threads to have polling each cache-line.
46// We want to limit the number of writes to IDEAL_GO_RESOLUTION.
47void distributedBarrier::computeVarsForN(size_t n) {
48 int nsockets = 1;
49 if (__kmp_topology) {
50 int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
51 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
52 int ncores_per_socket =
53 __kmp_topology->calculate_ratio(core_level, socket_level);
54 nsockets = __kmp_topology->get_count(socket_level);
55
56 if (nsockets <= 0)
57 nsockets = 1;
58 if (ncores_per_socket <= 0)
59 ncores_per_socket = 1;
60
61 threads_per_go = ncores_per_socket >> 1;
62 if (!fix_threads_per_go) {
63 // Minimize num_gos
64 if (threads_per_go > 4) {
65 if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
66 threads_per_go = threads_per_go >> 1;
67 }
68 if (threads_per_go > 4 && nsockets == 1)
69 threads_per_go = threads_per_go >> 1;
70 }
71 }
72 if (threads_per_go == 0)
73 threads_per_go = 1;
74 fix_threads_per_go = true;
75 num_gos = n / threads_per_go;
76 if (n % threads_per_go)
77 num_gos++;
78 if (nsockets == 1 || num_gos == 1)
79 num_groups = 1;
80 else {
81 num_groups = num_gos / nsockets;
82 if (num_gos % nsockets)
83 num_groups++;
84 }
85 if (num_groups <= 0)
86 num_groups = 1;
87 gos_per_group = num_gos / num_groups;
88 if (num_gos % num_groups)
89 gos_per_group++;
90 threads_per_group = threads_per_go * gos_per_group;
91 } else {
92 num_gos = n / threads_per_go;
93 if (n % threads_per_go)
94 num_gos++;
95 if (num_gos == 1)
96 num_groups = 1;
97 else {
98 num_groups = num_gos / 2;
99 if (num_gos % 2)
100 num_groups++;
101 }
102 gos_per_group = num_gos / num_groups;
103 if (num_gos % num_groups)
104 gos_per_group++;
105 threads_per_group = threads_per_go * gos_per_group;
106 }
107}
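// Worked example (illustrative numbers, not from the source): on a machine
// reported as 2 sockets x 16 cores, threads_per_go starts at 16 >> 1 = 8.
// For n = 64 threads this yields num_gos = 8 go flags, num_groups = 8 / 2
// sockets = 4, gos_per_group = 2, and threads_per_group = 16, so each group
// leader waits on 16 per-thread stillNeed flags and later releases its group
// through 2 go flags.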
108
109void distributedBarrier::computeGo(size_t n) {
110 // Minimize num_gos
111 for (num_gos = 1;; num_gos++)
112 if (IDEAL_CONTENTION * num_gos >= n)
113 break;
114 threads_per_go = n / num_gos;
115 if (n % num_gos)
116 threads_per_go++;
117 while (num_gos > MAX_GOS) {
118 threads_per_go++;
119 num_gos = n / threads_per_go;
120 if (n % threads_per_go)
121 num_gos++;
122 }
123 computeVarsForN(n);
124}
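// Worked example (illustrative; assumes an IDEAL_CONTENTION of 16 waiters
// per go flag): for n = 100 the first loop stops at num_gos = 7 because
// 16 * 7 = 112 >= 100, giving threads_per_go = 100 / 7 rounded up = 15; the
// trailing while loop only runs if that first guess left more than MAX_GOS
// go flags.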
125
126// This function is to resize the barrier arrays when the new number of threads
127// exceeds max_threads, which is the current size of all the arrays
128 void distributedBarrier::resize(size_t nthr) {
129 KMP_DEBUG_ASSERT(nthr > max_threads);
130
131 // expand to requested size * 2
132 max_threads = nthr * 2;
133
134 // allocate arrays to new max threads
135 for (int i = 0; i < MAX_ITERS; ++i) {
136 if (flags[i])
137 flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
138 max_threads * sizeof(flags_s));
139 else
140 flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
141 }
142
143 if (go)
144 go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
145 else
146 go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
147
148 if (iter)
149 iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
150 else
151 iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
152
153 if (sleep)
154 sleep =
155 (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
156 else
157 sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
158}
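// NOTE (editorial): growth is geometric -- a request for nthr slots sizes
// every array to 2 * nthr entries -- so another reallocation is not needed
// until the team grows past twice the current request, and
// KMP_INTERNAL_REALLOC preserves the flag/go/iter state already recorded for
// existing threads.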
159
160// This function is to set all the go flags that threads might be waiting
161// on, and when blocktime is not infinite, it should be followed by a wake-up
162// call to each thread
163 kmp_uint64 distributedBarrier::go_release() {
164 kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
165 for (size_t j = 0; j < num_gos; j++) {
166 go[j].go.store(next_go);
167 }
168 return next_go;
169}
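// NOTE (editorial): the go flags are versioned rather than toggled. A waiter
// expects its current iteration index plus MAX_ITERS, and iteration indices
// advance modulo MAX_ITERS (see the release path below), so two adjacent
// barrier rounds never wait for the same go value.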
170
171 void distributedBarrier::go_reset() {
172 for (size_t j = 0; j < max_threads; ++j) {
173 for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
174 flags[i][j].stillNeed = 1;
175 }
176 go[j].go.store(0);
177 iter[j].iter = 0;
178 }
179}
180
181// This function inits/re-inits the distributed barrier for a particular number
182// of threads. If a resize of arrays is needed, it calls the resize function.
183void distributedBarrier::init(size_t nthr) {
184 size_t old_max = max_threads;
185 if (nthr > max_threads) { // need more space in arrays
186 resize(nthr);
187 }
188
189 for (size_t i = 0; i < max_threads; i++) {
190 for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
191 flags[j][i].stillNeed = 1;
192 }
193 go[i].go.store(0);
194 iter[i].iter = 0;
195 if (i >= old_max)
196 sleep[i].sleep = false;
197 }
198
199 // Recalculate num_gos, etc. based on new nthr
200 computeVarsForN(nthr);
201
202 num_threads = nthr;
203
204 if (team_icvs == NULL)
205 team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
206 }
207
208 void distributedBarrier::deallocate(distributedBarrier *db) {
209 for (int i = 0; i < MAX_ITERS; ++i) {
210 if (db->flags[i])
211 KMP_INTERNAL_FREE(db->flags[i]);
212 db->flags[i] = NULL;
213 }
214 if (db->go) {
215 KMP_INTERNAL_FREE(db->go);
216 db->go = NULL;
217 }
218 if (db->iter) {
219 KMP_INTERNAL_FREE(db->iter);
220 db->iter = NULL;
221 }
222 if (db->sleep) {
223 KMP_INTERNAL_FREE(db->sleep);
224 db->sleep = NULL;
225 }
226 if (db->team_icvs) {
227 __kmp_free(db->team_icvs);
228 db->team_icvs = NULL;
229 }
231}
232
233// This function is used only when KMP_BLOCKTIME is not infinite.
234// static
235 void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
236 size_t start, size_t stop, size_t inc,
237 size_t tid) {
238 KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
239 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
240 return;
241
242 kmp_info_t **other_threads = team->t.t_threads;
243 for (size_t thr = start; thr < stop; thr += inc) {
244 KMP_DEBUG_ASSERT(other_threads[thr]);
245 int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
246 // Wake up the worker regardless of whether it appears to be sleeping
247 __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
248 }
249}
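// NOTE (editorial): the (start, stop, inc) triple lets this one helper serve
// both fan-out levels used below -- the primary wakes only the group leaders
// by stepping with inc = threads_per_group, while each group leader wakes its
// own members with inc = 1.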
250
251 static void __kmp_dist_barrier_gather(
252 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
253 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
254 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
255 kmp_team_t *team;
256 distributedBarrier *b;
257 kmp_info_t **other_threads;
258 kmp_uint64 my_current_iter, my_next_iter;
259 kmp_uint32 nproc;
260 bool group_leader;
261
262 team = this_thr->th.th_team;
263 nproc = this_thr->th.th_team_nproc;
264 other_threads = team->t.t_threads;
265 b = team->t.b;
266 my_current_iter = b->iter[tid].iter;
267 my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
268 group_leader = ((tid % b->threads_per_group) == 0);
269
270 KA_TRACE(20,
271 ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
272 gtid, team->t.t_id, tid, bt));
273
274#if USE_ITT_BUILD && USE_ITT_NOTIFY
275 // Barrier imbalance - save arrive time to the thread
276 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
277 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
278 __itt_get_timestamp();
279 }
280#endif
281
282 if (group_leader) {
283 // Start from the thread after the group leader
284 size_t group_start = tid + 1;
285 size_t group_end = tid + b->threads_per_group;
286 size_t threads_pending = 0;
287
288 if (group_end > nproc)
289 group_end = nproc;
290 do { // wait for threads in my group
291 threads_pending = 0;
292 // Check all the flags every time to avoid branch mispredicts
293 for (size_t thr = group_start; thr < group_end; thr++) {
294 // Each thread uses a different cache line
295 threads_pending += b->flags[my_current_iter][thr].stillNeed;
296 }
297 // Execute tasks here
298 if (__kmp_tasking_mode != tskm_immediate_exec) {
299 kmp_task_team_t *task_team = this_thr->th.th_task_team;
300 if (task_team != NULL) {
301 if (TCR_SYNC_4(task_team->tt.tt_active)) {
302 if (KMP_TASKING_ENABLED(task_team)) {
303 int tasks_completed = FALSE;
304 __kmp_atomic_execute_tasks_64(
305 this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
306 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
307 } else
308 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
309 }
310 } else {
311 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
312 } // if
313 }
314 if (TCR_4(__kmp_global.g.g_done)) {
315 if (__kmp_global.g.g_abort)
316 __kmp_abort_thread();
317 break;
318 } else if (__kmp_tasking_mode != tskm_immediate_exec &&
319 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
320 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
321 }
322 } while (threads_pending > 0);
323
324 if (reduce) { // Perform reduction if needed
325 OMPT_REDUCTION_DECL(this_thr, gtid);
326 OMPT_REDUCTION_BEGIN;
327 // Group leader reduces all threads in group
328 for (size_t thr = group_start; thr < group_end; thr++) {
329 (*reduce)(this_thr->th.th_local.reduce_data,
330 other_threads[thr]->th.th_local.reduce_data);
331 }
332 OMPT_REDUCTION_END;
333 }
334
335 // Set flag for next iteration
336 b->flags[my_next_iter][tid].stillNeed = 1;
337 // Each thread uses a different cache line; resets stillNeed to 0 to
338 // indicate it has reached the barrier
339 b->flags[my_current_iter][tid].stillNeed = 0;
340
341 do { // wait for all group leaders
342 threads_pending = 0;
343 for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
344 threads_pending += b->flags[my_current_iter][thr].stillNeed;
345 }
346 // Execute tasks here
347 if (__kmp_tasking_mode != tskm_immediate_exec) {
348 kmp_task_team_t *task_team = this_thr->th.th_task_team;
349 if (task_team != NULL) {
350 if (TCR_SYNC_4(task_team->tt.tt_active)) {
351 if (KMP_TASKING_ENABLED(task_team)) {
352 int tasks_completed = FALSE;
353 __kmp_atomic_execute_tasks_64(
354 this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
355 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
356 } else
357 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
358 }
359 } else {
360 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
361 } // if
362 }
363 if (TCR_4(__kmp_global.g.g_done)) {
364 if (__kmp_global.g.g_abort)
365 __kmp_abort_thread();
366 break;
367 } else if (__kmp_tasking_mode != tskm_immediate_exec &&
368 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
369 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
370 }
371 } while (threads_pending > 0);
372
373 if (reduce) { // Perform reduction if needed
374 if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
375 OMPT_REDUCTION_DECL(this_thr, gtid);
376 OMPT_REDUCTION_BEGIN;
377 for (size_t thr = b->threads_per_group; thr < nproc;
378 thr += b->threads_per_group) {
379 (*reduce)(this_thr->th.th_local.reduce_data,
380 other_threads[thr]->th.th_local.reduce_data);
381 }
382 OMPT_REDUCTION_END;
383 }
384 }
385 } else {
386 // Set flag for next iteration
387 b->flags[my_next_iter][tid].stillNeed = 1;
388 // Each thread uses a different cache line; resets stillNeed to 0 to
389 // indicate it has reached the barrier
390 b->flags[my_current_iter][tid].stillNeed = 0;
391 }
392
393 KMP_MFENCE();
394
395 KA_TRACE(20,
396 ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
397 gtid, team->t.t_id, tid, bt));
398}
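// NOTE (editorial), summarizing the gather above: a plain worker only clears
// its stillNeed flag for the current iteration (and pre-arms the next one); a
// group leader first spins until every member of its group has done so,
// optionally folding their reduce_data into its own, then clears its flag and
// spins again until every other group leader has checked in, with the primary
// (tid 0) also reducing across group leaders. Task execution is interleaved
// into both spin loops so waiting threads keep making tasking progress.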
399
400 static void __kmp_dist_barrier_release(
401 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
402 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
403 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
404 kmp_team_t *team;
405 distributedBarrier *b;
406 kmp_bstate_t *thr_bar;
407 kmp_uint64 my_current_iter, next_go;
408 size_t my_go_index;
409 bool group_leader;
410
411 KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
412 gtid, tid, bt));
413
414 thr_bar = &this_thr->th.th_bar[bt].bb;
415
416 if (!KMP_MASTER_TID(tid)) {
417 // workers and non-master group leaders need to check their presence in team
418 do {
419 if (this_thr->th.th_used_in_team.load() != 1 &&
420 this_thr->th.th_used_in_team.load() != 3) {
421 // Thread is not in use in a team. Wait on location in tid's thread
422 // struct. The 0 value tells anyone looking that this thread is spinning
423 // or sleeping until this location becomes 3 again; 3 is the transition
424 // state to get to 1 which is waiting on go and being in the team
425 kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
426 if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
427 0) ||
428 this_thr->th.th_used_in_team.load() == 0) {
429 my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
430 }
431#if USE_ITT_BUILD && USE_ITT_NOTIFY
432 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
433 // In fork barrier where we could not get the object reliably
434 itt_sync_obj =
435 __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
436 // Cancel wait on previous parallel region...
437 __kmp_itt_task_starting(itt_sync_obj);
438
439 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
440 return;
441
442 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
443 if (itt_sync_obj != NULL)
444 // Call prepare as early as possible for "new" barrier
445 __kmp_itt_task_finished(itt_sync_obj);
446 } else
447#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
448 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
449 return;
450 }
451 if (this_thr->th.th_used_in_team.load() != 1 &&
452 this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
453 continue;
454 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
455 return;
456
457 // At this point, the thread thinks it is in use in a team, or in
458 // transition to be used in a team, but it might have reached this barrier
459 // before it was marked unused by the team. Unused threads are awoken and
460 // shifted to wait on local thread struct elsewhere. It also might reach
461 // this point by being picked up for use by a different team. Either way,
462 // we need to update the tid.
463 tid = __kmp_tid_from_gtid(gtid);
464 team = this_thr->th.th_team;
465 KMP_DEBUG_ASSERT(tid >= 0);
466 KMP_DEBUG_ASSERT(team);
467 b = team->t.b;
468 my_current_iter = b->iter[tid].iter;
469 next_go = my_current_iter + distributedBarrier::MAX_ITERS;
470 my_go_index = tid / b->threads_per_go;
471 if (this_thr->th.th_used_in_team.load() == 3) {
472 (void)KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3,
473 1);
474 }
475 // Check if go flag is set
476 if (b->go[my_go_index].go.load() != next_go) {
477 // Wait on go flag on team
478 kmp_atomic_flag_64<false, true> my_flag(
479 &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
480 my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
481 KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
482 b->iter[tid].iter == 0);
483 KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
484 }
485
486 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
487 return;
488 // At this point, the thread's go location was set. This means the primary
489 // thread is safely in the barrier, and so this thread's data is
490 // up-to-date, but we should check again that this thread is really in
491 // use in the team, as it could have been woken up for the purpose of
492 // changing team size, or reaping threads at shutdown.
493 if (this_thr->th.th_used_in_team.load() == 1)
494 break;
495 } while (1);
496
497 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
498 return;
499
500 group_leader = ((tid % b->threads_per_group) == 0);
501 if (group_leader) {
502 // Tell all the threads in my group they can go!
503 for (size_t go_idx = my_go_index + 1;
504 go_idx < my_go_index + b->gos_per_group; go_idx++) {
505 b->go[go_idx].go.store(next_go);
506 }
507 // Fence added so that workers can see changes to go. sfence inadequate.
508 KMP_MFENCE();
509 }
510
511#if KMP_BARRIER_ICV_PUSH
512 if (propagate_icvs) { // copy ICVs to final dest
513 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
514 tid, FALSE);
515 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
516 (kmp_internal_control_t *)team->t.b->team_icvs);
517 copy_icvs(&thr_bar->th_fixed_icvs,
518 &team->t.t_implicit_task_taskdata[tid].td_icvs);
519 }
520#endif
521 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
522 // This thread is now awake and participating in the barrier;
523 // wake up the other threads in the group
524 size_t nproc = this_thr->th.th_team_nproc;
525 size_t group_end = tid + b->threads_per_group;
526 if (nproc < group_end)
527 group_end = nproc;
528 __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
529 }
530 } else { // Primary thread
531 team = this_thr->th.th_team;
532 b = team->t.b;
533 my_current_iter = b->iter[tid].iter;
534 next_go = my_current_iter + distributedBarrier::MAX_ITERS;
535#if KMP_BARRIER_ICV_PUSH
536 if (propagate_icvs) {
537 // primary thread has ICVs in final destination; copy
538 copy_icvs(&thr_bar->th_fixed_icvs,
539 &team->t.t_implicit_task_taskdata[tid].td_icvs);
540 }
541#endif
542 // Tell all the group leaders they can go!
543 for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
544 b->go[go_idx].go.store(next_go);
545 }
546
547 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
548 // Wake-up the group leaders
549 size_t nproc = this_thr->th.th_team_nproc;
550 __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
551 b->threads_per_group, tid);
552 }
553
554 // Tell all the threads in my group they can go!
555 for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
556 b->go[go_idx].go.store(next_go);
557 }
558
559 // Fence added so that workers can see changes to go. sfence inadequate.
560 KMP_MFENCE();
561
562 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
563 // Wake-up the other threads in my group
564 size_t nproc = this_thr->th.th_team_nproc;
565 size_t group_end = tid + b->threads_per_group;
566 if (nproc < group_end)
567 group_end = nproc;
568 __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
569 }
570 }
571 // Update to next iteration
572 KMP_ASSERT(my_current_iter == b->iter[tid].iter);
573 b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
574
575 KA_TRACE(
576 20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
577 gtid, team->t.t_id, tid, bt));
578}
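// NOTE (editorial), summarizing the release above: the primary publishes the
// new go value to the first go flag of every group, each group leader copies
// it to the remaining go flags of its own group, and when blocktime is finite
// the stores are followed by explicit wake-ups (group leaders first, then
// group members). A worker that was parked on th_used_in_team re-reads its
// tid and team before trusting any barrier state.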
579
580// Linear Barrier
581template <bool cancellable = false>
582 static bool __kmp_linear_barrier_gather_template(
583 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
584 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
585 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
586 kmp_team_t *team = this_thr->th.th_team;
587 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
588 kmp_info_t **other_threads = team->t.t_threads;
589
590 KA_TRACE(
591 20,
592 ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
593 gtid, team->t.t_id, tid, bt));
594 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
595
596#if USE_ITT_BUILD && USE_ITT_NOTIFY
597 // Barrier imbalance - save arrive time to the thread
598 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
599 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
600 __itt_get_timestamp();
601 }
602#endif
603 // We now perform a linear reduction to signal that all of the threads have
604 // arrived.
605 if (!KMP_MASTER_TID(tid)) {
606 KA_TRACE(20,
607 ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
608 "arrived(%p): %llu => %llu\n",
609 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
610 team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
611 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
612 // Mark arrival to primary thread
613 /* After performing this write, a worker thread may not assume that the team
614 is valid any more - it could be deallocated by the primary thread at any
615 time. */
616 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
617 flag.release();
618 } else {
619 kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
620 int nproc = this_thr->th.th_team_nproc;
621 int i;
622 // Don't have to worry about sleep bit here or atomic since team setting
623 kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
624
625 // Collect all the worker team member threads.
626 for (i = 1; i < nproc; ++i) {
627#if KMP_CACHE_MANAGE
628 // Prefetch next thread's arrived count
629 if (i + 1 < nproc)
630 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
631#endif /* KMP_CACHE_MANAGE */
632 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
633 "arrived(%p) == %llu\n",
634 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
635 team->t.t_id, i,
636 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
637
638 // Wait for worker thread to arrive
639 if (cancellable) {
640 kmp_flag_64<true, false> flag(
641 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
642 if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
643 return true;
644 } else {
645 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
646 new_state);
647 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
648 }
649#if USE_ITT_BUILD && USE_ITT_NOTIFY
650 // Barrier imbalance - write min of the thread time and the other thread
651 // time to the thread.
652 if (__kmp_forkjoin_frames_mode == 2) {
653 this_thr->th.th_bar_min_time = KMP_MIN(
654 this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
655 }
656#endif
657 if (reduce) {
658 KA_TRACE(100,
659 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
660 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
661 team->t.t_id, i));
662 OMPT_REDUCTION_DECL(this_thr, gtid);
663 OMPT_REDUCTION_BEGIN;
664 (*reduce)(this_thr->th.th_local.reduce_data,
665 other_threads[i]->th.th_local.reduce_data);
666 OMPT_REDUCTION_END;
667 }
668 }
669 // Don't have to worry about sleep bit here or atomic since team setting
670 team_bar->b_arrived = new_state;
671 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
672 "arrived(%p) = %llu\n",
673 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
674 new_state));
675 }
676 KA_TRACE(
677 20,
678 ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
679 gtid, team->t.t_id, tid, bt));
680 return false;
681}
682
683template <bool cancellable = false>
684 static bool __kmp_linear_barrier_release_template(
685 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
686 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
687 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
688 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
689 kmp_team_t *team;
690
691 if (KMP_MASTER_TID(tid)) {
692 unsigned int i;
693 kmp_uint32 nproc = this_thr->th.th_team_nproc;
694 kmp_info_t **other_threads;
695
696 team = __kmp_threads[gtid]->th.th_team;
697 KMP_DEBUG_ASSERT(team != NULL);
698 other_threads = team->t.t_threads;
699
700 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
701 "barrier type %d\n",
702 gtid, team->t.t_id, tid, bt));
703
704 if (nproc > 1) {
705#if KMP_BARRIER_ICV_PUSH
706 {
707 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
708 if (propagate_icvs) {
709 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
710 for (i = 1; i < nproc; ++i) {
711 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
712 team, i, FALSE);
713 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
714 &team->t.t_implicit_task_taskdata[0].td_icvs);
715 }
716 ngo_sync();
717 }
718 }
719#endif // KMP_BARRIER_ICV_PUSH
720
721 // Now, release all of the worker threads
722 for (i = 1; i < nproc; ++i) {
723#if KMP_CACHE_MANAGE
724 // Prefetch next thread's go flag
725 if (i + 1 < nproc)
726 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
727#endif /* KMP_CACHE_MANAGE */
728 KA_TRACE(
729 20,
730 ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
731 "go(%p): %u => %u\n",
732 gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
733 team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
734 other_threads[i]->th.th_bar[bt].bb.b_go,
735 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
736 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
737 other_threads[i]);
738 flag.release();
739 }
740 }
741 } else { // Wait for the PRIMARY thread to release us
742 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
743 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
744 if (cancellable) {
745 kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
746 if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
747 return true;
748 } else {
749 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
750 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
751 }
752#if USE_ITT_BUILD && USE_ITT_NOTIFY
753 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
754 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
755 // disabled)
756 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
757 // Cancel wait on previous parallel region...
758 __kmp_itt_task_starting(itt_sync_obj);
759
760 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
761 return false;
762
763 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
764 if (itt_sync_obj != NULL)
765 // Call prepare as early as possible for "new" barrier
766 __kmp_itt_task_finished(itt_sync_obj);
767 } else
768#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
769 // Early exit for reaping threads releasing forkjoin barrier
770 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
771 return false;
772// The worker thread may now assume that the team is valid.
773#ifdef KMP_DEBUG
774 tid = __kmp_tid_from_gtid(gtid);
775 team = __kmp_threads[gtid]->th.th_team;
776#endif
777 KMP_DEBUG_ASSERT(team != NULL);
778 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
779 KA_TRACE(20,
780 ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
781 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
782 KMP_MB(); // Flush all pending memory write invalidates.
783 }
784 KA_TRACE(
785 20,
786 ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
787 gtid, team->t.t_id, tid, bt));
788 return false;
789}
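// NOTE (editorial): the linear algorithm is the simplest and least scalable
// variant -- the primary performs O(nproc) flag waits in the gather and
// O(nproc) releases here, while each worker touches only its own b_arrived
// and b_go. The cancellable instantiations below return true when the wait
// was interrupted by a cancellation request.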
790
791 static void __kmp_linear_barrier_gather(
792 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
793 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
794 __kmp_linear_barrier_gather_template<false>(
795 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
796 }
797
798 static bool __kmp_linear_barrier_gather_cancellable(
799 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
800 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
801 return __kmp_linear_barrier_gather_template<true>(
802 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
803 }
804
805 static void __kmp_linear_barrier_release(
806 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
807 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
808 __kmp_linear_barrier_release_template<false>(
809 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
810 }
811
812 static bool __kmp_linear_barrier_release_cancellable(
813 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
814 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
815 return __kmp_linear_barrier_release_template<true>(
816 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
817 }
818
819// Tree barrier
820 static void __kmp_tree_barrier_gather(
821 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
822 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
823 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
824 kmp_team_t *team = this_thr->th.th_team;
825 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
826 kmp_info_t **other_threads = team->t.t_threads;
827 kmp_uint32 nproc = this_thr->th.th_team_nproc;
828 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
829 kmp_uint32 branch_factor = 1 << branch_bits;
830 kmp_uint32 child;
831 kmp_uint32 child_tid;
832 kmp_uint64 new_state = 0;
833
834 KA_TRACE(
835 20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
836 gtid, team->t.t_id, tid, bt));
837 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
838
839#if USE_ITT_BUILD && USE_ITT_NOTIFY
840 // Barrier imbalance - save arrive time to the thread
841 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
842 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
843 __itt_get_timestamp();
844 }
845#endif
846 // Perform tree gather to wait until all threads have arrived; reduce any
847 // required data as we go
848 child_tid = (tid << branch_bits) + 1;
849 if (child_tid < nproc) {
850 // Parent threads wait for all their children to arrive
851 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
852 child = 1;
853 do {
854 kmp_info_t *child_thr = other_threads[child_tid];
855 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
856#if KMP_CACHE_MANAGE
857 // Prefetch next thread's arrived count
858 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
859 KMP_CACHE_PREFETCH(
860 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
861#endif /* KMP_CACHE_MANAGE */
862 KA_TRACE(20,
863 ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
864 "arrived(%p) == %llu\n",
865 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
866 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
867 // Wait for child to arrive
868 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
869 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
870#if USE_ITT_BUILD && USE_ITT_NOTIFY
871 // Barrier imbalance - write min of the thread time and a child time to
872 // the thread.
873 if (__kmp_forkjoin_frames_mode == 2) {
874 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
875 child_thr->th.th_bar_min_time);
876 }
877#endif
878 if (reduce) {
879 KA_TRACE(100,
880 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
881 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
882 team->t.t_id, child_tid));
883 OMPT_REDUCTION_DECL(this_thr, gtid);
884 OMPT_REDUCTION_BEGIN;
885 (*reduce)(this_thr->th.th_local.reduce_data,
886 child_thr->th.th_local.reduce_data);
887 OMPT_REDUCTION_END;
888 }
889 child++;
890 child_tid++;
891 } while (child <= branch_factor && child_tid < nproc);
892 }
893
894 if (!KMP_MASTER_TID(tid)) { // Worker threads
895 kmp_int32 parent_tid = (tid - 1) >> branch_bits;
896
897 KA_TRACE(20,
898 ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
899 "arrived(%p): %llu => %llu\n",
900 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
901 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
902 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
903
904 // Mark arrival to parent thread
905 /* After performing this write, a worker thread may not assume that the team
906 is valid any more - it could be deallocated by the primary thread at any
907 time. */
908 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
909 flag.release();
910 } else {
911 // Need to update the team arrived pointer if we are the primary thread
912 if (nproc > 1) // New value was already computed above
913 team->t.t_bar[bt].b_arrived = new_state;
914 else
915 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
916 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
917 "arrived(%p) = %llu\n",
918 gtid, team->t.t_id, tid, team->t.t_id,
919 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
920 }
921 KA_TRACE(20,
922 ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
923 gtid, team->t.t_id, tid, bt));
924}
925
926 static void __kmp_tree_barrier_release(
927 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
928 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
929 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
930 kmp_team_t *team;
931 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
932 kmp_uint32 nproc;
933 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
934 kmp_uint32 branch_factor = 1 << branch_bits;
935 kmp_uint32 child;
936 kmp_uint32 child_tid;
937
938 // Perform a tree release for all of the threads that have been gathered
939 if (!KMP_MASTER_TID(
940 tid)) { // Handle fork barrier workers who aren't part of a team yet
941 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
942 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
943 // Wait for parent thread to release us
944 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
945 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
946#if USE_ITT_BUILD && USE_ITT_NOTIFY
947 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
948 // In fork barrier where we could not get the object reliably (or
949 // ITTNOTIFY is disabled)
950 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
951 // Cancel wait on previous parallel region...
952 __kmp_itt_task_starting(itt_sync_obj);
953
954 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
955 return;
956
957 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
958 if (itt_sync_obj != NULL)
959 // Call prepare as early as possible for "new" barrier
960 __kmp_itt_task_finished(itt_sync_obj);
961 } else
962#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
963 // Early exit for reaping threads releasing forkjoin barrier
964 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
965 return;
966
967 // The worker thread may now assume that the team is valid.
968 team = __kmp_threads[gtid]->th.th_team;
969 KMP_DEBUG_ASSERT(team != NULL);
970 tid = __kmp_tid_from_gtid(gtid);
971
972 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
973 KA_TRACE(20,
974 ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
975 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
976 KMP_MB(); // Flush all pending memory write invalidates.
977 } else {
978 team = __kmp_threads[gtid]->th.th_team;
979 KMP_DEBUG_ASSERT(team != NULL);
980 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
981 "barrier type %d\n",
982 gtid, team->t.t_id, tid, bt));
983 }
984 nproc = this_thr->th.th_team_nproc;
985 child_tid = (tid << branch_bits) + 1;
986
987 if (child_tid < nproc) {
988 kmp_info_t **other_threads = team->t.t_threads;
989 child = 1;
990 // Parent threads release all their children
991 do {
992 kmp_info_t *child_thr = other_threads[child_tid];
993 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
994#if KMP_CACHE_MANAGE
995 // Prefetch next thread's go count
996 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
997 KMP_CACHE_PREFETCH(
998 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
999#endif /* KMP_CACHE_MANAGE */
1000
1001#if KMP_BARRIER_ICV_PUSH
1002 {
1003 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
1004 if (propagate_icvs) {
1005 __kmp_init_implicit_task(team->t.t_ident,
1006 team->t.t_threads[child_tid], team,
1007 child_tid, FALSE);
1008 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
1009 &team->t.t_implicit_task_taskdata[0].td_icvs);
1010 }
1011 }
1012#endif // KMP_BARRIER_ICV_PUSH
1013 KA_TRACE(20,
1014 ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1015 "go(%p): %u => %u\n",
1016 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1017 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1018 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1019 // Release child from barrier
1020 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1021 flag.release();
1022 child++;
1023 child_tid++;
1024 } while (child <= branch_factor && child_tid < nproc);
1025 }
1026 KA_TRACE(
1027 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1028 gtid, team->t.t_id, tid, bt));
1029}
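// Worked example (illustrative): with branch_bits = 2 (branch_factor = 4) a
// thread's children are tids (tid << 2) + 1 .. (tid << 2) + 4 and its parent
// is (tid - 1) >> 2, so in a 24-thread team tid 3 gathers from and releases
// tids 13-16 while reporting its own arrival to tid 0.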
1030
1031// Hyper Barrier
1032 static void __kmp_hyper_barrier_gather(
1033 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1034 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1035 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
1036 kmp_team_t *team = this_thr->th.th_team;
1037 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1038 kmp_info_t **other_threads = team->t.t_threads;
1039 kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
1040 kmp_uint32 num_threads = this_thr->th.th_team_nproc;
1041 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
1042 kmp_uint32 branch_factor = 1 << branch_bits;
1043 kmp_uint32 offset;
1044 kmp_uint32 level;
1045
1046 KA_TRACE(
1047 20,
1048 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1049 gtid, team->t.t_id, tid, bt));
1050 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1051
1052#if USE_ITT_BUILD && USE_ITT_NOTIFY
1053 // Barrier imbalance - save arrive time to the thread
1054 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1055 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
1056 __itt_get_timestamp();
1057 }
1058#endif
1059 /* Perform a hypercube-embedded tree gather to wait until all of the threads
1060 have arrived, and reduce any required data as we go. */
1061 kmp_flag_64<> p_flag(&thr_bar->b_arrived);
1062 for (level = 0, offset = 1; offset < num_threads;
1063 level += branch_bits, offset <<= branch_bits) {
1064 kmp_uint32 child;
1065 kmp_uint32 child_tid;
1066
1067 if (((tid >> level) & (branch_factor - 1)) != 0) {
1068 kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
1069
1070 KMP_MB(); // Synchronize parent and child threads.
1071 KA_TRACE(20,
1072 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1073 "arrived(%p): %llu => %llu\n",
1074 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
1075 team->t.t_id, parent_tid, &thr_bar->b_arrived,
1076 thr_bar->b_arrived,
1077 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1078 // Mark arrival to parent thread
1079 /* After performing this write (in the last iteration of the enclosing for
1080 loop), a worker thread may not assume that the team is valid any more
1081 - it could be deallocated by the primary thread at any time. */
1082 p_flag.set_waiter(other_threads[parent_tid]);
1083 p_flag.release();
1084 break;
1085 }
1086
1087 // Parent threads wait for children to arrive
1088 if (new_state == KMP_BARRIER_UNUSED_STATE)
1089 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1090 for (child = 1, child_tid = tid + (1 << level);
1091 child < branch_factor && child_tid < num_threads;
1092 child++, child_tid += (1 << level)) {
1093 kmp_info_t *child_thr = other_threads[child_tid];
1094 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1095#if KMP_CACHE_MANAGE
1096 kmp_uint32 next_child_tid = child_tid + (1 << level);
1097 // Prefetch next thread's arrived count
1098 if (child + 1 < branch_factor && next_child_tid < num_threads)
1099 KMP_CACHE_PREFETCH(
1100 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
1101#endif /* KMP_CACHE_MANAGE */
1102 KA_TRACE(20,
1103 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1104 "arrived(%p) == %llu\n",
1105 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1106 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
1107 // Wait for child to arrive
1108 kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
1109 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1110 KMP_MB(); // Synchronize parent and child threads.
1111#if USE_ITT_BUILD && USE_ITT_NOTIFY
1112 // Barrier imbalance - write min of the thread time and a child time to
1113 // the thread.
1114 if (__kmp_forkjoin_frames_mode == 2) {
1115 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
1116 child_thr->th.th_bar_min_time);
1117 }
1118#endif
1119 if (reduce) {
1120 KA_TRACE(100,
1121 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1122 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1123 team->t.t_id, child_tid));
1124 OMPT_REDUCTION_DECL(this_thr, gtid);
1125 OMPT_REDUCTION_BEGIN;
1126 (*reduce)(this_thr->th.th_local.reduce_data,
1127 child_thr->th.th_local.reduce_data);
1128 OMPT_REDUCTION_END;
1129 }
1130 }
1131 }
1132
1133 if (KMP_MASTER_TID(tid)) {
1134 // Need to update the team arrived pointer if we are the primary thread
1135 if (new_state == KMP_BARRIER_UNUSED_STATE)
1136 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
1137 else
1138 team->t.t_bar[bt].b_arrived = new_state;
1139 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1140 "arrived(%p) = %llu\n",
1141 gtid, team->t.t_id, tid, team->t.t_id,
1142 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1143 }
1144 KA_TRACE(
1145 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1146 gtid, team->t.t_id, tid, bt));
1147}
1148
1149// The reverse versions seem to beat the forward versions overall
1150#define KMP_REVERSE_HYPER_BAR
1151 static void __kmp_hyper_barrier_release(
1152 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1153 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1154 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
1155 kmp_team_t *team;
1156 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1157 kmp_info_t **other_threads;
1158 kmp_uint32 num_threads;
1159 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
1160 kmp_uint32 branch_factor = 1 << branch_bits;
1161 kmp_uint32 child;
1162 kmp_uint32 child_tid;
1163 kmp_uint32 offset;
1164 kmp_uint32 level;
1165
1166 /* Perform a hypercube-embedded tree release for all of the threads that have
1167 been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
1168 are released in the reverse order of the corresponding gather, otherwise
1169 threads are released in the same order. */
1170 if (KMP_MASTER_TID(tid)) { // primary thread
1171 team = __kmp_threads[gtid]->th.th_team;
1172 KMP_DEBUG_ASSERT(team != NULL);
1173 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1174 "barrier type %d\n",
1175 gtid, team->t.t_id, tid, bt));
1176#if KMP_BARRIER_ICV_PUSH
1177 if (propagate_icvs) { // primary already has ICVs in final destination; copy
1178 copy_icvs(&thr_bar->th_fixed_icvs,
1179 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1180 }
1181#endif
1182 } else { // Handle fork barrier workers who aren't part of a team yet
1183 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
1184 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
1185 // Wait for parent thread to release us
1186 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1187 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1188#if USE_ITT_BUILD && USE_ITT_NOTIFY
1189 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
1190 // In fork barrier where we could not get the object reliably
1191 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
1192 // Cancel wait on previous parallel region...
1193 __kmp_itt_task_starting(itt_sync_obj);
1194
1195 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1196 return;
1197
1198 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1199 if (itt_sync_obj != NULL)
1200 // Call prepare as early as possible for "new" barrier
1201 __kmp_itt_task_finished(itt_sync_obj);
1202 } else
1203#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1204 // Early exit for reaping threads releasing forkjoin barrier
1205 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1206 return;
1207
1208 // The worker thread may now assume that the team is valid.
1209 team = __kmp_threads[gtid]->th.th_team;
1210 KMP_DEBUG_ASSERT(team != NULL);
1211 tid = __kmp_tid_from_gtid(gtid);
1212
1213 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1214 KA_TRACE(20,
1215 ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1216 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1217 KMP_MB(); // Flush all pending memory write invalidates.
1218 }
1219 num_threads = this_thr->th.th_team_nproc;
1220 other_threads = team->t.t_threads;
1221
1222#ifdef KMP_REVERSE_HYPER_BAR
1223 // Count up to correct level for parent
1224 for (level = 0, offset = 1;
1225 offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
1226 level += branch_bits, offset <<= branch_bits)
1227 ;
1228
1229 // Now go down from there
1230 for (level -= branch_bits, offset >>= branch_bits; offset != 0;
1231 level -= branch_bits, offset >>= branch_bits)
1232#else
1233 // Go down the tree, level by level
1234 for (level = 0, offset = 1; offset < num_threads;
1235 level += branch_bits, offset <<= branch_bits)
1236#endif // KMP_REVERSE_HYPER_BAR
1237 {
1238#ifdef KMP_REVERSE_HYPER_BAR
1239 /* Now go in reverse order through the children, highest to lowest.
1240 Initial setting of child is conservative here. */
1241 child = num_threads >> ((level == 0) ? level : level - 1);
1242 for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
1243 child_tid = tid + (child << level);
1244 child >= 1; child--, child_tid -= (1 << level))
1245#else
1246 if (((tid >> level) & (branch_factor - 1)) != 0)
1247 // No need to go lower than this, since this is the level parent would be
1248 // notified
1249 break;
1250 // Iterate through children on this level of the tree
1251 for (child = 1, child_tid = tid + (1 << level);
1252 child < branch_factor && child_tid < num_threads;
1253 child++, child_tid += (1 << level))
1254#endif // KMP_REVERSE_HYPER_BAR
1255 {
1256 if (child_tid >= num_threads)
1257 continue; // Child doesn't exist so keep going
1258 else {
1259 kmp_info_t *child_thr = other_threads[child_tid];
1260 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1261#if KMP_CACHE_MANAGE
1262 kmp_uint32 next_child_tid = child_tid - (1 << level);
1263// Prefetch next thread's go count
1264#ifdef KMP_REVERSE_HYPER_BAR
1265 if (child - 1 >= 1 && next_child_tid < num_threads)
1266#else
1267 if (child + 1 < branch_factor && next_child_tid < num_threads)
1268#endif // KMP_REVERSE_HYPER_BAR
1269 KMP_CACHE_PREFETCH(
1270 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
1271#endif /* KMP_CACHE_MANAGE */
1272
1273#if KMP_BARRIER_ICV_PUSH
1274 if (propagate_icvs) // push my fixed ICVs to my child
1275 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1276#endif // KMP_BARRIER_ICV_PUSH
1277
1278 KA_TRACE(
1279 20,
1280 ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1281 "go(%p): %u => %u\n",
1282 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1283 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1284 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1285 // Release child from barrier
1286 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1287 flag.release();
1288 }
1289 }
1290 }
1291#if KMP_BARRIER_ICV_PUSH
1292 if (propagate_icvs &&
1293 !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
1294 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1295 FALSE);
1296 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1297 &thr_bar->th_fixed_icvs);
1298 }
1299#endif
1300 KA_TRACE(
1301 20,
1302 ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1303 gtid, team->t.t_id, tid, bt));
1304}
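// Worked example (illustrative): with branch_bits = 1 (branch_factor = 2) and
// 8 threads, the gather pairs (0,1) (2,3) (4,5) (6,7) at level 0, then sends
// 2->0 and 6->4 at level 1 and 4->0 at level 2; the release above walks the
// same hypercube edges, in reverse order when KMP_REVERSE_HYPER_BAR is
// defined.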
1305
1306// Hierarchical Barrier
1307
1308// Initialize thread barrier data
1309/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
1310 Performs the minimum amount of initialization required based on how the team
1311 has changed. Returns true if leaf children will require both on-core and
1312 traditional wake-up mechanisms. For example, if the team size increases,
1313 threads already in the team will respond to on-core wakeup on their parent
1314 thread, but threads newly added to the team will only be listening on
1315 their local b_go. */
1316 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
1317 kmp_bstate_t *thr_bar,
1318 kmp_uint32 nproc, int gtid,
1319 int tid, kmp_team_t *team) {
1320 // Checks to determine if (re-)initialization is needed
1321 bool uninitialized = thr_bar->team == NULL;
1322 bool team_changed = team != thr_bar->team;
1323 bool team_sz_changed = nproc != thr_bar->nproc;
1324 bool tid_changed = tid != thr_bar->old_tid;
1325 bool retval = false;
1326
1327 if (uninitialized || team_sz_changed) {
1328 __kmp_get_hierarchy(nproc, thr_bar);
1329 }
1330
1331 if (uninitialized || team_sz_changed || tid_changed) {
1332 thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
1333 thr_bar->parent_tid = -1; // default for primary thread
1334 if (!KMP_MASTER_TID(tid)) {
1335 // if not primary thread, find parent thread in hierarchy
1336 kmp_uint32 d = 0;
1337 while (d < thr_bar->depth) { // find parent based on level of thread in
1338 // hierarchy, and note level
1339 kmp_uint32 rem;
1340 if (d == thr_bar->depth - 2) { // reached level right below the primary
1341 thr_bar->parent_tid = 0;
1342 thr_bar->my_level = d;
1343 break;
1344 } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
1345 // TODO: can we make the above op faster?
1346 // thread is not a subtree root at next level, so this is max
1347 thr_bar->parent_tid = tid - rem;
1348 thr_bar->my_level = d;
1349 break;
1350 }
1351 ++d;
1352 }
1353 }
1354 __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
1355 (thr_bar->skip_per_level[thr_bar->my_level])),
1356 &(thr_bar->offset));
1357 thr_bar->old_tid = tid;
1358 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1359 thr_bar->team = team;
1360 thr_bar->parent_bar =
1361 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1362 }
1363 if (uninitialized || team_changed || tid_changed) {
1364 thr_bar->team = team;
1365 thr_bar->parent_bar =
1366 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1367 retval = true;
1368 }
1369 if (uninitialized || team_sz_changed || tid_changed) {
1370 thr_bar->nproc = nproc;
1371 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
1372 if (thr_bar->my_level == 0)
1373 thr_bar->leaf_kids = 0;
1374 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
1375 __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1376 thr_bar->leaf_state = 0;
1377 for (int i = 0; i < thr_bar->leaf_kids; ++i)
1378 ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
1379 }
1380 return retval;
1381}
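// NOTE (editorial): leaf_state marks one byte per on-core child inside the
// leader's 64-bit b_arrived word (filled from the top byte down, matching the
// [7 - i] indexing above), so up to 8 leaf children can check in with single
// byte stores and the leader waits for all of them with one 64-bit compare;
// offset appears to select which byte of the parent's flag this thread owns.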
1382
1383 static void __kmp_hierarchical_barrier_gather(
1384 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1385 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1386 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1387 kmp_team_t *team = this_thr->th.th_team;
1388 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1389 kmp_uint32 nproc = this_thr->th.th_team_nproc;
1390 kmp_info_t **other_threads = team->t.t_threads;
1391 kmp_uint64 new_state = 0;
1392
1393 int level = team->t.t_level;
1394 if (other_threads[0]
1395 ->th.th_teams_microtask) // are we inside the teams construct?
1396 if (this_thr->th.th_teams_size.nteams > 1)
1397 ++level; // level was not increased in teams construct for team_of_masters
1398 if (level == 1)
1399 thr_bar->use_oncore_barrier = 1;
1400 else
1401 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1402
1403 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1404 "barrier type %d\n",
1405 gtid, team->t.t_id, tid, bt));
1406 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1407
1408#if USE_ITT_BUILD && USE_ITT_NOTIFY
1409 // Barrier imbalance - save arrive time to the thread
1410 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1411 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1412 }
1413#endif
1414
1415 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1416 team);
1417
1418 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
1419 kmp_int32 child_tid;
1420 new_state =
1421 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1422 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1423 thr_bar->use_oncore_barrier) {
1424 if (thr_bar->leaf_kids) {
1425 // First, wait for leaf children to check-in on my b_arrived flag
1426 kmp_uint64 leaf_state =
1427 KMP_MASTER_TID(tid)
1428 ? thr_bar->b_arrived | thr_bar->leaf_state
1429 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
1430 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1431 "for leaf kids\n",
1432 gtid, team->t.t_id, tid));
1433 kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1434 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1435 if (reduce) {
1436 OMPT_REDUCTION_DECL(this_thr, gtid);
1437 OMPT_REDUCTION_BEGIN;
1438 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
1439 ++child_tid) {
1440 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1441 "T#%d(%d:%d)\n",
1442 gtid, team->t.t_id, tid,
1443 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1444 child_tid));
1445 (*reduce)(this_thr->th.th_local.reduce_data,
1446 other_threads[child_tid]->th.th_local.reduce_data);
1447 }
1448 OMPT_REDUCTION_END;
1449 }
1450 // clear leaf_state bits
1451 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1452 }
1453 // Next, wait for higher level children on each child's b_arrived flag
1454 for (kmp_uint32 d = 1; d < thr_bar->my_level;
1455 ++d) { // gather lowest level threads first, but skip 0
1456 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1457 skip = thr_bar->skip_per_level[d];
1458 if (last > nproc)
1459 last = nproc;
1460 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1461 kmp_info_t *child_thr = other_threads[child_tid];
1462 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1463 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1464 "T#%d(%d:%d) "
1465 "arrived(%p) == %llu\n",
1466 gtid, team->t.t_id, tid,
1467 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1468 child_tid, &child_bar->b_arrived, new_state));
1469 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1470 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1471 if (reduce) {
1472 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1473 "T#%d(%d:%d)\n",
1474 gtid, team->t.t_id, tid,
1475 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1476 child_tid));
1477 (*reduce)(this_thr->th.th_local.reduce_data,
1478 child_thr->th.th_local.reduce_data);
1479 }
1480 }
1481 }
1482 } else { // Blocktime is not infinite
1483 for (kmp_uint32 d = 0; d < thr_bar->my_level;
1484 ++d) { // Gather lowest level threads first
1485 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1486 skip = thr_bar->skip_per_level[d];
1487 if (last > nproc)
1488 last = nproc;
1489 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1490 kmp_info_t *child_thr = other_threads[child_tid];
1491 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1492 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1493 "T#%d(%d:%d) "
1494 "arrived(%p) == %llu\n",
1495 gtid, team->t.t_id, tid,
1496 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1497 child_tid, &child_bar->b_arrived, new_state));
1498 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1499 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1500 if (reduce) {
1501 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1502 "T#%d(%d:%d)\n",
1503 gtid, team->t.t_id, tid,
1504 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1505 child_tid));
1506 (*reduce)(this_thr->th.th_local.reduce_data,
1507 child_thr->th.th_local.reduce_data);
1508 }
1509 }
1510 }
1511 }
1512 }
1513 // All subordinates are gathered; now release parent if not primary thread
1514
1515 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1516 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1517 " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1518 gtid, team->t.t_id, tid,
1519 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1520 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1521 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1522 /* Mark arrival to parent: After performing this write, a worker thread may
1523 not assume that the team is valid any more - it could be deallocated by
1524 the primary thread at any time. */
1525 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1526 !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1527 // flag; release it
1528 kmp_flag_64<> flag(&thr_bar->b_arrived,
1529 other_threads[thr_bar->parent_tid]);
1530 flag.release();
1531 } else {
1532 // Leaf does special release on "offset" bits of parent's b_arrived flag
1533 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1534 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1535 thr_bar->offset + 1);
1536 flag.set_waiter(other_threads[thr_bar->parent_tid]);
1537 flag.release();
1538 }
1539 } else { // Primary thread needs to update the team's b_arrived value
1540 team->t.t_bar[bt].b_arrived = new_state;
1541 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1542 "arrived(%p) = %llu\n",
1543 gtid, team->t.t_id, tid, team->t.t_id,
1544 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1545 }
1546 // Is the team access below unsafe or just technically invalid?
1547 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1548 "barrier type %d\n",
1549 gtid, team->t.t_id, tid, bt));
1550}
1551
1552 static void __kmp_hierarchical_barrier_release(
1553 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1554 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1555 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1556 kmp_team_t *team;
1557 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1558 kmp_uint32 nproc;
1559 bool team_change = false; // indicates on-core barrier shouldn't be used
1560
1561 if (KMP_MASTER_TID(tid)) {
1562 team = __kmp_threads[gtid]->th.th_team;
1563 KMP_DEBUG_ASSERT(team != NULL);
1564 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1565 "entered barrier type %d\n",
1566 gtid, team->t.t_id, tid, bt));
1567 } else { // Worker threads
1568 // Wait for parent thread to release me
1569 if (!thr_bar->use_oncore_barrier ||
1570 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1571 thr_bar->team == NULL) {
1572 // Use traditional method of waiting on my own b_go flag
1573 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1574 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1575 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1576 TCW_8(thr_bar->b_go,
1577 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1578 } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1579 // infinite, not nested
1580 // Wait on my "offset" bits on parent's b_go flag
1581 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1582 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1583 thr_bar->offset + 1, bt,
1584 this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1585 flag.wait(this_thr, TRUE);
1586 if (thr_bar->wait_flag ==
1587 KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1588 TCW_8(thr_bar->b_go,
1589 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1590 } else { // Reset my bits on parent's b_go flag
1591 (RCAST(volatile char *,
1592 &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1593 }
1594 }
1595 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1596 // Early exit for reaping threads releasing forkjoin barrier
1597 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1598 return;
1599 // The worker thread may now assume that the team is valid.
1600 team = __kmp_threads[gtid]->th.th_team;
1601 KMP_DEBUG_ASSERT(team != NULL);
1602 tid = __kmp_tid_from_gtid(gtid);
1603
1604 KA_TRACE(
1605 20,
1606 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1607 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1608 KMP_MB(); // Flush all pending memory write invalidates.
1609 }
1610
1611 nproc = this_thr->th.th_team_nproc;
1612 int level = team->t.t_level;
1613 if (team->t.t_threads[0]
1614 ->th.th_teams_microtask) { // are we inside the teams construct?
1615 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1616 this_thr->th.th_teams_level == level)
1617 ++level; // level was not increased in teams construct for team_of_workers
1618 if (this_thr->th.th_teams_size.nteams > 1)
1619 ++level; // level was not increased in teams construct for team_of_masters
1620 }
1621 if (level == 1)
1622 thr_bar->use_oncore_barrier = 1;
1623 else
1624 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1625
1626 // If the team size has increased, we still communicate with old leaves via
1627 // oncore barrier.
1628 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1629 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1630 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1631 tid, team);
1632 // But if the entire team changes, we won't use oncore barrier at all
1633 if (team_change)
1634 old_leaf_kids = 0;
1635
1636#if KMP_BARRIER_ICV_PUSH
1637 if (propagate_icvs) {
1638 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1639 FALSE);
1640 if (KMP_MASTER_TID(
1641 tid)) { // primary already has copy in final destination; copy
1642 copy_icvs(&thr_bar->th_fixed_icvs,
1643 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1644 } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1645 thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1646 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1647 // leaves (on-core children) pull parent's fixed ICVs directly to local
1648 // ICV store
1649 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1650 &thr_bar->parent_bar->th_fixed_icvs);
1651 // non-leaves will get ICVs piggybacked with b_go via NGO store
1652 } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1653 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1654 // access
1655 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1656 else // leaves copy parent's fixed ICVs directly to local ICV store
1657 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1658 &thr_bar->parent_bar->th_fixed_icvs);
1659 }
1660 }
1661#endif // KMP_BARRIER_ICV_PUSH
1662
1663 // Now, release my children
1664 if (thr_bar->my_level) { // not a leaf
1665 kmp_int32 child_tid;
1666 kmp_uint32 last;
1667 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1668 thr_bar->use_oncore_barrier) {
1669 if (KMP_MASTER_TID(tid)) { // do a flat release
1670 // Set local b_go to bump children via NGO store of the cache line
1671 // containing ICVs and b_go.
1672 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1673 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1674 // the cache line
1675 ngo_load(&thr_bar->th_fixed_icvs);
1676 // This loops over all the threads skipping only the leaf nodes in the
1677 // hierarchy
1678 for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1679 child_tid += thr_bar->skip_per_level[1]) {
1680 kmp_bstate_t *child_bar =
1681 &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1682 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1683 "releasing T#%d(%d:%d)"
1684 " go(%p): %u => %u\n",
1685 gtid, team->t.t_id, tid,
1686 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1687 child_tid, &child_bar->b_go, child_bar->b_go,
1688 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1689 // Use ngo store (if available) to both store ICVs and release child
1690 // via child's b_go
1691 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1692 }
1693 ngo_sync();
1694 }
1695 TCW_8(thr_bar->b_go,
1696 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1697 // Now, release leaf children
1698 if (thr_bar->leaf_kids) { // if there are any
1699 // We test team_change on the off-chance that the level 1 team changed.
1700 if (team_change ||
1701 old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1702 if (old_leaf_kids) { // release old leaf kids
1703 thr_bar->b_go |= old_leaf_state;
1704 }
1705 // Release new leaf kids
1706 last = tid + thr_bar->skip_per_level[1];
1707 if (last > nproc)
1708 last = nproc;
1709 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1710 ++child_tid) { // skip_per_level[0]=1
1711 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1712 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1713 KA_TRACE(
1714 20,
1715 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1716 " T#%d(%d:%d) go(%p): %u => %u\n",
1717 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1718 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1719 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1720 // Release child using child's b_go flag
1721 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1722 flag.release();
1723 }
1724 } else { // Release all children at once with leaf_state bits on my own
1725 // b_go flag
1726 thr_bar->b_go |= thr_bar->leaf_state;
1727 }
1728 }
1729 } else { // Blocktime is not infinite; do a simple hierarchical release
1730 for (int d = thr_bar->my_level - 1; d >= 0;
1731 --d) { // Release highest level threads first
1732 last = tid + thr_bar->skip_per_level[d + 1];
1733 kmp_uint32 skip = thr_bar->skip_per_level[d];
1734 if (last > nproc)
1735 last = nproc;
1736 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1737 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1738 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1739 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1740 "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1741 gtid, team->t.t_id, tid,
1742 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1743 child_tid, &child_bar->b_go, child_bar->b_go,
1744 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1745 // Release child using child's b_go flag
1746 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1747 flag.release();
1748 }
1749 }
1750 }
1751#if KMP_BARRIER_ICV_PUSH
1752 if (propagate_icvs && !KMP_MASTER_TID(tid))
1753 // non-leaves copy ICVs from fixed ICVs to local dest
1754 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1755 &thr_bar->th_fixed_icvs);
1756#endif // KMP_BARRIER_ICV_PUSH
1757 }
1758 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1759 "barrier type %d\n",
1760 gtid, team->t.t_id, tid, bt));
1761}
1762
1763// End of Barrier Algorithms
1764
1765// type traits for cancellable value
1766// if cancellable is true, then is_cancellable is a normal boolean variable
1767// if cancellable is false, then is_cancellable is a compile time constant
1768template <bool cancellable> struct is_cancellable {};
1769template <> struct is_cancellable<true> {
1770 bool value;
1771 is_cancellable() : value(false) {}
1772 is_cancellable(bool b) : value(b) {}
1773 is_cancellable &operator=(bool b) {
1774 value = b;
1775 return *this;
1776 }
1777 operator bool() const { return value; }
1778};
1779template <> struct is_cancellable<false> {
1780 is_cancellable &operator=(bool b) { return *this; }
1781 constexpr operator bool() const { return false; }
1782};
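// Illustrative sketch (hypothetical helper, not part of the runtime): when a
// template is instantiated with cancellable == false, the constexpr conversion
// operator above makes every test of the flag a compile-time constant, so the
// cancellation branches are removed entirely; with cancellable == true the
// flag behaves like an ordinary bool.
template <bool cancellable> static int example_cancel_check() {
  is_cancellable<cancellable> cancelled;
  cancelled = true; // ignored by the <false> specialization
  if (cancelled)    // constant false when cancellable == false
    return 1;       // cancellation path, compiled away for <false>
  return 0;
}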
1783
1784// Internal function to do a barrier.
1785/* If is_split is true, do a split barrier, otherwise, do a plain barrier.
1786 If reduce is non-NULL, do a split reduction barrier, otherwise, do a
1787 barrier with no reduction step.
1788 When cancellable = false,
1789 Returns 0 if primary thread, 1 if worker thread.
1790 When cancellable = true
1791 Returns 0 if not cancelled, 1 if cancelled. */
1792template <bool cancellable = false>
1793static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1794 size_t reduce_size, void *reduce_data,
1795 void (*reduce)(void *, void *)) {
1796 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1797 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1798 int tid = __kmp_tid_from_gtid(gtid);
1799 kmp_info_t *this_thr = __kmp_threads[gtid];
1800 kmp_team_t *team = this_thr->th.th_team;
1801 int status = 0;
1802 is_cancellable<cancellable> cancelled;
1803#if OMPT_SUPPORT && OMPT_OPTIONAL
1804 ompt_data_t *my_task_data;
1805 ompt_data_t *my_parallel_data;
1806 void *return_address;
1807 ompt_sync_region_t barrier_kind;
1808#endif
1809
1810 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1811 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1812
1813#if OMPT_SUPPORT
1814 if (ompt_enabled.enabled) {
1815#if OMPT_OPTIONAL
1816 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1817 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1818 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1819 barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1820 if (ompt_enabled.ompt_callback_sync_region) {
1821 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1822 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1823 return_address);
1824 }
1825 if (ompt_enabled.ompt_callback_sync_region_wait) {
1826 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1827 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1828 return_address);
1829 }
1830#endif
1831 // It is OK to report the barrier state after the barrier begin callback.
1832 // According to the OMPT specification, a compliant implementation may
1833 // even delay reporting this state until the barrier begins to wait.
1834 auto *ompt_thr_info = &this_thr->th.ompt_thread_info;
1835 switch (barrier_kind) {
1836 case ompt_sync_region_barrier_explicit:
1837 ompt_thr_info->state = ompt_state_wait_barrier_explicit;
1838 break;
1839 case ompt_sync_region_barrier_implicit_workshare:
1840 ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare;
1841 break;
1842 case ompt_sync_region_barrier_implicit_parallel:
1843 ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel;
1844 break;
1845 case ompt_sync_region_barrier_teams:
1846 ompt_thr_info->state = ompt_state_wait_barrier_teams;
1847 break;
1848 case ompt_sync_region_barrier_implementation:
1849 [[fallthrough]];
1850 default:
1851 ompt_thr_info->state = ompt_state_wait_barrier_implementation;
1852 }
1853 }
1854#endif
1855
1856#if ENABLE_LIBOMPTARGET
1857 // Give an opportunity to the offload runtime to make progress and create
1858 // proxy tasks if necessary
1859 if (UNLIKELY(kmp_target_sync_cb != NULL))
1860 (*kmp_target_sync_cb)(
1861 NULL, gtid, KMP_TASKDATA_TO_TASK(this_thr->th.th_current_task), NULL);
1862#endif
1863
1864 if (!team->t.t_serialized) {
1865#if USE_ITT_BUILD
1866 // This value will be used in itt notify events below.
1867 void *itt_sync_obj = NULL;
1868#if USE_ITT_NOTIFY
1869 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1870 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1871#endif
1872#endif /* USE_ITT_BUILD */
1873 if (__kmp_tasking_mode == tskm_extra_barrier) {
1874 __kmp_tasking_barrier(team, this_thr, gtid);
1875 KA_TRACE(15,
1876 ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1877 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1878 }
1879
1880 /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1881 access it when the team struct is not guaranteed to exist. */
1882 // See note about the corresponding code in __kmp_join_barrier() being
1883 // performance-critical.
1884 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1885#if KMP_USE_MONITOR
1886 this_thr->th.th_team_bt_intervals =
1887 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1888 this_thr->th.th_team_bt_set =
1889 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1890#else
1891 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1892#endif
1893 }
1894
1895#if USE_ITT_BUILD
1896 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1897 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1898#endif /* USE_ITT_BUILD */
1899#if USE_DEBUGGER
1900 // Let the debugger know: the thread arrived at the barrier and is waiting.
1901 if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1902 team->t.t_bar[bt].b_master_arrived += 1;
1903 } else {
1904 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1905 } // if
1906#endif /* USE_DEBUGGER */
1907 if (reduce != NULL) {
1908 // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1909 this_thr->th.th_local.reduce_data = reduce_data;
1910 }
1911
1912 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1913 __kmp_task_team_setup(this_thr, team);
1914
1915 if (cancellable) {
1916 cancelled = __kmp_linear_barrier_gather_cancellable(
1917 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1918 } else {
1919 switch (__kmp_barrier_gather_pattern[bt]) {
1920 case bp_dist_bar: {
1921 __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
1922 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1923 break;
1924 }
1925 case bp_hyper_bar: {
1926 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1927 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1928 break;
1929 }
1930 case bp_hierarchical_bar: {
1931 __kmp_hierarchical_barrier_gather(
1932 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1933 break;
1934 }
1935 case bp_tree_bar: {
1936 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1937 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1938 break;
1939 }
1940 default: {
1941 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1942 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1943 }
1944 }
1945 }
1946
1947 KMP_MB();
1948
1949 if (KMP_MASTER_TID(tid)) {
1950 status = 0;
1951 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1952 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1953 }
1954#if USE_DEBUGGER
1955 // Let the debugger know: all threads have arrived and are starting to
1956 // leave the barrier.
1957 team->t.t_bar[bt].b_team_arrived += 1;
1958#endif
1959
1960 if (__kmp_omp_cancellation) {
1961 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1962 // Reset cancellation flag for worksharing constructs
1963 if (cancel_request == cancel_loop ||
1964 cancel_request == cancel_sections) {
1965 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1966 }
1967 }
1968#if USE_ITT_BUILD
1969 /* TODO: In case of split reduction barrier, primary thread may send
1970 acquired event early, before the final summation into the shared
1971 variable is done (final summation can be a long operation for array
1972 reductions). */
1973 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1974 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1975#endif /* USE_ITT_BUILD */
1976#if USE_ITT_BUILD && USE_ITT_NOTIFY
1977 // Barrier - report frame end (only if active_level == 1)
1978 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1979 __kmp_forkjoin_frames_mode &&
1980 (this_thr->th.th_teams_microtask == NULL || // either not in teams
1981 this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1982 team->t.t_active_level == 1) {
1983 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1984 kmp_uint64 cur_time = __itt_get_timestamp();
1985 kmp_info_t **other_threads = team->t.t_threads;
1986 int nproc = this_thr->th.th_team_nproc;
1987 int i;
1988 switch (__kmp_forkjoin_frames_mode) {
1989 case 1:
1990 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1991 loc, nproc);
1992 this_thr->th.th_frame_time = cur_time;
1993 break;
1994 case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1995 // be fixed)
1996 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1997 1, loc, nproc);
1998 break;
1999 case 3:
2000 if (__itt_metadata_add_ptr) {
2001 // Initialize with primary thread's wait time
2002 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
2003 // Set arrive time to zero to be able to check it in
2004 // __kmp_invoke_task(); the same is done inside the loop below
2005 this_thr->th.th_bar_arrive_time = 0;
2006 for (i = 1; i < nproc; ++i) {
2007 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2008 other_threads[i]->th.th_bar_arrive_time = 0;
2009 }
2010 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2011 cur_time, delta,
2012 (kmp_uint64)(reduce != NULL));
2013 }
2014 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2015 loc, nproc);
2016 this_thr->th.th_frame_time = cur_time;
2017 break;
2018 }
2019 }
2020#endif /* USE_ITT_BUILD */
2021 } else {
2022 status = 1;
2023#if USE_ITT_BUILD
2024 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2025 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2026#endif /* USE_ITT_BUILD */
2027 }
2028 if ((status == 1 || !is_split) && !cancelled) {
2029 if (cancellable) {
2030 cancelled = __kmp_linear_barrier_release_cancellable(
2031 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2032 } else {
2033 switch (__kmp_barrier_release_pattern[bt]) {
2034 case bp_dist_bar: {
2035 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2036 __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2037 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2038 break;
2039 }
2040 case bp_hyper_bar: {
2041 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2042 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2043 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2044 break;
2045 }
2046 case bp_hierarchical_bar: {
2047 __kmp_hierarchical_barrier_release(
2048 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2049 break;
2050 }
2051 case bp_tree_bar: {
2052 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2053 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2054 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2055 break;
2056 }
2057 default: {
2058 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2059 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2060 }
2061 }
2062 }
2063 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
2064 __kmp_task_team_sync(this_thr, team);
2065 }
2066 }
2067
2068#if USE_ITT_BUILD
2069 /* GEH: TODO: Move this under if-condition above and also include in
2070 __kmp_end_split_barrier(). This will more accurately represent the actual
2071 release time of the threads for split barriers. */
2072 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2073 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2074#endif /* USE_ITT_BUILD */
2075 } else { // Team is serialized.
2076 status = 0;
2077 if (__kmp_tasking_mode != tskm_immediate_exec) {
2078 if (this_thr->th.th_task_team != NULL) {
2079#if USE_ITT_NOTIFY
2080 void *itt_sync_obj = NULL;
2081 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2082 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
2083 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2084 }
2085#endif
2086
2087 KMP_DEBUG_ASSERT(
2088 this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
2089 this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
2090 TRUE);
2091 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2092 __kmp_task_team_setup(this_thr, team);
2093
2094#if USE_ITT_BUILD
2095 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2096 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2097#endif /* USE_ITT_BUILD */
2098 }
2099 }
2100 }
2101 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
2102 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
2103 __kmp_tid_from_gtid(gtid), status));
2104
2105#if OMPT_SUPPORT
2106 if (ompt_enabled.enabled) {
2107#if OMPT_OPTIONAL
2108 if (ompt_enabled.ompt_callback_sync_region_wait) {
2109 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2110 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2111 return_address);
2112 }
2113 if (ompt_enabled.ompt_callback_sync_region) {
2114 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2115 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2116 return_address);
2117 }
2118#endif
2119 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
2120 }
2121#endif
2122
2123 if (cancellable)
2124 return (int)cancelled;
2125 return status;
2126}
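// Tool-side sketch (hypothetical OMPT tool code, not part of the runtime): the
// ompt_callback_sync_region dispatch in __kmp_barrier_template() above invokes
// a registered callback with ompt_scope_begin before the wait and
// ompt_scope_end afterwards, using the kind from __ompt_get_barrier_kind().
#if OMPT_SUPPORT && OMPT_OPTIONAL
static void example_tool_sync_region(ompt_sync_region_t kind,
                                     ompt_scope_endpoint_t endpoint,
                                     ompt_data_t *parallel_data,
                                     ompt_data_t *task_data,
                                     const void *codeptr_ra) {
  if (kind == ompt_sync_region_barrier_explicit &&
      endpoint == ompt_scope_begin) {
    // e.g. start a per-thread timer keyed on codeptr_ra for this barrier
  }
}
#endif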
2127
2128// Returns 0 if primary thread, 1 if worker thread.
2129int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
2130 size_t reduce_size, void *reduce_data,
2131 void (*reduce)(void *, void *)) {
2132 return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
2133 reduce);
2134}
2135
2136#if defined(KMP_GOMP_COMPAT)
2137// Returns 1 if cancelled, 0 otherwise
2138int __kmp_barrier_gomp_cancel(int gtid) {
2139 if (__kmp_omp_cancellation) {
2140 int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
2141 0, NULL, NULL);
2142 if (cancelled) {
2143 int tid = __kmp_tid_from_gtid(gtid);
2144 kmp_info_t *this_thr = __kmp_threads[gtid];
2145 if (KMP_MASTER_TID(tid)) {
2146 // Primary thread does not need to revert anything
2147 } else {
2148 // Workers need to revert their private b_arrived flag
2149 this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
2150 KMP_BARRIER_STATE_BUMP;
2151 }
2152 }
2153 return cancelled;
2154 }
2155 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2156 return FALSE;
2157}
2158#endif
2159
2160void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
2161 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
2162 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
2163 KMP_DEBUG_ASSERT(bt < bs_last_barrier);
2164 int tid = __kmp_tid_from_gtid(gtid);
2165 kmp_info_t *this_thr = __kmp_threads[gtid];
2166 kmp_team_t *team = this_thr->th.th_team;
2167
2168 if (!team->t.t_serialized) {
2169 if (KMP_MASTER_GTID(gtid)) {
2170 switch (__kmp_barrier_release_pattern[bt]) {
2171 case bp_dist_bar: {
2172 __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2173 FALSE USE_ITT_BUILD_ARG(NULL));
2174 break;
2175 }
2176 case bp_hyper_bar: {
2177 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2178 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2179 FALSE USE_ITT_BUILD_ARG(NULL));
2180 break;
2181 }
2182 case bp_hierarchical_bar: {
2183 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
2184 FALSE USE_ITT_BUILD_ARG(NULL));
2185 break;
2186 }
2187 case bp_tree_bar: {
2188 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2189 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2190 FALSE USE_ITT_BUILD_ARG(NULL));
2191 break;
2192 }
2193 default: {
2194 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2195 FALSE USE_ITT_BUILD_ARG(NULL));
2196 }
2197 }
2198 if (__kmp_tasking_mode != tskm_immediate_exec) {
2199 __kmp_task_team_sync(this_thr, team);
2200 } // if
2201 }
2202 }
2203}
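// Usage sketch (hypothetical caller; the real reduction entry points live in
// the compiler-support layer): a split reduction barrier gathers the team,
// folding each worker's reduce_data into its parent and ultimately into the
// primary thread's copy, and the primary thread later releases the workers
// with __kmp_end_split_barrier().
static void example_add_int(void *lhs, void *rhs) {
  *(int *)lhs += *(int *)rhs; // combine one partial sum into the accumulator
}
static void example_split_reduction(int gtid, int *partial_sum) {
  if (__kmp_barrier(bs_plain_barrier, gtid, TRUE /* is_split */, sizeof(int),
                    partial_sum, example_add_int) == 0) {
    // Primary thread: *partial_sum now holds the combined value; publish it,
    // then release the workers still parked in the split barrier.
    __kmp_end_split_barrier(bs_plain_barrier, gtid);
  }
}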
2204
2205void __kmp_join_barrier(int gtid) {
2206 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
2207 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2208
2209 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
2210
2211 kmp_info_t *this_thr = __kmp_threads[gtid];
2212 kmp_team_t *team;
2213 int tid;
2214#ifdef KMP_DEBUG
2215 int team_id;
2216#endif /* KMP_DEBUG */
2217#if USE_ITT_BUILD
2218 void *itt_sync_obj = NULL;
2219#if USE_ITT_NOTIFY
2220 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
2221 // Get object created at fork_barrier
2222 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2223#endif
2224#endif /* USE_ITT_BUILD */
2225#if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
2226 int nproc = this_thr->th.th_team_nproc;
2227#endif
2228 KMP_MB();
2229
2230 // Get current info
2231 team = this_thr->th.th_team;
2232 KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
2233 tid = __kmp_tid_from_gtid(gtid);
2234#ifdef KMP_DEBUG
2235 team_id = team->t.t_id;
2236 kmp_info_t *master_thread = this_thr->th.th_team_master;
2237 if (master_thread != team->t.t_threads[0]) {
2238 __kmp_print_structure();
2239 }
2240#endif /* KMP_DEBUG */
2241 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
2242 KMP_MB();
2243
2244 // Verify state
2245 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
2246 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
2247 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
2248 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
2249 gtid, team_id, tid));
2250
2251#if OMPT_SUPPORT
2252 if (ompt_enabled.enabled) {
2253#if OMPT_OPTIONAL
2254 ompt_data_t *my_task_data;
2255 ompt_data_t *my_parallel_data;
2256 void *codeptr = NULL;
2257 int ds_tid = this_thr->th.th_info.ds.ds_tid;
2258 if (KMP_MASTER_TID(ds_tid) &&
2259 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2260 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2261 codeptr = team->t.ompt_team_info.master_return_address;
2262 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
2263 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
2264 ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2265 ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel;
2266 if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
2267 sync_kind = ompt_sync_region_barrier_teams;
2268 ompt_state = ompt_state_wait_barrier_teams;
2269 }
2270 if (ompt_enabled.ompt_callback_sync_region) {
2271 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2272 sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
2273 }
2274 if (ompt_enabled.ompt_callback_sync_region_wait) {
2275 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2276 sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
2277 }
2278 if (!KMP_MASTER_TID(ds_tid))
2279 this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
2280#endif
2281 this_thr->th.ompt_thread_info.state = ompt_state;
2282 }
2283#endif
2284
2285 if (__kmp_tasking_mode == tskm_extra_barrier) {
2286 __kmp_tasking_barrier(team, this_thr, gtid);
2287 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
2288 gtid, team_id, tid));
2289 }
2290#ifdef KMP_DEBUG
2291 if (__kmp_tasking_mode != tskm_immediate_exec) {
2292 KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
2293 "%p, th_task_team = %p\n",
2294 __kmp_gtid_from_thread(this_thr), team_id,
2295 team->t.t_task_team[this_thr->th.th_task_state],
2296 this_thr->th.th_task_team));
2297 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
2298 }
2299#endif /* KMP_DEBUG */
2300
2301 /* Copy the blocktime info to the thread, where __kmp_wait_template() can
2302 access it when the team struct is not guaranteed to exist. Doing these
2303 loads causes a cache miss and slows down EPCC parallel by 2x. As a workaround,
2304 we do not perform the copy if blocktime=infinite, since the values are not
2305 used by __kmp_wait_template() in that case. */
2306 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2307#if KMP_USE_MONITOR
2308 this_thr->th.th_team_bt_intervals =
2309 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2310 this_thr->th.th_team_bt_set =
2311 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2312#else
2313 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2314#endif
2315 }
2316
2317#if USE_ITT_BUILD
2318 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2319 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2320#endif /* USE_ITT_BUILD */
2321
2322 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
2323 case bp_dist_bar: {
2324 __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2325 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2326 break;
2327 }
2328 case bp_hyper_bar: {
2329 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2330 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2331 break;
2332 }
2333 case bp_hierarchical_bar: {
2334 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2335 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2336 break;
2337 }
2338 case bp_tree_bar: {
2339 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2340 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2341 break;
2342 }
2343 default: {
2344 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2345 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2346 }
2347 }
2348
2349 /* From this point on, the team data structure may be deallocated at any time
2350 by the primary thread - it is unsafe to reference it in any of the worker
2351 threads. Any per-team data items that need to be referenced before the
2352 end of the barrier should be moved to the kmp_task_team_t structs. */
2353 if (KMP_MASTER_TID(tid)) {
2354 if (__kmp_tasking_mode != tskm_immediate_exec) {
2355 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2356 }
2357 if (__kmp_display_affinity) {
2358 KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
2359 }
2360#if KMP_STATS_ENABLED
2361 // Have the primary thread flag the workers to indicate they are now waiting
2362 // for the next parallel region. Also wake them up so they switch their
2363 // timers to idle.
2364 for (int i = 0; i < team->t.t_nproc; ++i) {
2365 kmp_info_t *team_thread = team->t.t_threads[i];
2366 if (team_thread == this_thr)
2367 continue;
2368 team_thread->th.th_stats->setIdleFlag();
2369 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2370 team_thread->th.th_sleep_loc != NULL)
2371 __kmp_null_resume_wrapper(team_thread);
2372 }
2373#endif
2374#if USE_ITT_BUILD
2375 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2376 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2377#endif /* USE_ITT_BUILD */
2378
2379#if USE_ITT_BUILD && USE_ITT_NOTIFY
2380 // Join barrier - report frame end
2381 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2382 __kmp_forkjoin_frames_mode &&
2383 (this_thr->th.th_teams_microtask == NULL || // either not in teams
2384 this_thr->th.th_teams_size.nteams == 1) && // or inside single team
2385 team->t.t_active_level == 1) {
2386 kmp_uint64 cur_time = __itt_get_timestamp();
2387 ident_t *loc = team->t.t_ident;
2388 kmp_info_t **other_threads = team->t.t_threads;
2389 switch (__kmp_forkjoin_frames_mode) {
2390 case 1:
2391 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2392 loc, nproc);
2393 break;
2394 case 2:
2395 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
2396 loc, nproc);
2397 break;
2398 case 3:
2399 if (__itt_metadata_add_ptr) {
2400 // Initialize with primary thread's wait time
2401 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
2402 // Set arrive time to zero to be able to check it in
2403 // __kmp_invoke_task(); the same is done inside the loop below
2404 this_thr->th.th_bar_arrive_time = 0;
2405 for (int i = 1; i < nproc; ++i) {
2406 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2407 other_threads[i]->th.th_bar_arrive_time = 0;
2408 }
2409 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2410 cur_time, delta, 0);
2411 }
2412 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2413 loc, nproc);
2414 this_thr->th.th_frame_time = cur_time;
2415 break;
2416 }
2417 }
2418#endif /* USE_ITT_BUILD */
2419 }
2420#if USE_ITT_BUILD
2421 else {
2422 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2423 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2424 }
2425#endif /* USE_ITT_BUILD */
2426
2427#if KMP_DEBUG
2428 if (KMP_MASTER_TID(tid)) {
2429 KA_TRACE(
2430 15,
2431 ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
2432 gtid, team_id, tid, nproc));
2433 }
2434#endif /* KMP_DEBUG */
2435
2436 // TODO now, mark worker threads as done so they may be disbanded
2437 KMP_MB(); // Flush all pending memory write invalidates.
2438 KA_TRACE(10,
2439 ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
2440
2441}
2442
2443// TODO release worker threads' fork barriers as we are ready instead of all at
2444// once
2445void __kmp_fork_barrier(int gtid, int tid) {
2446 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
2447 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2448 kmp_info_t *this_thr = __kmp_threads[gtid];
2449 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
2450#if USE_ITT_BUILD
2451 void *itt_sync_obj = NULL;
2452#endif /* USE_ITT_BUILD */
2453#ifdef KMP_DEBUG
2454 if (team)
2455 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
2456 (team != NULL) ? team->t.t_id : -1, tid));
2457#endif
2458 // th_team pointer only valid for primary thread here
2459 if (KMP_MASTER_TID(tid)) {
2460#if USE_ITT_BUILD && USE_ITT_NOTIFY
2461 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2462 // Create itt barrier object
2463 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
2464 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
2465 }
2466#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2467
2468#ifdef KMP_DEBUG
2469 KMP_DEBUG_ASSERT(team);
2470 kmp_info_t **other_threads = team->t.t_threads;
2471 int i;
2472
2473 // Verify state
2474 KMP_MB();
2475
2476 for (i = 1; i < team->t.t_nproc; ++i) {
2477 KA_TRACE(500,
2478 ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
2479 "== %u.\n",
2480 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
2481 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
2482 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
2483 KMP_DEBUG_ASSERT(
2484 (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
2485 ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
2486 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
2487 }
2488#endif
2489
2490 if (__kmp_tasking_mode != tskm_immediate_exec)
2491 __kmp_task_team_setup(this_thr, team);
2492
2493 /* The primary thread may have changed its blocktime between join barrier
2494 and fork barrier. Copy the blocktime info to the thread, where
2495 __kmp_wait_template() can access it when the team struct is not
2496 guaranteed to exist. */
2497 // See note about the corresponding code in __kmp_join_barrier() being
2498 // performance-critical
2499 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2500#if KMP_USE_MONITOR
2501 this_thr->th.th_team_bt_intervals =
2502 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2503 this_thr->th.th_team_bt_set =
2504 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2505#else
2506 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2507#endif
2508 }
2509 } // primary thread
2510
2511 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
2512 case bp_dist_bar: {
2513 __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2514 TRUE USE_ITT_BUILD_ARG(NULL));
2515 break;
2516 }
2517 case bp_hyper_bar: {
2518 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2519 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2520 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2521 break;
2522 }
2523 case bp_hierarchical_bar: {
2524 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2525 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2526 break;
2527 }
2528 case bp_tree_bar: {
2529 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2530 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2531 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2532 break;
2533 }
2534 default: {
2535 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2536 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2537 }
2538 }
2539
2540#if OMPT_SUPPORT
2541 ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
2542 if (ompt_enabled.enabled &&
2543 (ompt_state == ompt_state_wait_barrier_teams ||
2544 ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
2545 int ds_tid = this_thr->th.th_info.ds.ds_tid;
2546 ompt_data_t *task_data = (team)
2547 ? OMPT_CUR_TASK_DATA(this_thr)
2548 : &(this_thr->th.ompt_thread_info.task_data);
2549 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2550#if OMPT_OPTIONAL
2551 void *codeptr = NULL;
2552 if (KMP_MASTER_TID(ds_tid) &&
2553 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2554 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2555 codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
2556 ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2557 if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
2558 sync_kind = ompt_sync_region_barrier_teams;
2559 if (ompt_enabled.ompt_callback_sync_region_wait) {
2560 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2561 sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2562 }
2563 if (ompt_enabled.ompt_callback_sync_region) {
2564 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2565 sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2566 }
2567#endif
2568 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2569 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2570 ompt_scope_end, NULL, task_data, 0, ds_tid,
2571 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2572 }
2573 }
2574#endif
2575
2576 // Early exit for reaping threads releasing forkjoin barrier
2577 if (TCR_4(__kmp_global.g.g_done)) {
2578 this_thr->th.th_task_team = NULL;
2579
2580#if USE_ITT_BUILD && USE_ITT_NOTIFY
2581 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2582 if (!KMP_MASTER_TID(tid)) {
2583 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2584 if (itt_sync_obj)
2585 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2586 }
2587 }
2588#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2589 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2590 return;
2591 }
2592
2593 /* We can now assume that a valid team structure has been allocated by the
2594 primary thread and propagated to all worker threads. The current thread,
2595 however, may not be part of the team, so we can't blindly assume that the
2596 team pointer is non-null. */
2597 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2598 KMP_DEBUG_ASSERT(team != NULL);
2599 tid = __kmp_tid_from_gtid(gtid);
2600
2601#if KMP_BARRIER_ICV_PULL
2602 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2603 __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2604 implicit task has this data before this function is called. We cannot
2605 modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
2606 thread struct, because it is not always the case that the threads arrays
2607 have been allocated when __kmp_fork_call() is executed. */
2608 {
2609 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2610 if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
2611 // Copy the initial ICVs from the primary thread's thread struct to the
2612 // implicit task for this tid.
2613 KA_TRACE(10,
2614 ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2615 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2616 tid, FALSE);
2617 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2618 &team->t.t_threads[0]
2619 ->th.th_bar[bs_forkjoin_barrier]
2620 .bb.th_fixed_icvs);
2621 }
2622 }
2623#endif // KMP_BARRIER_ICV_PULL
2624
2625 if (__kmp_tasking_mode != tskm_immediate_exec) {
2626 __kmp_task_team_sync(this_thr, team);
2627 }
2628
2629#if KMP_AFFINITY_SUPPORTED
2630 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2631 if (proc_bind == proc_bind_intel) {
2632 // Call dynamic affinity settings
2633 if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
2634 __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2635 }
2636 } else if (proc_bind != proc_bind_false) {
2637 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2638 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2639 __kmp_gtid_from_thread(this_thr),
2640 this_thr->th.th_current_place));
2641 } else {
2642 __kmp_affinity_bind_place(gtid);
2643 }
2644 }
2645#endif // KMP_AFFINITY_SUPPORTED
2646 // Perform the display affinity functionality
2647 if (__kmp_display_affinity) {
2648 if (team->t.t_display_affinity
2649#if KMP_AFFINITY_SUPPORTED
2650 || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
2651#endif
2652 ) {
2653 // NULL means use the affinity-format-var ICV
2654 __kmp_aux_display_affinity(gtid, NULL);
2655 this_thr->th.th_prev_num_threads = team->t.t_nproc;
2656 this_thr->th.th_prev_level = team->t.t_level;
2657 }
2658 }
2659 if (!KMP_MASTER_TID(tid))
2660 KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2661
2662#if USE_ITT_BUILD && USE_ITT_NOTIFY
2663 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2664 if (!KMP_MASTER_TID(tid)) {
2665 // Get correct barrier object
2666 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2667 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2668 } // (prepare called inside barrier_release)
2669 }
2670#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2671 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2672 team->t.t_id, tid));
2673}
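// Simplified sketch of how the fork and join barriers above bracket a worker
// thread's life cycle (loosely modeled on the worker loop in kmp_runtime.cpp;
// team lookup, task-team handling and OMPT bookkeeping are omitted):
static void example_worker_loop(int gtid, int tid) {
  while (!TCR_4(__kmp_global.g.g_done)) {
    // Park in the fork barrier until the primary thread forks a new region.
    __kmp_fork_barrier(gtid, tid);
    if (TCR_4(__kmp_global.g.g_done))
      break; // runtime shutdown: leave without touching team data
    // ... run the implicit task of the new parallel region here ...
    // Signal completion; after this point the primary thread may reuse or
    // free the team, so the worker must not reference it again.
    __kmp_join_barrier(gtid);
  }
}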
2674
2675void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2676 kmp_internal_control_t *new_icvs, ident_t *loc) {
2677 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2678
2679 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2680 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2681
2682/* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2683 __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2684 implicit task has this data before this function is called. */
2685#if KMP_BARRIER_ICV_PULL
2686 /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
2687 remains untouched), where all of the worker threads can access them and
2688 make their own copies after the barrier. */
2689 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2690 // allocated at this point
2691 copy_icvs(
2692 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2693 new_icvs);
2694 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2695 team->t.t_threads[0], team));
2696#elif KMP_BARRIER_ICV_PUSH
2697 // The ICVs will be propagated in the fork barrier, so nothing needs to be
2698 // done here.
2699 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2700 team->t.t_threads[0], team));
2701#else
2702 // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
2703 // time.
2704 ngo_load(new_icvs);
2705 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2706 // allocated at this point
2707 for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
2708 // TODO: GEH - pass in better source location info since usually NULL here
2709 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2710 f, team->t.t_threads[f], team));
2711 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2712 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2713 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2714 f, team->t.t_threads[f], team));
2715 }
2716 ngo_sync();
2717#endif // KMP_BARRIER_ICV_PULL
2718}
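// Call-site sketch (hypothetical; the real callers sit in the team setup code
// of kmp_runtime.cpp): the primary thread stages the ICVs with
// __kmp_setup_icv_copy() before the workers leave the fork barrier, and the
// fork barrier then applies whichever propagation scheme (PULL, PUSH, or the
// linear copy above) was selected at compile time.
static void example_stage_icvs(kmp_team_t *team, int nproc,
                               kmp_internal_control_t *icvs, ident_t *loc) {
  __kmp_setup_icv_copy(team, nproc, icvs, loc);
  // ... later, __kmp_fork_barrier() releases the workers, which pick up the
  // staged ICVs into their implicit tasks ...
}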