1/*
2 * kmp_barrier.cpp
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp_wait_release.h"
14#include "kmp_barrier.h"
15#include "kmp_itt.h"
16#include "kmp_os.h"
17#include "kmp_stats.h"
18#include "ompt-specific.h"
19// for distributed barrier
20#include "kmp_affinity.h"
21
22#if KMP_MIC
23#include <immintrin.h>
24#define USE_NGO_STORES 1
25#endif // KMP_MIC
26
27#if KMP_MIC && USE_NGO_STORES
28// ICV copying
29#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33#else
34#define ngo_load(src) ((void)0)
35#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37#define ngo_sync() ((void)0)
38#endif /* KMP_MIC && USE_NGO_STORES */
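// Illustrative sketch (not part of kmp_barrier.cpp): on non-KMP_MIC builds the
// ngo_* macros above degrade to ordinary copies. The standalone example below
// uses a hypothetical icv_set type to show the "load the source once, then fan
// the same payload out to every worker slot" pattern the macros wrap; all names
// here are invented for illustration only.
#include <cstring>
#include <vector>

struct icv_set { // hypothetical stand-in for kmp_internal_control_t
  int nthreads_var;
  int dynamic;
  int blocktime;
};

// Fallback-style helper mirroring what ngo_store_icvs reduces to off-MIC.
static inline void demo_copy_icvs(icv_set *dst, const icv_set *src) {
  std::memcpy(dst, src, sizeof(icv_set)); // plain copy, like copy_icvs()
}

static void demo_push_icvs(const icv_set &primary, std::vector<icv_set> &workers) {
  // "ngo_load": read the source once...
  const icv_set snapshot = primary;
  // ..."ngo_store_icvs": write the same cached payload to each destination.
  for (icv_set &w : workers)
    demo_copy_icvs(&w, &snapshot);
  // "ngo_sync" would be a memory fence on MIC; it is a no-op here.
}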
39
40void __kmp_print_structure(void); // Forward declaration
41
42// ---------------------------- Barrier Algorithms ----------------------------
43// Distributed barrier
44
45// Compute how many threads to have polling each cache-line.
46// We want to limit the number of writes to IDEAL_GO_RESOLUTION.
47void distributedBarrier::computeVarsForN(size_t n) {
48 int nsockets = 1;
49 if (__kmp_topology) {
50 int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
51 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
52 int ncores_per_socket =
53 __kmp_topology->calculate_ratio(core_level, socket_level);
54 nsockets = __kmp_topology->get_count(socket_level);
55
56 if (nsockets <= 0)
57 nsockets = 1;
58 if (ncores_per_socket <= 0)
59 ncores_per_socket = 1;
60
61 threads_per_go = ncores_per_socket >> 1;
62 if (!fix_threads_per_go) {
63 // Minimize num_gos
64 if (threads_per_go > 4) {
67 }
68 if (threads_per_go > 4 && nsockets == 1)
70 }
71 }
72 if (threads_per_go == 0)
73 threads_per_go = 1;
74 fix_threads_per_go = true;
75 num_gos = n / threads_per_go;
76 if (n % threads_per_go)
77 num_gos++;
78 if (nsockets == 1 || num_gos == 1)
79 num_groups = 1;
80 else {
81 num_groups = num_gos / nsockets;
82 if (num_gos % nsockets)
83 num_groups++;
84 }
85 if (num_groups <= 0)
86 num_groups = 1;
87 gos_per_group = num_gos / num_groups;
88 if (num_gos % num_groups)
89 gos_per_group++;
90 threads_per_group = threads_per_go * gos_per_group;
91 } else {
92 num_gos = n / threads_per_go;
93 if (n % threads_per_go)
94 num_gos++;
95 if (num_gos == 1)
96 num_groups = 1;
97 else {
98 num_groups = num_gos / 2;
99 if (num_gos % 2)
100 num_groups++;
101 }
102 gos_per_group = num_gos / num_groups;
103 if (num_gos % num_groups)
104 gos_per_group++;
105 threads_per_group = threads_per_go * gos_per_group;
106 }
107}
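// Illustrative sketch (not part of kmp_barrier.cpp): a standalone mirror of the
// grouping arithmetic above, showing how n threads are carved into "go" flags
// (threads_per_go threads polling one cache line) and groups of go flags.
// demo_grouping() and its struct are hypothetical; only the ceiling-division
// scheme is taken from the code above.
#include <cstdio>
#include <cstddef>

struct demo_layout {
  std::size_t threads_per_go, num_gos, num_groups, gos_per_group, threads_per_group;
};

static demo_layout demo_grouping(std::size_t n, std::size_t threads_per_go,
                                 std::size_t nsockets) {
  demo_layout L;
  L.threads_per_go = threads_per_go;
  L.num_gos = n / threads_per_go + (n % threads_per_go ? 1 : 0); // ceil(n / threads_per_go)
  if (nsockets == 1 || L.num_gos == 1)
    L.num_groups = 1;
  else
    L.num_groups = L.num_gos / nsockets + (L.num_gos % nsockets ? 1 : 0);
  L.gos_per_group = L.num_gos / L.num_groups + (L.num_gos % L.num_groups ? 1 : 0);
  L.threads_per_group = L.threads_per_go * L.gos_per_group;
  return L;
}

// Example: 96 threads, 8 threads per go flag, 2 sockets
// -> 12 go flags, 6 groups, 2 go flags per group, 16 threads per group.
static void demo_print_grouping() {
  demo_layout L = demo_grouping(96, 8, 2);
  std::printf("gos=%zu groups=%zu gos/group=%zu threads/group=%zu\n",
              L.num_gos, L.num_groups, L.gos_per_group, L.threads_per_group);
}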
108
109void distributedBarrier::computeGo(size_t n) {
110 // Minimize num_gos
111 for (num_gos = 1;; num_gos++)
112 if (IDEAL_CONTENTION * num_gos >= n)
113 break;
114 threads_per_go = n / num_gos;
115 if (n % num_gos)
116 threads_per_go++;
117 while (num_gos > MAX_GOS) {
118 threads_per_go++;
119 num_gos = n / threads_per_go;
120 if (n % threads_per_go)
121 num_gos++;
122 }
123 computeVarsForN(n);
124}
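// Illustrative sketch (not part of kmp_barrier.cpp): the loop above picks the
// smallest num_gos such that IDEAL_CONTENTION * num_gos >= n, i.e. it bounds
// how many threads share one go flag. The contention value below is an assumed
// number for the example only; the real IDEAL_CONTENTION / MAX_GOS constants
// live with the distributedBarrier class, not here.
#include <cstddef>

static std::size_t demo_compute_gos(std::size_t n,
                                    std::size_t ideal_contention /* assumed */) {
  std::size_t num_gos = 1;
  while (ideal_contention * num_gos < n) // smallest count keeping contention bounded
    ++num_gos;
  return num_gos;
}
// e.g. with an assumed ideal_contention of 16: n = 100 -> 7 go flags,
// so threads_per_go becomes ceil(100 / 7) = 15 threads per polled cache line.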
125
126 // This function resizes the barrier arrays when the new number of threads
127 // exceeds max_threads, which is the current size of all the arrays
128 void distributedBarrier::resize(size_t nthr) {
129 KMP_DEBUG_ASSERT(nthr > max_threads);
130
131 // expand to requested size * 2
132 max_threads = nthr * 2;
133
134 // allocate arrays to new max threads
135 for (int i = 0; i < MAX_ITERS; ++i) {
136 if (flags[i])
137 flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
138 max_threads * sizeof(flags_s));
139 else
140 flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
141 }
142
143 if (go)
144 go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
145 else
146 go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
147
148 if (iter)
149 iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
150 else
151 iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
152
153 if (sleep)
154 sleep =
155 (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
156 else
157 sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
158}
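// Illustrative sketch (not part of kmp_barrier.cpp): resize() above grows every
// per-thread array to twice the requested size so repeated small team growth
// does not trigger a reallocation each time. The standalone helper below shows
// the same doubling policy with malloc/realloc; demo_grow() is hypothetical.
#include <cstdlib>
#include <cstddef>

template <typename T>
static T *demo_grow(T *buf, std::size_t &capacity, std::size_t needed) {
  if (needed <= capacity)
    return buf; // still fits, no reallocation
  capacity = needed * 2; // over-allocate, like max_threads = nthr * 2
  void *p = buf ? std::realloc(buf, capacity * sizeof(T))
                : std::malloc(capacity * sizeof(T));
  return static_cast<T *>(p); // a real caller must check for NULL
}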
159
160 // This function sets all the go flags that threads might be waiting
161 // on; when blocktime is not infinite, it should be followed by a wake-up
162 // call to each thread
163 kmp_uint64 distributedBarrier::go_release() {
165 for (size_t j = 0; j < num_gos; j++) {
166 go[j].go.store(next_go);
167 }
168 return next_go;
169}
170
171 void distributedBarrier::go_reset() {
172 for (size_t j = 0; j < max_threads; ++j) {
173 for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
174 flags[i][j].stillNeed = 1;
175 }
176 go[j].go.store(0);
177 iter[j].iter = 0;
178 }
179}
180
181// This function inits/re-inits the distributed barrier for a particular number
182// of threads. If a resize of arrays is needed, it calls the resize function.
183void distributedBarrier::init(size_t nthr) {
184 size_t old_max = max_threads;
185 if (nthr > max_threads) { // need more space in arrays
186 resize(nthr);
187 }
188
189 for (size_t i = 0; i < max_threads; i++) {
190 for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
191 flags[j][i].stillNeed = 1;
192 }
193 go[i].go.store(0);
194 iter[i].iter = 0;
195 if (i >= old_max)
196 sleep[i].sleep = false;
197 }
198
199 // Recalculate num_gos, etc. based on new nthr
200 computeVarsForN(nthr);
201
202 num_threads = nthr;
203
204 if (team_icvs == NULL)
205 team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
206 }
207
208// This function is used only when KMP_BLOCKTIME is not infinite.
209 // static
210 void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
211 size_t start, size_t stop, size_t inc,
212 size_t tid) {
214 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
215 return;
216
217 kmp_info_t **other_threads = team->t.t_threads;
218 for (size_t thr = start; thr < stop; thr += inc) {
219 KMP_DEBUG_ASSERT(other_threads[thr]);
220 int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
221 // Wake up the worker regardless of whether it appears to be sleeping or not
222 __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
223 }
224}
225
226 static void __kmp_dist_barrier_gather(
227 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
228 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
229 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
230 kmp_team_t *team;
231 distributedBarrier *b;
232 kmp_info_t **other_threads;
233 kmp_uint64 my_current_iter, my_next_iter;
234 kmp_uint32 nproc;
235 bool group_leader;
236
237 team = this_thr->th.th_team;
238 nproc = this_thr->th.th_team_nproc;
239 other_threads = team->t.t_threads;
240 b = team->t.b;
241 my_current_iter = b->iter[tid].iter;
242 my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
243 group_leader = ((tid % b->threads_per_group) == 0);
244
245 KA_TRACE(20,
246 ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
247 gtid, team->t.t_id, tid, bt));
248
249#if USE_ITT_BUILD && USE_ITT_NOTIFY
250 // Barrier imbalance - save arrive time to the thread
251 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
252 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
253 __itt_get_timestamp();
254 }
255#endif
256
257 if (group_leader) {
258 // Start from the thread after the group leader
259 size_t group_start = tid + 1;
260 size_t group_end = tid + b->threads_per_group;
261 size_t threads_pending = 0;
262
263 if (group_end > nproc)
264 group_end = nproc;
265 do { // wait for threads in my group
266 threads_pending = 0;
267 // Check all the flags every time to avoid branch mispredict
268 for (size_t thr = group_start; thr < group_end; thr++) {
269 // Each thread uses a different cache line
270 threads_pending += b->flags[my_current_iter][thr].stillNeed;
271 }
272 // Execute tasks here
274 kmp_task_team_t *task_team = this_thr->th.th_task_team;
275 if (task_team != NULL) {
276 if (TCR_SYNC_4(task_team->tt.tt_active)) {
277 if (KMP_TASKING_ENABLED(task_team)) {
278 int tasks_completed = FALSE;
280 this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
281 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
282 } else
283 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
284 }
285 } else {
286 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
287 } // if
288 }
289 if (TCR_4(__kmp_global.g.g_done)) {
290 if (__kmp_global.g.g_abort)
292 break;
294 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
295 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
296 }
297 } while (threads_pending > 0);
298
299 if (reduce) { // Perform reduction if needed
300 OMPT_REDUCTION_DECL(this_thr, gtid);
302 // Group leader reduces all threads in group
303 for (size_t thr = group_start; thr < group_end; thr++) {
304 (*reduce)(this_thr->th.th_local.reduce_data,
305 other_threads[thr]->th.th_local.reduce_data);
306 }
308 }
309
310 // Set flag for next iteration
311 b->flags[my_next_iter][tid].stillNeed = 1;
312 // Each thread uses a different cache line; resets stillNeed to 0 to
313 // indicate it has reached the barrier
314 b->flags[my_current_iter][tid].stillNeed = 0;
315
316 do { // wait for all group leaders
317 threads_pending = 0;
318 for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
319 threads_pending += b->flags[my_current_iter][thr].stillNeed;
320 }
321 // Execute tasks here
323 kmp_task_team_t *task_team = this_thr->th.th_task_team;
324 if (task_team != NULL) {
325 if (TCR_SYNC_4(task_team->tt.tt_active)) {
326 if (KMP_TASKING_ENABLED(task_team)) {
327 int tasks_completed = FALSE;
329 this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
330 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
331 } else
332 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
333 }
334 } else {
335 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
336 } // if
337 }
338 if (TCR_4(__kmp_global.g.g_done)) {
339 if (__kmp_global.g.g_abort)
341 break;
343 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
344 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
345 }
346 } while (threads_pending > 0);
347
348 if (reduce) { // Perform reduction if needed
349 if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
350 OMPT_REDUCTION_DECL(this_thr, gtid);
352 for (size_t thr = b->threads_per_group; thr < nproc;
353 thr += b->threads_per_group) {
354 (*reduce)(this_thr->th.th_local.reduce_data,
355 other_threads[thr]->th.th_local.reduce_data);
356 }
358 }
359 }
360 } else {
361 // Set flag for next iteration
362 b->flags[my_next_iter][tid].stillNeed = 1;
363 // Each thread uses a different cache line; resets stillNeed to 0 to
364 // indicate it has reached the barrier
365 b->flags[my_current_iter][tid].stillNeed = 0;
366 }
367
368 KMP_MFENCE();
369
370 KA_TRACE(20,
371 ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
372 gtid, team->t.t_id, tid, bt));
373}
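// Illustrative sketch (not part of kmp_barrier.cpp): the gather above is a
// two-level check-in. Workers clear a per-thread stillNeed flag; each group
// leader spins until every member of its group has checked in, checks in
// itself, and then every group leader spins until all group leaders have
// checked in. The standalone model below keeps only that flag protocol (no
// tasking, ITT, reductions, or iteration recycling); all names are hypothetical.
#include <atomic>
#include <cstddef>
#include <vector>

struct demo_dist_gather {
  std::vector<std::atomic<int>> still_need; // one padded flag per thread in the real code
  std::size_t threads_per_group;

  demo_dist_gather(std::size_t nthr, std::size_t tpg)
      : still_need(nthr), threads_per_group(tpg) {
    for (auto &f : still_need)
      f.store(1, std::memory_order_relaxed);
  }

  void spin_until_zero(std::size_t start, std::size_t stop, std::size_t step) {
    std::size_t pending;
    do { // re-check every flag each pass, as the real code does
      pending = 0;
      for (std::size_t t = start; t < stop; t += step)
        pending += still_need[t].load(std::memory_order_acquire);
    } while (pending > 0);
  }

  void arrive(std::size_t tid) {
    const std::size_t n = still_need.size();
    const bool leader = (tid % threads_per_group) == 0;
    if (leader) {
      // 1) Wait until every member of my group has checked in.
      std::size_t end = tid + threads_per_group < n ? tid + threads_per_group : n;
      spin_until_zero(tid + 1, end, 1);
    }
    // 2) Check in myself (workers do only this step).
    still_need[tid].store(0, std::memory_order_release);
    if (leader) {
      // 3) Every group leader then waits for all group leaders to check in.
      spin_until_zero(0, n, threads_per_group);
    }
  }
};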
374
375 static void __kmp_dist_barrier_release(
376 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
377 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
378 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
379 kmp_team_t *team;
380 distributedBarrier *b;
381 kmp_bstate_t *thr_bar;
382 kmp_uint64 my_current_iter, next_go;
383 size_t my_go_index;
384 bool group_leader;
385
386 KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
387 gtid, tid, bt));
388
389 thr_bar = &this_thr->th.th_bar[bt].bb;
390
391 if (!KMP_MASTER_TID(tid)) {
392 // workers and non-master group leaders need to check their presence in team
393 do {
394 if (this_thr->th.th_used_in_team.load() != 1 &&
395 this_thr->th.th_used_in_team.load() != 3) {
396 // Thread is not in use in a team. Wait on location in tid's thread
397 // struct. The 0 value tells anyone looking that this thread is spinning
398 // or sleeping until this location becomes 3 again; 3 is the transition
399 // state to get to 1 which is waiting on go and being in the team
400 kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
401 if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
402 0) ||
403 this_thr->th.th_used_in_team.load() == 0) {
404 my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
405 }
406#if USE_ITT_BUILD && USE_ITT_NOTIFY
407 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
408 // In fork barrier where we could not get the object reliably
409 itt_sync_obj =
410 __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
411 // Cancel wait on previous parallel region...
412 __kmp_itt_task_starting(itt_sync_obj);
413
414 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
415 return;
416
417 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
418 if (itt_sync_obj != NULL)
419 // Call prepare as early as possible for "new" barrier
420 __kmp_itt_task_finished(itt_sync_obj);
421 } else
422#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
423 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
424 return;
425 }
426 if (this_thr->th.th_used_in_team.load() != 1 &&
427 this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
428 continue;
429 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
430 return;
431
432 // At this point, the thread thinks it is in use in a team, or in
433 // transition to be used in a team, but it might have reached this barrier
434 // before it was marked unused by the team. Unused threads are awoken and
435 // shifted to wait on local thread struct elsewhere. It also might reach
436 // this point by being picked up for use by a different team. Either way,
437 // we need to update the tid.
438 tid = __kmp_tid_from_gtid(gtid);
439 team = this_thr->th.th_team;
440 KMP_DEBUG_ASSERT(tid >= 0);
441 KMP_DEBUG_ASSERT(team);
442 b = team->t.b;
443 my_current_iter = b->iter[tid].iter;
444 next_go = my_current_iter + distributedBarrier::MAX_ITERS;
445 my_go_index = tid / b->threads_per_go;
446 if (this_thr->th.th_used_in_team.load() == 3) {
447 KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3, 1);
448 }
449 // Check if go flag is set
450 if (b->go[my_go_index].go.load() != next_go) {
451 // Wait on go flag on team
453 &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
454 my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
455 KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
456 b->iter[tid].iter == 0);
457 KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
458 }
459
460 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
461 return;
462 // At this point, the thread's go location was set. This means the primary
463 // thread is safely in the barrier, and so this thread's data is
464 // up-to-date, but we should check again that this thread is really in
465 // use in the team, as it could have been woken up for the purpose of
466 // changing team size, or reaping threads at shutdown.
467 if (this_thr->th.th_used_in_team.load() == 1)
468 break;
469 } while (1);
470
471 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
472 return;
473
474 group_leader = ((tid % b->threads_per_group) == 0);
475 if (group_leader) {
476 // Tell all the threads in my group they can go!
477 for (size_t go_idx = my_go_index + 1;
478 go_idx < my_go_index + b->gos_per_group; go_idx++) {
479 b->go[go_idx].go.store(next_go);
480 }
481 // Fence added so that workers can see changes to go. sfence inadequate.
482 KMP_MFENCE();
483 }
484
485#if KMP_BARRIER_ICV_PUSH
486 if (propagate_icvs) { // copy ICVs to final dest
487 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
488 tid, FALSE);
489 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
490 (kmp_internal_control_t *)team->t.b->team_icvs);
491 copy_icvs(&thr_bar->th_fixed_icvs,
492 &team->t.t_implicit_task_taskdata[tid].td_icvs);
493 }
494#endif
495 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
496 // This thread is now awake and participating in the barrier;
497 // wake up the other threads in the group
498 size_t nproc = this_thr->th.th_team_nproc;
499 size_t group_end = tid + b->threads_per_group;
500 if (nproc < group_end)
501 group_end = nproc;
502 __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
503 }
504 } else { // Primary thread
505 team = this_thr->th.th_team;
506 b = team->t.b;
507 my_current_iter = b->iter[tid].iter;
508 next_go = my_current_iter + distributedBarrier::MAX_ITERS;
509#if KMP_BARRIER_ICV_PUSH
510 if (propagate_icvs) {
511 // primary thread has ICVs in final destination; copy
512 copy_icvs(&thr_bar->th_fixed_icvs,
513 &team->t.t_implicit_task_taskdata[tid].td_icvs);
514 }
515#endif
516 // Tell all the group leaders they can go!
517 for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
518 b->go[go_idx].go.store(next_go);
519 }
520
521 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
522 // Wake-up the group leaders
523 size_t nproc = this_thr->th.th_team_nproc;
524 __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
525 b->threads_per_group, tid);
526 }
527
528 // Tell all the threads in my group they can go!
529 for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
530 b->go[go_idx].go.store(next_go);
531 }
532
533 // Fence added so that workers can see changes to go. sfence inadequate.
534 KMP_MFENCE();
535
536 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
537 // Wake-up the other threads in my group
538 size_t nproc = this_thr->th.th_team_nproc;
539 size_t group_end = tid + b->threads_per_group;
540 if (nproc < group_end)
541 group_end = nproc;
542 __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
543 }
544 }
545 // Update to next iteration
546 KMP_ASSERT(my_current_iter == b->iter[tid].iter);
547 b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
548
549 KA_TRACE(
550 20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
551 gtid, team->t.t_id, tid, bt));
552}
553
554// Linear Barrier
555template <bool cancellable = false>
556 static bool __kmp_linear_barrier_gather_template(
557 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
558 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
559 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
560 kmp_team_t *team = this_thr->th.th_team;
561 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
562 kmp_info_t **other_threads = team->t.t_threads;
563
564 KA_TRACE(
565 20,
566 ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
567 gtid, team->t.t_id, tid, bt));
568 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
569
570#if USE_ITT_BUILD && USE_ITT_NOTIFY
571 // Barrier imbalance - save arrive time to the thread
572 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
573 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
574 __itt_get_timestamp();
575 }
576#endif
577 // We now perform a linear reduction to signal that all of the threads have
578 // arrived.
579 if (!KMP_MASTER_TID(tid)) {
580 KA_TRACE(20,
581 ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
582 "arrived(%p): %llu => %llu\n",
583 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
584 team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
585 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
586 // Mark arrival to primary thread
587 /* After performing this write, a worker thread may not assume that the team
588 is valid any more - it could be deallocated by the primary thread at any
589 time. */
590 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
591 flag.release();
592 } else {
593 kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
594 int nproc = this_thr->th.th_team_nproc;
595 int i;
596 // Don't have to worry about sleep bit here or atomic since team setting
597 kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
598
599 // Collect all the worker team member threads.
600 for (i = 1; i < nproc; ++i) {
601#if KMP_CACHE_MANAGE
602 // Prefetch next thread's arrived count
603 if (i + 1 < nproc)
604 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
605#endif /* KMP_CACHE_MANAGE */
606 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
607 "arrived(%p) == %llu\n",
608 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
609 team->t.t_id, i,
610 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
611
612 // Wait for worker thread to arrive
613 if (cancellable) {
615 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
616 if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
617 return true;
618 } else {
619 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
620 new_state);
621 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
622 }
623#if USE_ITT_BUILD && USE_ITT_NOTIFY
624 // Barrier imbalance - write min of the thread time and the other thread
625 // time to the thread.
626 if (__kmp_forkjoin_frames_mode == 2) {
627 this_thr->th.th_bar_min_time = KMP_MIN(
628 this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
629 }
630#endif
631 if (reduce) {
632 KA_TRACE(100,
633 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
634 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
635 team->t.t_id, i));
636 OMPT_REDUCTION_DECL(this_thr, gtid);
638 (*reduce)(this_thr->th.th_local.reduce_data,
639 other_threads[i]->th.th_local.reduce_data);
641 }
642 }
643 // Don't have to worry about sleep bit here or atomic since team setting
644 team_bar->b_arrived = new_state;
645 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
646 "arrived(%p) = %llu\n",
647 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
648 new_state));
649 }
650 KA_TRACE(
651 20,
652 ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
653 gtid, team->t.t_id, tid, bt));
654 return false;
655}
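// Illustrative sketch (not part of kmp_barrier.cpp): in the linear gather above
// each worker bumps its own b_arrived flag and the primary waits on each
// worker's flag in turn, optionally folding in a reduction. The single-use
// model below keeps just that shape with std::atomic; every name is
// hypothetical and the reduction is a plain integer sum.
#include <atomic>
#include <cstddef>
#include <vector>

struct demo_linear_gather {
  static constexpr unsigned long long kArrived = 1; // stands in for b_arrived + KMP_BARRIER_STATE_BUMP
  std::vector<std::atomic<unsigned long long>> arrived; // one flag per thread

  explicit demo_linear_gather(std::size_t nthr) : arrived(nthr) {
    for (auto &a : arrived)
      a.store(0, std::memory_order_relaxed);
  }

  // Worker side: signal arrival; after this store the worker must not touch
  // shared team state, mirroring the comment in the real code.
  void worker_arrive(std::size_t tid) {
    arrived[tid].store(kArrived, std::memory_order_release);
  }

  // Primary side: wait for workers 1..n-1 one by one, reducing as they land.
  long primary_gather(const std::vector<long> &partials) {
    long sum = partials[0];
    for (std::size_t i = 1; i < arrived.size(); ++i) {
      while (arrived[i].load(std::memory_order_acquire) != kArrived) {
        // spin; the runtime would also execute tasks or go to sleep here
      }
      sum += partials[i]; // reduction hook, like (*reduce)(...)
    }
    return sum;
  }
};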
656
657template <bool cancellable = false>
658 static bool __kmp_linear_barrier_release_template(
659 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
660 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
661 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
662 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
663 kmp_team_t *team;
664
665 if (KMP_MASTER_TID(tid)) {
666 unsigned int i;
667 kmp_uint32 nproc = this_thr->th.th_team_nproc;
668 kmp_info_t **other_threads;
669
670 team = __kmp_threads[gtid]->th.th_team;
671 KMP_DEBUG_ASSERT(team != NULL);
672 other_threads = team->t.t_threads;
673
674 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
675 "barrier type %d\n",
676 gtid, team->t.t_id, tid, bt));
677
678 if (nproc > 1) {
679#if KMP_BARRIER_ICV_PUSH
680 {
682 if (propagate_icvs) {
683 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
684 for (i = 1; i < nproc; ++i) {
685 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
686 team, i, FALSE);
687 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
688 &team->t.t_implicit_task_taskdata[0].td_icvs);
689 }
690 ngo_sync();
691 }
692 }
693#endif // KMP_BARRIER_ICV_PUSH
694
695 // Now, release all of the worker threads
696 for (i = 1; i < nproc; ++i) {
697#if KMP_CACHE_MANAGE
698 // Prefetch next thread's go flag
699 if (i + 1 < nproc)
700 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
701#endif /* KMP_CACHE_MANAGE */
702 KA_TRACE(
703 20,
704 ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
705 "go(%p): %u => %u\n",
706 gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
707 team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
708 other_threads[i]->th.th_bar[bt].bb.b_go,
709 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
710 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
711 other_threads[i]);
712 flag.release();
713 }
714 }
715 } else { // Wait for the PRIMARY thread to release us
716 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
717 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
718 if (cancellable) {
720 if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
721 return true;
722 } else {
723 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
724 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
725 }
726#if USE_ITT_BUILD && USE_ITT_NOTIFY
727 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
728 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
729 // disabled)
730 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
731 // Cancel wait on previous parallel region...
732 __kmp_itt_task_starting(itt_sync_obj);
733
734 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
735 return false;
736
737 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
738 if (itt_sync_obj != NULL)
739 // Call prepare as early as possible for "new" barrier
740 __kmp_itt_task_finished(itt_sync_obj);
741 } else
742#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
743 // Early exit for reaping threads releasing forkjoin barrier
744 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
745 return false;
746// The worker thread may now assume that the team is valid.
747#ifdef KMP_DEBUG
748 tid = __kmp_tid_from_gtid(gtid);
749 team = __kmp_threads[gtid]->th.th_team;
750#endif
751 KMP_DEBUG_ASSERT(team != NULL);
752 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
753 KA_TRACE(20,
754 ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
755 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
756 KMP_MB(); // Flush all pending memory write invalidates.
757 }
758 KA_TRACE(
759 20,
760 ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
761 gtid, team->t.t_id, tid, bt));
762 return false;
763}
764
765 static void __kmp_linear_barrier_gather(
766 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
767 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
768 __kmp_linear_barrier_gather_template<false>(
769 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
770}
771
772 static bool __kmp_linear_barrier_gather_cancellable(
773 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
774 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
775 return __kmp_linear_barrier_gather_template<true>(
776 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
777}
778
779 static void __kmp_linear_barrier_release(
780 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
781 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
782 __kmp_linear_barrier_release_template<false>(
783 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
784}
785
786 static bool __kmp_linear_barrier_release_cancellable(
787 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
788 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
789 return __kmp_linear_barrier_release_template<true>(
790 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
791}
792
793// Tree barrier
794 static void __kmp_tree_barrier_gather(
795 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
796 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
797 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
798 kmp_team_t *team = this_thr->th.th_team;
799 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
800 kmp_info_t **other_threads = team->t.t_threads;
801 kmp_uint32 nproc = this_thr->th.th_team_nproc;
802 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
803 kmp_uint32 branch_factor = 1 << branch_bits;
804 kmp_uint32 child;
805 kmp_uint32 child_tid;
806 kmp_uint64 new_state = 0;
807
808 KA_TRACE(
809 20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
810 gtid, team->t.t_id, tid, bt));
811 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
812
813#if USE_ITT_BUILD && USE_ITT_NOTIFY
814 // Barrier imbalance - save arrive time to the thread
815 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
816 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
817 __itt_get_timestamp();
818 }
819#endif
820 // Perform tree gather to wait until all threads have arrived; reduce any
821 // required data as we go
822 child_tid = (tid << branch_bits) + 1;
823 if (child_tid < nproc) {
824 // Parent threads wait for all their children to arrive
825 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
826 child = 1;
827 do {
828 kmp_info_t *child_thr = other_threads[child_tid];
829 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
830#if KMP_CACHE_MANAGE
831 // Prefetch next thread's arrived count
832 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
833 KMP_CACHE_PREFETCH(
834 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
835#endif /* KMP_CACHE_MANAGE */
836 KA_TRACE(20,
837 ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
838 "arrived(%p) == %llu\n",
839 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
840 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
841 // Wait for child to arrive
842 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
843 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
844#if USE_ITT_BUILD && USE_ITT_NOTIFY
845 // Barrier imbalance - write min of the thread time and a child time to
846 // the thread.
847 if (__kmp_forkjoin_frames_mode == 2) {
848 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
849 child_thr->th.th_bar_min_time);
850 }
851#endif
852 if (reduce) {
853 KA_TRACE(100,
854 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
855 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
856 team->t.t_id, child_tid));
857 OMPT_REDUCTION_DECL(this_thr, gtid);
859 (*reduce)(this_thr->th.th_local.reduce_data,
860 child_thr->th.th_local.reduce_data);
862 }
863 child++;
864 child_tid++;
865 } while (child <= branch_factor && child_tid < nproc);
866 }
867
868 if (!KMP_MASTER_TID(tid)) { // Worker threads
869 kmp_int32 parent_tid = (tid - 1) >> branch_bits;
870
871 KA_TRACE(20,
872 ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
873 "arrived(%p): %llu => %llu\n",
874 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
875 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
876 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
877
878 // Mark arrival to parent thread
879 /* After performing this write, a worker thread may not assume that the team
880 is valid any more - it could be deallocated by the primary thread at any
881 time. */
882 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
883 flag.release();
884 } else {
885 // Need to update the team arrived pointer if we are the primary thread
886 if (nproc > 1) // New value was already computed above
887 team->t.t_bar[bt].b_arrived = new_state;
888 else
889 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
890 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
891 "arrived(%p) = %llu\n",
892 gtid, team->t.t_id, tid, team->t.t_id,
893 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
894 }
895 KA_TRACE(20,
896 ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
897 gtid, team->t.t_id, tid, bt));
898}
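// Illustrative sketch (not part of kmp_barrier.cpp): in the tree gather the
// shape of the tree is pure index arithmetic. With branch_bits b, thread tid's
// first child is (tid << b) + 1 and its parent is (tid - 1) >> b. The helper
// below just prints that mapping; it exists only to make the arithmetic concrete.
#include <cstdio>

static void demo_tree_shape(unsigned nproc, unsigned branch_bits) {
  unsigned branch_factor = 1u << branch_bits;
  for (unsigned tid = 0; tid < nproc; ++tid) {
    unsigned first_child = (tid << branch_bits) + 1;
    if (tid != 0)
      std::printf("tid %u: parent %u\n", tid, (tid - 1) >> branch_bits);
    for (unsigned c = 0; c < branch_factor && first_child + c < nproc; ++c)
      std::printf("tid %u: child %u\n", tid, first_child + c);
  }
}
// e.g. nproc = 7, branch_bits = 1 (factor 2): 0 -> {1,2}, 1 -> {3,4}, 2 -> {5,6}.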
899
900 static void __kmp_tree_barrier_release(
901 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
902 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
903 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
904 kmp_team_t *team;
905 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
906 kmp_uint32 nproc;
907 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
908 kmp_uint32 branch_factor = 1 << branch_bits;
909 kmp_uint32 child;
910 kmp_uint32 child_tid;
911
912 // Perform a tree release for all of the threads that have been gathered
913 if (!KMP_MASTER_TID(
914 tid)) { // Handle fork barrier workers who aren't part of a team yet
915 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
916 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
917 // Wait for parent thread to release us
918 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
919 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
920#if USE_ITT_BUILD && USE_ITT_NOTIFY
921 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
922 // In fork barrier where we could not get the object reliably (or
923 // ITTNOTIFY is disabled)
924 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
925 // Cancel wait on previous parallel region...
926 __kmp_itt_task_starting(itt_sync_obj);
927
928 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
929 return;
930
931 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
932 if (itt_sync_obj != NULL)
933 // Call prepare as early as possible for "new" barrier
934 __kmp_itt_task_finished(itt_sync_obj);
935 } else
936#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
937 // Early exit for reaping threads releasing forkjoin barrier
938 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
939 return;
940
941 // The worker thread may now assume that the team is valid.
942 team = __kmp_threads[gtid]->th.th_team;
943 KMP_DEBUG_ASSERT(team != NULL);
944 tid = __kmp_tid_from_gtid(gtid);
945
946 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
947 KA_TRACE(20,
948 ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
949 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
950 KMP_MB(); // Flush all pending memory write invalidates.
951 } else {
952 team = __kmp_threads[gtid]->th.th_team;
953 KMP_DEBUG_ASSERT(team != NULL);
954 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
955 "barrier type %d\n",
956 gtid, team->t.t_id, tid, bt));
957 }
958 nproc = this_thr->th.th_team_nproc;
959 child_tid = (tid << branch_bits) + 1;
960
961 if (child_tid < nproc) {
962 kmp_info_t **other_threads = team->t.t_threads;
963 child = 1;
964 // Parent threads release all their children
965 do {
966 kmp_info_t *child_thr = other_threads[child_tid];
967 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
968#if KMP_CACHE_MANAGE
969 // Prefetch next thread's go count
970 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
971 KMP_CACHE_PREFETCH(
972 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
973#endif /* KMP_CACHE_MANAGE */
974
975#if KMP_BARRIER_ICV_PUSH
976 {
978 if (propagate_icvs) {
979 __kmp_init_implicit_task(team->t.t_ident,
980 team->t.t_threads[child_tid], team,
981 child_tid, FALSE);
982 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
983 &team->t.t_implicit_task_taskdata[0].td_icvs);
984 }
985 }
986#endif // KMP_BARRIER_ICV_PUSH
987 KA_TRACE(20,
988 ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
989 "go(%p): %u => %u\n",
990 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
991 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
992 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
993 // Release child from barrier
994 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
995 flag.release();
996 child++;
997 child_tid++;
998 } while (child <= branch_factor && child_tid < nproc);
999 }
1000 KA_TRACE(
1001 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1002 gtid, team->t.t_id, tid, bt));
1003}
1004
1005// Hyper Barrier
1006 static void __kmp_hyper_barrier_gather(
1007 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1008 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1009 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
1010 kmp_team_t *team = this_thr->th.th_team;
1011 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1012 kmp_info_t **other_threads = team->t.t_threads;
1013 kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
1014 kmp_uint32 num_threads = this_thr->th.th_team_nproc;
1015 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
1016 kmp_uint32 branch_factor = 1 << branch_bits;
1017 kmp_uint32 offset;
1018 kmp_uint32 level;
1019
1020 KA_TRACE(
1021 20,
1022 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1023 gtid, team->t.t_id, tid, bt));
1024 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1025
1026#if USE_ITT_BUILD && USE_ITT_NOTIFY
1027 // Barrier imbalance - save arrive time to the thread
1028 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1029 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
1030 __itt_get_timestamp();
1031 }
1032#endif
1033 /* Perform a hypercube-embedded tree gather to wait until all of the threads
1034 have arrived, and reduce any required data as we go. */
1035 kmp_flag_64<> p_flag(&thr_bar->b_arrived);
1036 for (level = 0, offset = 1; offset < num_threads;
1037 level += branch_bits, offset <<= branch_bits) {
1038 kmp_uint32 child;
1039 kmp_uint32 child_tid;
1040
1041 if (((tid >> level) & (branch_factor - 1)) != 0) {
1042 kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
1043
1044 KMP_MB(); // Synchronize parent and child threads.
1045 KA_TRACE(20,
1046 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1047 "arrived(%p): %llu => %llu\n",
1048 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
1049 team->t.t_id, parent_tid, &thr_bar->b_arrived,
1050 thr_bar->b_arrived,
1051 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1052 // Mark arrival to parent thread
1053 /* After performing this write (in the last iteration of the enclosing for
1054 loop), a worker thread may not assume that the team is valid any more
1055 - it could be deallocated by the primary thread at any time. */
1056 p_flag.set_waiter(other_threads[parent_tid]);
1057 p_flag.release();
1058 break;
1059 }
1060
1061 // Parent threads wait for children to arrive
1062 if (new_state == KMP_BARRIER_UNUSED_STATE)
1063 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1064 for (child = 1, child_tid = tid + (1 << level);
1065 child < branch_factor && child_tid < num_threads;
1066 child++, child_tid += (1 << level)) {
1067 kmp_info_t *child_thr = other_threads[child_tid];
1068 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1069#if KMP_CACHE_MANAGE
1070 kmp_uint32 next_child_tid = child_tid + (1 << level);
1071 // Prefetch next thread's arrived count
1072 if (child + 1 < branch_factor && next_child_tid < num_threads)
1073 KMP_CACHE_PREFETCH(
1074 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
1075#endif /* KMP_CACHE_MANAGE */
1076 KA_TRACE(20,
1077 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1078 "arrived(%p) == %llu\n",
1079 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1080 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
1081 // Wait for child to arrive
1082 kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
1083 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1084 KMP_MB(); // Synchronize parent and child threads.
1085#if USE_ITT_BUILD && USE_ITT_NOTIFY
1086 // Barrier imbalance - write min of the thread time and a child time to
1087 // the thread.
1088 if (__kmp_forkjoin_frames_mode == 2) {
1089 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
1090 child_thr->th.th_bar_min_time);
1091 }
1092#endif
1093 if (reduce) {
1094 KA_TRACE(100,
1095 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1096 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1097 team->t.t_id, child_tid));
1098 OMPT_REDUCTION_DECL(this_thr, gtid);
1100 (*reduce)(this_thr->th.th_local.reduce_data,
1101 child_thr->th.th_local.reduce_data);
1103 }
1104 }
1105 }
1106
1107 if (KMP_MASTER_TID(tid)) {
1108 // Need to update the team arrived pointer if we are the primary thread
1109 if (new_state == KMP_BARRIER_UNUSED_STATE)
1110 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
1111 else
1112 team->t.t_bar[bt].b_arrived = new_state;
1113 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1114 "arrived(%p) = %llu\n",
1115 gtid, team->t.t_id, tid, team->t.t_id,
1116 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1117 }
1118 KA_TRACE(
1119 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1120 gtid, team->t.t_id, tid, bt));
1121}
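// Illustrative sketch (not part of kmp_barrier.cpp): at level L of the
// hypercube gather, a thread whose bits (tid >> L) & (branch_factor - 1) are
// nonzero reports to parent tid & ~((1 << (L + branch_bits)) - 1) and stops;
// otherwise it collects children at tid + (1 << L), tid + 2*(1 << L), ...
// The helper below prints each thread's parent and the level at which it
// reports; it exists only to make the index arithmetic concrete.
#include <cstdio>

static void demo_hyper_parents(unsigned num_threads, unsigned branch_bits) {
  unsigned branch_factor = 1u << branch_bits;
  for (unsigned tid = 1; tid < num_threads; ++tid) {
    for (unsigned level = 0, offset = 1; offset < num_threads;
         level += branch_bits, offset <<= branch_bits) {
      if (((tid >> level) & (branch_factor - 1)) != 0) {
        unsigned parent = tid & ~((1u << (level + branch_bits)) - 1);
        std::printf("tid %u reports to %u at level %u\n", tid, parent, level);
        break;
      }
    }
  }
}
// e.g. num_threads = 8, branch_bits = 1: odd tids report to tid-1 at level 0,
// tids 2 and 6 report to 0 and 4 at level 1, tid 4 reports to 0 at level 2.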
1122
1123// The reverse versions seem to beat the forward versions overall
1124#define KMP_REVERSE_HYPER_BAR
1125 static void __kmp_hyper_barrier_release(
1126 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1127 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1128 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
1129 kmp_team_t *team;
1130 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1131 kmp_info_t **other_threads;
1132 kmp_uint32 num_threads;
1133 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
1134 kmp_uint32 branch_factor = 1 << branch_bits;
1135 kmp_uint32 child;
1136 kmp_uint32 child_tid;
1137 kmp_uint32 offset;
1138 kmp_uint32 level;
1139
1140 /* Perform a hypercube-embedded tree release for all of the threads that have
1141 been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
1142 are released in the reverse order of the corresponding gather, otherwise
1143 threads are released in the same order. */
1144 if (KMP_MASTER_TID(tid)) { // primary thread
1145 team = __kmp_threads[gtid]->th.th_team;
1146 KMP_DEBUG_ASSERT(team != NULL);
1147 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1148 "barrier type %d\n",
1149 gtid, team->t.t_id, tid, bt));
1150#if KMP_BARRIER_ICV_PUSH
1151 if (propagate_icvs) { // primary already has ICVs in final destination; copy
1152 copy_icvs(&thr_bar->th_fixed_icvs,
1153 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1154 }
1155#endif
1156 } else { // Handle fork barrier workers who aren't part of a team yet
1157 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
1158 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
1159 // Wait for parent thread to release us
1160 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1161 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1162#if USE_ITT_BUILD && USE_ITT_NOTIFY
1163 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
1164 // In fork barrier where we could not get the object reliably
1165 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
1166 // Cancel wait on previous parallel region...
1167 __kmp_itt_task_starting(itt_sync_obj);
1168
1169 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1170 return;
1171
1172 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1173 if (itt_sync_obj != NULL)
1174 // Call prepare as early as possible for "new" barrier
1175 __kmp_itt_task_finished(itt_sync_obj);
1176 } else
1177#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1178 // Early exit for reaping threads releasing forkjoin barrier
1179 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1180 return;
1181
1182 // The worker thread may now assume that the team is valid.
1183 team = __kmp_threads[gtid]->th.th_team;
1184 KMP_DEBUG_ASSERT(team != NULL);
1185 tid = __kmp_tid_from_gtid(gtid);
1186
1187 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1188 KA_TRACE(20,
1189 ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1190 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1191 KMP_MB(); // Flush all pending memory write invalidates.
1192 }
1193 num_threads = this_thr->th.th_team_nproc;
1194 other_threads = team->t.t_threads;
1195
1196#ifdef KMP_REVERSE_HYPER_BAR
1197 // Count up to correct level for parent
1198 for (level = 0, offset = 1;
1199 offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
1200 level += branch_bits, offset <<= branch_bits)
1201 ;
1202
1203 // Now go down from there
1204 for (level -= branch_bits, offset >>= branch_bits; offset != 0;
1205 level -= branch_bits, offset >>= branch_bits)
1206#else
1207 // Go down the tree, level by level
1208 for (level = 0, offset = 1; offset < num_threads;
1209 level += branch_bits, offset <<= branch_bits)
1210#endif // KMP_REVERSE_HYPER_BAR
1211 {
1212#ifdef KMP_REVERSE_HYPER_BAR
1213 /* Now go in reverse order through the children, highest to lowest.
1214 Initial setting of child is conservative here. */
1215 child = num_threads >> ((level == 0) ? level : level - 1);
1216 for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
1217 child_tid = tid + (child << level);
1218 child >= 1; child--, child_tid -= (1 << level))
1219#else
1220 if (((tid >> level) & (branch_factor - 1)) != 0)
1221 // No need to go lower than this, since this is the level parent would be
1222 // notified
1223 break;
1224 // Iterate through children on this level of the tree
1225 for (child = 1, child_tid = tid + (1 << level);
1226 child < branch_factor && child_tid < num_threads;
1227 child++, child_tid += (1 << level))
1228#endif // KMP_REVERSE_HYPER_BAR
1229 {
1230 if (child_tid >= num_threads)
1231 continue; // Child doesn't exist so keep going
1232 else {
1233 kmp_info_t *child_thr = other_threads[child_tid];
1234 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1235#if KMP_CACHE_MANAGE
1236 kmp_uint32 next_child_tid = child_tid - (1 << level);
1237// Prefetch next thread's go count
1238#ifdef KMP_REVERSE_HYPER_BAR
1239 if (child - 1 >= 1 && next_child_tid < num_threads)
1240#else
1241 if (child + 1 < branch_factor && next_child_tid < num_threads)
1242#endif // KMP_REVERSE_HYPER_BAR
1243 KMP_CACHE_PREFETCH(
1244 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
1245#endif /* KMP_CACHE_MANAGE */
1246
1247#if KMP_BARRIER_ICV_PUSH
1248 if (propagate_icvs) // push my fixed ICVs to my child
1249 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1250#endif // KMP_BARRIER_ICV_PUSH
1251
1252 KA_TRACE(
1253 20,
1254 ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1255 "go(%p): %u => %u\n",
1256 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1257 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1258 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1259 // Release child from barrier
1260 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1261 flag.release();
1262 }
1263 }
1264 }
1265#if KMP_BARRIER_ICV_PUSH
1266 if (propagate_icvs &&
1267 !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
1268 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1269 FALSE);
1270 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1271 &thr_bar->th_fixed_icvs);
1272 }
1273#endif
1274 KA_TRACE(
1275 20,
1276 ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1277 gtid, team->t.t_id, tid, bt));
1278}
1279
1280// Hierarchical Barrier
1281
1282// Initialize thread barrier data
1283/* Initializes/re-initializes the hierarchical barrier data stored on a thread.
1284 Performs the minimum amount of initialization required based on how the team
1285 has changed. Returns true if leaf children will require both on-core and
1286 traditional wake-up mechanisms. For example, if the team size increases,
1287 threads already in the team will respond to on-core wakeup on their parent
1288 thread, but threads newly added to the team will only be listening on
1289 their local b_go. */
1290 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
1291 kmp_bstate_t *thr_bar,
1292 kmp_uint32 nproc, int gtid,
1293 int tid, kmp_team_t *team) {
1294 // Checks to determine if (re-)initialization is needed
1295 bool uninitialized = thr_bar->team == NULL;
1296 bool team_changed = team != thr_bar->team;
1297 bool team_sz_changed = nproc != thr_bar->nproc;
1298 bool tid_changed = tid != thr_bar->old_tid;
1299 bool retval = false;
1300
1301 if (uninitialized || team_sz_changed) {
1302 __kmp_get_hierarchy(nproc, thr_bar);
1303 }
1304
1305 if (uninitialized || team_sz_changed || tid_changed) {
1306 thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
1307 thr_bar->parent_tid = -1; // default for primary thread
1308 if (!KMP_MASTER_TID(tid)) {
1309 // if not primary thread, find parent thread in hierarchy
1310 kmp_uint32 d = 0;
1311 while (d < thr_bar->depth) { // find parent based on level of thread in
1312 // hierarchy, and note level
1313 kmp_uint32 rem;
1314 if (d == thr_bar->depth - 2) { // reached level right below the primary
1315 thr_bar->parent_tid = 0;
1316 thr_bar->my_level = d;
1317 break;
1318 } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
1319 // TODO: can we make the above op faster?
1320 // thread is not a subtree root at next level, so this is max
1321 thr_bar->parent_tid = tid - rem;
1322 thr_bar->my_level = d;
1323 break;
1324 }
1325 ++d;
1326 }
1327 }
1328 __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
1329 (thr_bar->skip_per_level[thr_bar->my_level])),
1330 &(thr_bar->offset));
1331 thr_bar->old_tid = tid;
1332 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1333 thr_bar->team = team;
1334 thr_bar->parent_bar =
1335 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1336 }
1337 if (uninitialized || team_changed || tid_changed) {
1338 thr_bar->team = team;
1339 thr_bar->parent_bar =
1340 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1341 retval = true;
1342 }
1343 if (uninitialized || team_sz_changed || tid_changed) {
1344 thr_bar->nproc = nproc;
1345 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
1346 if (thr_bar->my_level == 0)
1347 thr_bar->leaf_kids = 0;
1348 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
1349 __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1350 thr_bar->leaf_state = 0;
1351 for (int i = 0; i < thr_bar->leaf_kids; ++i)
1352 ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
1353 }
1354 return retval;
1355}
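// Illustrative sketch (not part of kmp_barrier.cpp): the parent search above
// walks levels of the machine hierarchy using skip_per_level[d], the subtree
// span at depth d. A thread is a subtree root at level d+1 iff
// tid % skip_per_level[d+1] == 0; otherwise its parent is the nearest such
// root below it, tid - (tid % skip_per_level[d+1]). The helper below mimics
// that search for an assumed hierarchy; the skip values and names are invented.
#include <cstdio>
#include <vector>

static void demo_hier_parent(unsigned tid, const std::vector<unsigned> &skip_per_level) {
  // skip_per_level[0] is 1 (a single thread); each later entry is a subtree span.
  for (unsigned d = 0; d + 1 < skip_per_level.size(); ++d) {
    unsigned rem = tid % skip_per_level[d + 1];
    if (rem != 0) { // not a subtree root at the next level: parent found
      std::printf("tid %u: level %u, parent %u\n", tid, d, tid - rem);
      return;
    }
  }
  std::printf("tid %u: subtree root all the way up (reports to the primary)\n", tid);
}
// e.g. skip_per_level = {1, 4, 32}: tid 13 -> parent 12 at level 0,
// tid 12 -> parent 0 at level 1, tid 0 -> primary.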
1356
1357 static void __kmp_hierarchical_barrier_gather(
1358 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1359 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1360 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1361 kmp_team_t *team = this_thr->th.th_team;
1362 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1363 kmp_uint32 nproc = this_thr->th.th_team_nproc;
1364 kmp_info_t **other_threads = team->t.t_threads;
1365 kmp_uint64 new_state = 0;
1366
1367 int level = team->t.t_level;
1368 if (other_threads[0]
1369 ->th.th_teams_microtask) // are we inside the teams construct?
1370 if (this_thr->th.th_teams_size.nteams > 1)
1371 ++level; // level was not increased in teams construct for team_of_masters
1372 if (level == 1)
1373 thr_bar->use_oncore_barrier = 1;
1374 else
1375 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1376
1377 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1378 "barrier type %d\n",
1379 gtid, team->t.t_id, tid, bt));
1380 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1381
1382#if USE_ITT_BUILD && USE_ITT_NOTIFY
1383 // Barrier imbalance - save arrive time to the thread
1384 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1385 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1386 }
1387#endif
1388
1389 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1390 team);
1391
1392 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
1393 kmp_int32 child_tid;
1394 new_state =
1395 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1396 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1397 thr_bar->use_oncore_barrier) {
1398 if (thr_bar->leaf_kids) {
1399 // First, wait for leaf children to check-in on my b_arrived flag
1400 kmp_uint64 leaf_state =
1401 KMP_MASTER_TID(tid)
1402 ? thr_bar->b_arrived | thr_bar->leaf_state
1403 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
1404 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1405 "for leaf kids\n",
1406 gtid, team->t.t_id, tid));
1407 kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1408 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1409 if (reduce) {
1410 OMPT_REDUCTION_DECL(this_thr, gtid);
1412 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
1413 ++child_tid) {
1414 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1415 "T#%d(%d:%d)\n",
1416 gtid, team->t.t_id, tid,
1417 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1418 child_tid));
1419 (*reduce)(this_thr->th.th_local.reduce_data,
1420 other_threads[child_tid]->th.th_local.reduce_data);
1421 }
1423 }
1424 // clear leaf_state bits
1425 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1426 }
1427 // Next, wait for higher level children on each child's b_arrived flag
1428 for (kmp_uint32 d = 1; d < thr_bar->my_level;
1429 ++d) { // gather lowest level threads first, but skip 0
1430 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1431 skip = thr_bar->skip_per_level[d];
1432 if (last > nproc)
1433 last = nproc;
1434 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1435 kmp_info_t *child_thr = other_threads[child_tid];
1436 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1437 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1438 "T#%d(%d:%d) "
1439 "arrived(%p) == %llu\n",
1440 gtid, team->t.t_id, tid,
1441 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1442 child_tid, &child_bar->b_arrived, new_state));
1443 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1444 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1445 if (reduce) {
1446 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1447 "T#%d(%d:%d)\n",
1448 gtid, team->t.t_id, tid,
1449 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1450 child_tid));
1451 (*reduce)(this_thr->th.th_local.reduce_data,
1452 child_thr->th.th_local.reduce_data);
1453 }
1454 }
1455 }
1456 } else { // Blocktime is not infinite
1457 for (kmp_uint32 d = 0; d < thr_bar->my_level;
1458 ++d) { // Gather lowest level threads first
1459 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1460 skip = thr_bar->skip_per_level[d];
1461 if (last > nproc)
1462 last = nproc;
1463 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1464 kmp_info_t *child_thr = other_threads[child_tid];
1465 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1466 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1467 "T#%d(%d:%d) "
1468 "arrived(%p) == %llu\n",
1469 gtid, team->t.t_id, tid,
1470 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1471 child_tid, &child_bar->b_arrived, new_state));
1472 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1473 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1474 if (reduce) {
1475 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1476 "T#%d(%d:%d)\n",
1477 gtid, team->t.t_id, tid,
1478 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1479 child_tid));
1480 (*reduce)(this_thr->th.th_local.reduce_data,
1481 child_thr->th.th_local.reduce_data);
1482 }
1483 }
1484 }
1485 }
1486 }
1487 // All subordinates are gathered; now release parent if not primary thread
1488
1489 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1490 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1491 " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1492 gtid, team->t.t_id, tid,
1493 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1494 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1495 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1496 /* Mark arrival to parent: After performing this write, a worker thread may
1497 not assume that the team is valid any more - it could be deallocated by
1498 the primary thread at any time. */
1499 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1500 !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1501 // flag; release it
1502 kmp_flag_64<> flag(&thr_bar->b_arrived,
1503 other_threads[thr_bar->parent_tid]);
1504 flag.release();
1505 } else {
1506 // Leaf does special release on "offset" bits of parent's b_arrived flag
1507 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1508 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1509 thr_bar->offset + 1);
1510 flag.set_waiter(other_threads[thr_bar->parent_tid]);
1511 flag.release();
1512 }
1513 } else { // Primary thread needs to update the team's b_arrived value
1514 team->t.t_bar[bt].b_arrived = new_state;
1515 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1516 "arrived(%p) = %llu\n",
1517 gtid, team->t.t_id, tid, team->t.t_id,
1518 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1519 }
1520 // Is the team access below unsafe or just technically invalid?
1521 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1522 "barrier type %d\n",
1523 gtid, team->t.t_id, tid, bt));
1524}
1525
1526 static void __kmp_hierarchical_barrier_release(
1527 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1528 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1529 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1530 kmp_team_t *team;
1531 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1532 kmp_uint32 nproc;
1533 bool team_change = false; // indicates on-core barrier shouldn't be used
1534
1535 if (KMP_MASTER_TID(tid)) {
1536 team = __kmp_threads[gtid]->th.th_team;
1537 KMP_DEBUG_ASSERT(team != NULL);
1538 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1539 "entered barrier type %d\n",
1540 gtid, team->t.t_id, tid, bt));
1541 } else { // Worker threads
1542 // Wait for parent thread to release me
1543 if (!thr_bar->use_oncore_barrier ||
1544 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1545 thr_bar->team == NULL) {
1546 // Use traditional method of waiting on my own b_go flag
1547 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1548 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1549 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1550 TCW_8(thr_bar->b_go,
1551 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1552 } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1553 // infinite, not nested
1554 // Wait on my "offset" bits on parent's b_go flag
1555 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1556 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1557 thr_bar->offset + 1, bt,
1558 this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1559 flag.wait(this_thr, TRUE);
1560 if (thr_bar->wait_flag ==
1561 KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1562 TCW_8(thr_bar->b_go,
1563 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1564 } else { // Reset my bits on parent's b_go flag
1565 (RCAST(volatile char *,
1566 &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1567 }
1568 }
1569 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1570 // Early exit for reaping threads releasing forkjoin barrier
1571 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1572 return;
1573 // The worker thread may now assume that the team is valid.
1574 team = __kmp_threads[gtid]->th.th_team;
1575 KMP_DEBUG_ASSERT(team != NULL);
1576 tid = __kmp_tid_from_gtid(gtid);
1577
1578 KA_TRACE(
1579 20,
1580 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1581 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1582 KMP_MB(); // Flush all pending memory write invalidates.
1583 }
1584
1585 nproc = this_thr->th.th_team_nproc;
1586 int level = team->t.t_level;
1587 if (team->t.t_threads[0]
1588 ->th.th_teams_microtask) { // are we inside the teams construct?
1589 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1590 this_thr->th.th_teams_level == level)
1591 ++level; // level was not increased in teams construct for team_of_workers
1592 if (this_thr->th.th_teams_size.nteams > 1)
1593 ++level; // level was not increased in teams construct for team_of_masters
1594 }
1595 if (level == 1)
1596 thr_bar->use_oncore_barrier = 1;
1597 else
1598 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1599
1600 // If the team size has increased, we still communicate with old leaves via
1601 // oncore barrier.
1602 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1603 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1604 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1605 tid, team);
1606 // But if the entire team changes, we won't use oncore barrier at all
1607 if (team_change)
1608 old_leaf_kids = 0;
1609
1610#if KMP_BARRIER_ICV_PUSH
1611 if (propagate_icvs) {
1612 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1613 FALSE);
1614 if (KMP_MASTER_TID(
1615 tid)) { // primary already has copy in final destination; copy
1616 copy_icvs(&thr_bar->th_fixed_icvs,
1617 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1618 } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1619 thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1620 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1621 // leaves (on-core children) pull parent's fixed ICVs directly to local
1622 // ICV store
1623 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1624 &thr_bar->parent_bar->th_fixed_icvs);
1625 // non-leaves will get ICVs piggybacked with b_go via NGO store
1626 } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1627 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1628 // access
1629 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1630 else // leaves copy parent's fixed ICVs directly to local ICV store
1631 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1632 &thr_bar->parent_bar->th_fixed_icvs);
1633 }
1634 }
1635#endif // KMP_BARRIER_ICV_PUSH
1636
1637 // Now, release my children
1638 if (thr_bar->my_level) { // not a leaf
1639 kmp_int32 child_tid;
1640 kmp_uint32 last;
1641 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1642 thr_bar->use_oncore_barrier) {
1643 if (KMP_MASTER_TID(tid)) { // do a flat release
1644 // Set local b_go to bump children via NGO store of the cache line
1645 // containing ICVs and b_go.
1646 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1647 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1648 // the cache line
1649 ngo_load(&thr_bar->th_fixed_icvs);
1650 // This loops over all the threads skipping only the leaf nodes in the
1651 // hierarchy
1652 for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1653 child_tid += thr_bar->skip_per_level[1]) {
1654 kmp_bstate_t *child_bar =
1655 &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1656 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1657 "releasing T#%d(%d:%d)"
1658 " go(%p): %u => %u\n",
1659 gtid, team->t.t_id, tid,
1660 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1661 child_tid, &child_bar->b_go, child_bar->b_go,
1662 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1663 // Use ngo store (if available) to both store ICVs and release child
1664 // via child's b_go
1665 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1666 }
1667 ngo_sync();
1668 }
1669 TCW_8(thr_bar->b_go,
1670 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1671 // Now, release leaf children
1672 if (thr_bar->leaf_kids) { // if there are any
1673 // We test team_change on the off-chance that the level 1 team changed.
1674 if (team_change ||
1675 old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1676 if (old_leaf_kids) { // release old leaf kids
1677 thr_bar->b_go |= old_leaf_state;
1678 }
1679 // Release new leaf kids
1680 last = tid + thr_bar->skip_per_level[1];
1681 if (last > nproc)
1682 last = nproc;
1683 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1684 ++child_tid) { // skip_per_level[0]=1
1685 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1686 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1687 KA_TRACE(
1688 20,
1689 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1690 " T#%d(%d:%d) go(%p): %u => %u\n",
1691 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1692 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1693 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1694 // Release child using child's b_go flag
1695 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1696 flag.release();
1697 }
1698 } else { // Release all children at once with leaf_state bits on my own
1699 // b_go flag
1700 thr_bar->b_go |= thr_bar->leaf_state;
1701 }
1702 }
1703 } else { // Blocktime is not infinite; do a simple hierarchical release
1704 for (int d = thr_bar->my_level - 1; d >= 0;
1705 --d) { // Release highest level threads first
1706 last = tid + thr_bar->skip_per_level[d + 1];
1707 kmp_uint32 skip = thr_bar->skip_per_level[d];
1708 if (last > nproc)
1709 last = nproc;
1710 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1711 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1712 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1713 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1714 "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1715 gtid, team->t.t_id, tid,
1716 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1717 child_tid, &child_bar->b_go, child_bar->b_go,
1718 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1719 // Release child using child's b_go flag
1720 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1721 flag.release();
1722 }
1723 }
1724 }
1725#if KMP_BARRIER_ICV_PUSH
1726 if (propagate_icvs && !KMP_MASTER_TID(tid))
1727 // non-leaves copy ICVs from fixed ICVs to local dest
1728 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1729 &thr_bar->th_fixed_icvs);
1730#endif // KMP_BARRIER_ICV_PUSH
1731 }
1732 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1733 "barrier type %d\n",
1734 gtid, team->t.t_id, tid, bt));
1735}
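// Worked example of the skip_per_level traversal used by the release loops
// above (illustrative, assuming a uniform branching factor of 4, so
// skip_per_level = {1, 4, 16, ...} and nproc = 16): the primary thread
// (tid 0, my_level 2) first releases tids 4, 8 and 12 at depth d = 1, then
// its own leaves 1, 2 and 3 at depth d = 0; each released parent does the
// same for its subtree. The loop shape is the one used above:
//
//   for (int d = thr_bar->my_level - 1; d >= 0; --d) {
//     kmp_uint32 skip = thr_bar->skip_per_level[d];     // stride to children
//     kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1]; // subtree end
//     for (int child = tid + skip; child < (int)last; child += skip)
//       ; // release child's b_go, as in the hierarchical release above
//   }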
1736
1737// End of Barrier Algorithms
1738
1739// type traits for cancellable value
1740// if cancellable is true, then is_cancellable is a normal boolean variable
1741// if cancellable is false, then is_cancellable is a compile time constant
1742template <bool cancellable> struct is_cancellable {};
1743template <> struct is_cancellable<true> {
1744 bool value;
1745 is_cancellable() : value(false) {}
1746 is_cancellable(bool b) : value(b) {}
1747 is_cancellable &operator=(bool b) {
1748 value = b;
1749 return *this;
1750 }
1751 operator bool() const { return value; }
1752};
1753template <> struct is_cancellable<false> {
1754 is_cancellable &operator=(bool b) { return *this; }
1755 constexpr operator bool() const { return false; }
1756};
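// How the two specializations above are used (sketch): the barrier template
// below declares an is_cancellable<cancellable> value and tests it like a
// bool. For cancellable == true it is a real runtime flag; for
// cancellable == false the conversion operator is constexpr false, so every
// test of it is dead code the compiler can drop:
//
//   is_cancellable<true> c;
//   c = true;            // stores the flag
//   if (c) { /* cancellation path taken at run time */ }
//
//   is_cancellable<false> n;
//   n = true;            // no-op; operator= just returns *this
//   if (n) { /* never taken: operator bool() is constexpr false */ }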
1757
1758// Internal function to do a barrier.
1759/* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1760 If reduce is non-NULL, do a split reduction barrier; otherwise, do a split
1761 barrier without a reduction.
1762 When cancellable = false:
1763 Returns 0 if primary thread, 1 if worker thread.
1764 When cancellable = true:
1765 Returns 0 if not cancelled, 1 if cancelled. */
1766template <bool cancellable = false>
1767static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1768 size_t reduce_size, void *reduce_data,
1769 void (*reduce)(void *, void *)) {
1770 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1771 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1772 int tid = __kmp_tid_from_gtid(gtid);
1773 kmp_info_t *this_thr = __kmp_threads[gtid];
1774 kmp_team_t *team = this_thr->th.th_team;
1775 int status = 0;
1776 is_cancellable<cancellable> cancelled;
1777#if OMPT_SUPPORT && OMPT_OPTIONAL
1778 ompt_data_t *my_task_data;
1779 ompt_data_t *my_parallel_data;
1780 void *return_address;
1781 ompt_sync_region_t barrier_kind;
1782#endif
1783
1784 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1785 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1786
1787#if OMPT_SUPPORT
1788 if (ompt_enabled.enabled) {
1789#if OMPT_OPTIONAL
1790 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1791 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1792 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1793 barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1794 if (ompt_enabled.ompt_callback_sync_region) {
1795 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1796 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1797 return_address);
1798 }
1799 if (ompt_enabled.ompt_callback_sync_region_wait) {
1800 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1801 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1802 return_address);
1803 }
1804#endif
1805 // It is OK to report the barrier state after the barrier begin callback.
1806 // According to the OMPT specification, a compliant implementation may
1807 // even delay reporting this state until the barrier begins to wait.
1808 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1809 }
1810#endif
1811
1812 if (!team->t.t_serialized) {
1813#if USE_ITT_BUILD
1814 // This value will be used in itt notify events below.
1815 void *itt_sync_obj = NULL;
1816#if USE_ITT_NOTIFY
1817 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1818 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1819#endif
1820#endif /* USE_ITT_BUILD */
1821 if (__kmp_tasking_mode == tskm_extra_barrier) {
1822 __kmp_tasking_barrier(team, this_thr, gtid);
1823 KA_TRACE(15,
1824 ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1825 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1826 }
1827
1828 /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1829 access it when the team struct is not guaranteed to exist. */
1830 // See note about the corresponding code in __kmp_join_barrier() being
1831 // performance-critical.
1832 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1833#if KMP_USE_MONITOR
1834 this_thr->th.th_team_bt_intervals =
1835 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1836 this_thr->th.th_team_bt_set =
1837 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1838#else
1839 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1840#endif
1841 }
1842
1843#if USE_ITT_BUILD
1844 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1845 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1846#endif /* USE_ITT_BUILD */
1847#if USE_DEBUGGER
1848 // Let the debugger know: the thread has arrived at the barrier and is waiting.
1849 if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1850 team->t.t_bar[bt].b_master_arrived += 1;
1851 } else {
1852 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1853 } // if
1854#endif /* USE_DEBUGGER */
1855 if (reduce != NULL) {
1856 // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1857 this_thr->th.th_local.reduce_data = reduce_data;
1858 }
1859
1860 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1861 __kmp_task_team_setup(this_thr, team);
1862
1863 if (cancellable) {
1864 cancelled = __kmp_linear_barrier_gather_cancellable(
1865 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1866 } else {
1867 switch (__kmp_barrier_gather_pattern[bt]) {
1868 case bp_dist_bar: {
1869 __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
1870 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1871 break;
1872 }
1873 case bp_hyper_bar: {
1874 // don't set branch bits to 0; use linear
1875 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1876 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1877 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1878 break;
1879 }
1880 case bp_hierarchical_bar: {
1881 __kmp_hierarchical_barrier_gather(
1882 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1883 break;
1884 }
1885 case bp_tree_bar: {
1886 // don't set branch bits to 0; use linear
1887 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1888 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1889 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1890 break;
1891 }
1892 default: {
1893 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1894 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1895 }
1896 }
1897 }
1898
1899 KMP_MB();
1900
1901 if (KMP_MASTER_TID(tid)) {
1902 status = 0;
1903 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1904 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1905 }
1906#if USE_DEBUGGER
1907 // Let the debugger know: all threads have arrived and are starting to
1908 // leave the barrier.
1909 team->t.t_bar[bt].b_team_arrived += 1;
1910#endif
1911
1912 if (__kmp_omp_cancellation) {
1913 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1914 // Reset cancellation flag for worksharing constructs
1915 if (cancel_request == cancel_loop ||
1916 cancel_request == cancel_sections) {
1917 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1918 }
1919 }
1920#if USE_ITT_BUILD
1921 /* TODO: In case of split reduction barrier, primary thread may send
1922 acquired event early, before the final summation into the shared
1923 variable is done (final summation can be a long operation for array
1924 reductions). */
1925 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1926 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1927#endif /* USE_ITT_BUILD */
1928#if USE_ITT_BUILD && USE_ITT_NOTIFY
1929 // Barrier - report frame end (only if active_level == 1)
1930 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1931 __kmp_forkjoin_frames_mode &&
1932 (this_thr->th.th_teams_microtask == NULL || // either not in teams
1933 this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1934 team->t.t_active_level == 1) {
1935 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1936 kmp_uint64 cur_time = __itt_get_timestamp();
1937 kmp_info_t **other_threads = team->t.t_threads;
1938 int nproc = this_thr->th.th_team_nproc;
1939 int i;
1940 switch (__kmp_forkjoin_frames_mode) {
1941 case 1:
1942 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1943 loc, nproc);
1944 this_thr->th.th_frame_time = cur_time;
1945 break;
1946 case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1947 // be fixed)
1948 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1949 1, loc, nproc);
1950 break;
1951 case 3:
1952 if (__itt_metadata_add_ptr) {
1953 // Initialize with primary thread's wait time
1954 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1955 // Set arrive time to zero to be able to check it in
1956 // __kmp_invoke_task(); the same is done inside the loop below
1957 this_thr->th.th_bar_arrive_time = 0;
1958 for (i = 1; i < nproc; ++i) {
1959 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1960 other_threads[i]->th.th_bar_arrive_time = 0;
1961 }
1962 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1963 cur_time, delta,
1964 (kmp_uint64)(reduce != NULL));
1965 }
1966 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1967 loc, nproc);
1968 this_thr->th.th_frame_time = cur_time;
1969 break;
1970 }
1971 }
1972#endif /* USE_ITT_BUILD */
1973 } else {
1974 status = 1;
1975#if USE_ITT_BUILD
1976 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1977 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1978#endif /* USE_ITT_BUILD */
1979 }
1980 if ((status == 1 || !is_split) && !cancelled) {
1981 if (cancellable) {
1982 cancelled = __kmp_linear_barrier_release_cancellable(
1983 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1984 } else {
1985 switch (__kmp_barrier_release_pattern[bt]) {
1986 case bp_dist_bar: {
1988 __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
1989 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1990 break;
1991 }
1992 case bp_hyper_bar: {
1993 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1994 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1995 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1996 break;
1997 }
1998 case bp_hierarchical_bar: {
1999 __kmp_hierarchical_barrier_release(
2000 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2001 break;
2002 }
2003 case bp_tree_bar: {
2004 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2005 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2006 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2007 break;
2008 }
2009 default: {
2010 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2011 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2012 }
2013 }
2014 }
2015 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
2016 __kmp_task_team_sync(this_thr, team);
2017 }
2018 }
2019
2020#if USE_ITT_BUILD
2021 /* GEH: TODO: Move this under if-condition above and also include in
2022 __kmp_end_split_barrier(). This will more accurately represent the actual
2023 release time of the threads for split barriers. */
2024 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2025 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2026#endif /* USE_ITT_BUILD */
2027 } else { // Team is serialized.
2028 status = 0;
2029 if (__kmp_tasking_mode != tskm_immediate_exec) {
2030 if (this_thr->th.th_task_team != NULL) {
2031#if USE_ITT_NOTIFY
2032 void *itt_sync_obj = NULL;
2033 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2034 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
2035 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2036 }
2037#endif
2038
2039 KMP_DEBUG_ASSERT(
2040 this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
2041 this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
2042 TRUE);
2043 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2044 __kmp_task_team_setup(this_thr, team);
2045
2046#if USE_ITT_BUILD
2047 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2048 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2049#endif /* USE_ITT_BUILD */
2050 }
2051 }
2052 }
2053 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
2054 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
2055 __kmp_tid_from_gtid(gtid), status));
2056
2057#if OMPT_SUPPORT
2058 if (ompt_enabled.enabled) {
2059#if OMPT_OPTIONAL
2060 if (ompt_enabled.ompt_callback_sync_region_wait) {
2061 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2062 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2063 return_address);
2064 }
2065 if (ompt_enabled.ompt_callback_sync_region) {
2066 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2067 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2068 return_address);
2069 }
2070#endif
2071 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
2072 }
2073#endif
2074
2075 if (cancellable)
2076 return (int)cancelled;
2077 return status;
2078}
2079
2080// Returns 0 if primary thread, 1 if worker thread.
2081int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
2082 size_t reduce_size, void *reduce_data,
2083 void (*reduce)(void *, void *)) {
2084 return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
2085 reduce);
2086}
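// Illustrative caller (hypothetical, not from this file): an entry point
// implementing an explicit "#pragma omp barrier" would forward to the plain,
// non-split form and ignore the reduction arguments:
//
//   void example_explicit_barrier(int gtid) { // hypothetical helper name
//     __kmp_barrier(bs_plain_barrier, gtid, FALSE /* is_split */,
//                   0 /* reduce_size */, NULL /* reduce_data */,
//                   NULL /* reduce */);
//   }
//
// The return value (0 = primary thread, 1 = worker) matters mainly to split
// barrier callers, as described in the comment above __kmp_barrier_template.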
2087
2088#if defined(KMP_GOMP_COMPAT)
2089// Returns 1 if cancelled, 0 otherwise
2090int __kmp_barrier_gomp_cancel(int gtid) {
2091 if (__kmp_omp_cancellation) {
2092 int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
2093 0, NULL, NULL);
2094 if (cancelled) {
2095 int tid = __kmp_tid_from_gtid(gtid);
2096 kmp_info_t *this_thr = __kmp_threads[gtid];
2097 if (KMP_MASTER_TID(tid)) {
2098 // Primary thread does not need to revert anything
2099 } else {
2100 // Workers need to revert their private b_arrived flag
2101 this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
2102 KMP_BARRIER_STATE_BUMP;
2103 }
2104 }
2105 return cancelled;
2106 }
2107 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2108 return FALSE;
2109}
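// The revert above in numbers (sketch): each gather bumps a worker's private
// flag by KMP_BARRIER_STATE_BUMP, so a cancelled barrier subtracts exactly
// one bump,
//
//   b_arrived_after_cancel = b_arrived_observed - KMP_BARRIER_STATE_BUMP
//                          = b_arrived_before_the_cancelled_barrier,
//
// leaving the worker's arrival count consistent for the next, non-cancelled
// barrier.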
2110#endif
2111
2112void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
2113 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
2114 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
2115 KMP_DEBUG_ASSERT(bt < bs_last_barrier);
2116 int tid = __kmp_tid_from_gtid(gtid);
2117 kmp_info_t *this_thr = __kmp_threads[gtid];
2118 kmp_team_t *team = this_thr->th.th_team;
2119
2120 if (!team->t.t_serialized) {
2121 if (KMP_MASTER_GTID(gtid)) {
2122 switch (__kmp_barrier_release_pattern[bt]) {
2123 case bp_dist_bar: {
2124 __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2125 FALSE USE_ITT_BUILD_ARG(NULL));
2126 break;
2127 }
2128 case bp_hyper_bar: {
2129 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2130 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2131 FALSE USE_ITT_BUILD_ARG(NULL));
2132 break;
2133 }
2134 case bp_hierarchical_bar: {
2135 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
2136 FALSE USE_ITT_BUILD_ARG(NULL));
2137 break;
2138 }
2139 case bp_tree_bar: {
2140 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2141 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2142 FALSE USE_ITT_BUILD_ARG(NULL));
2143 break;
2144 }
2145 default: {
2146 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2147 FALSE USE_ITT_BUILD_ARG(NULL));
2148 }
2149 }
2150 if (__kmp_tasking_mode != tskm_immediate_exec) {
2151 __kmp_task_team_sync(this_thr, team);
2152 } // if
2153 }
2154 }
2155}
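// Sketch of the split-reduction flow that __kmp_end_split_barrier() finishes
// (hypothetical caller, simplified to the plain barrier type): the primary
// thread returns from the gather half with status 0 while the workers are
// still parked in the release phase, so the primary completes the reduction
// and only then lets them go:
//
//   if (__kmp_barrier(bs_plain_barrier, gtid, TRUE /* is_split */,
//                     reduce_size, reduce_data, reduce_fn) == 0) {
//     // primary thread: publish the combined reduction result here
//     __kmp_end_split_barrier(bs_plain_barrier, gtid);
//   }
//
// reduce_size, reduce_data and reduce_fn stand in for the caller's reduction
// arguments.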
2156
2157void __kmp_join_barrier(int gtid) {
2158 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
2159 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2160
2161 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
2162
2163 kmp_info_t *this_thr = __kmp_threads[gtid];
2164 kmp_team_t *team;
2165 int tid;
2166#ifdef KMP_DEBUG
2167 int team_id;
2168#endif /* KMP_DEBUG */
2169#if USE_ITT_BUILD
2170 void *itt_sync_obj = NULL;
2171#if USE_ITT_NOTIFY
2172 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
2173 // Get object created at fork_barrier
2174 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2175#endif
2176#endif /* USE_ITT_BUILD */
2177#if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
2178 int nproc = this_thr->th.th_team_nproc;
2179#endif
2180 KMP_MB();
2181
2182 // Get current info
2183 team = this_thr->th.th_team;
2184 KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
2185 tid = __kmp_tid_from_gtid(gtid);
2186#ifdef KMP_DEBUG
2187 team_id = team->t.t_id;
2188 kmp_info_t *master_thread = this_thr->th.th_team_master;
2189 if (master_thread != team->t.t_threads[0]) {
2190 __kmp_print_structure();
2191 }
2192#endif /* KMP_DEBUG */
2193 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
2194 KMP_MB();
2195
2196 // Verify state
2197 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
2198 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
2199 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
2200 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
2201 gtid, team_id, tid));
2202
2203#if OMPT_SUPPORT
2204 if (ompt_enabled.enabled) {
2205#if OMPT_OPTIONAL
2206 ompt_data_t *my_task_data;
2207 ompt_data_t *my_parallel_data;
2208 void *codeptr = NULL;
2209 int ds_tid = this_thr->th.th_info.ds.ds_tid;
2210 if (KMP_MASTER_TID(ds_tid) &&
2211 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2212 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2213 codeptr = team->t.ompt_team_info.master_return_address;
2214 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
2215 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
2216 if (ompt_enabled.ompt_callback_sync_region) {
2217 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2218 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
2219 my_task_data, codeptr);
2220 }
2221 if (ompt_enabled.ompt_callback_sync_region_wait) {
2222 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2223 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
2224 my_task_data, codeptr);
2225 }
2226 if (!KMP_MASTER_TID(ds_tid))
2227 this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
2228#endif
2229 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
2230 }
2231#endif
2232
2233 if (__kmp_tasking_mode == tskm_extra_barrier) {
2234 __kmp_tasking_barrier(team, this_thr, gtid);
2235 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
2236 gtid, team_id, tid));
2237 }
2238#ifdef KMP_DEBUG
2239 if (__kmp_tasking_mode != tskm_immediate_exec) {
2240 KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
2241 "%p, th_task_team = %p\n",
2242 __kmp_gtid_from_thread(this_thr), team_id,
2243 team->t.t_task_team[this_thr->th.th_task_state],
2244 this_thr->th.th_task_team));
2245 KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
2246 }
2247#endif /* KMP_DEBUG */
2248
2249 /* Copy the blocktime info to the thread, where __kmp_wait_template() can
2250 access it when the team struct is not guaranteed to exist. Doing these
2251 loads causes a cache miss and slows down EPCC parallel by 2x. As a workaround,
2252 we do not perform the copy if blocktime=infinite, since the values are not
2253 used by __kmp_wait_template() in that case. */
2254 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2255#if KMP_USE_MONITOR
2256 this_thr->th.th_team_bt_intervals =
2257 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2258 this_thr->th.th_team_bt_set =
2259 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2260#else
2261 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2262#endif
2263 }
2264
2265#if USE_ITT_BUILD
2266 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2267 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2268#endif /* USE_ITT_BUILD */
2269
2270 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
2271 case bp_dist_bar: {
2272 __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2273 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2274 break;
2275 }
2276 case bp_hyper_bar: {
2277 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2278 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2279 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2280 break;
2281 }
2282 case bp_hierarchical_bar: {
2283 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2284 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2285 break;
2286 }
2287 case bp_tree_bar: {
2288 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
2289 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2290 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2291 break;
2292 }
2293 default: {
2294 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2295 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2296 }
2297 }
2298
2299 /* From this point on, the team data structure may be deallocated at any time
2300 by the primary thread - it is unsafe to reference it in any of the worker
2301 threads. Any per-team data items that need to be referenced before the
2302 end of the barrier should be moved to the kmp_task_team_t structs. */
2303 if (KMP_MASTER_TID(tid)) {
2304 if (__kmp_tasking_mode != tskm_immediate_exec) {
2305 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2306 }
2308 KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
2309 }
2310#if KMP_STATS_ENABLED
2311 // Have primary thread flag the workers to indicate they are now waiting for
2312 // the next parallel region. Also wake them up so they switch their timers to
2313 // idle.
2314 for (int i = 0; i < team->t.t_nproc; ++i) {
2315 kmp_info_t *team_thread = team->t.t_threads[i];
2316 if (team_thread == this_thr)
2317 continue;
2318 team_thread->th.th_stats->setIdleFlag();
2319 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2320 team_thread->th.th_sleep_loc != NULL)
2321 __kmp_null_resume_wrapper(team_thread);
2322 }
2323#endif
2324#if USE_ITT_BUILD
2325 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2326 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2327#endif /* USE_ITT_BUILD */
2328
2329#if USE_ITT_BUILD && USE_ITT_NOTIFY
2330 // Join barrier - report frame end
2331 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2332 __kmp_forkjoin_frames_mode &&
2333 (this_thr->th.th_teams_microtask == NULL || // either not in teams
2334 this_thr->th.th_teams_size.nteams == 1) && // or inside single team
2335 team->t.t_active_level == 1) {
2336 kmp_uint64 cur_time = __itt_get_timestamp();
2337 ident_t *loc = team->t.t_ident;
2338 kmp_info_t **other_threads = team->t.t_threads;
2339 switch (__kmp_forkjoin_frames_mode) {
2340 case 1:
2341 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2342 loc, nproc);
2343 break;
2344 case 2:
2345 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
2346 loc, nproc);
2347 break;
2348 case 3:
2349 if (__itt_metadata_add_ptr) {
2350 // Initialize with primary thread's wait time
2351 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
2352 // Set arrive time to zero to be able to check it in
2353 // __kmp_invoke_task(); the same is done inside the loop below
2354 this_thr->th.th_bar_arrive_time = 0;
2355 for (int i = 1; i < nproc; ++i) {
2356 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2357 other_threads[i]->th.th_bar_arrive_time = 0;
2358 }
2359 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2360 cur_time, delta, 0);
2361 }
2362 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2363 loc, nproc);
2364 this_thr->th.th_frame_time = cur_time;
2365 break;
2366 }
2367 }
2368#endif /* USE_ITT_BUILD */
2369 }
2370#if USE_ITT_BUILD
2371 else {
2372 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2373 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2374 }
2375#endif /* USE_ITT_BUILD */
2376
2377#if KMP_DEBUG
2378 if (KMP_MASTER_TID(tid)) {
2379 KA_TRACE(
2380 15,
2381 ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
2382 gtid, team_id, tid, nproc));
2383 }
2384#endif /* KMP_DEBUG */
2385
2386 // TODO now, mark worker threads as done so they may be disbanded
2387 KMP_MB(); // Flush all pending memory write invalidates.
2388 KA_TRACE(10,
2389 ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
2390
2391}
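// For __kmp_forkjoin_frames_mode == 3 above, the imbalance reported to ITT is
// the summed wait time across the team (sketch of the arithmetic):
//
//   delta = (cur_time - th_bar_arrive_time[primary])
//         + sum over i = 1..nproc-1 of (cur_time - th_bar_arrive_time[i])
//
// with every th_bar_arrive_time reset to zero afterwards so that it can be
// checked again in __kmp_invoke_task(), as the comment above notes.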
2392
2393// TODO release worker threads' fork barriers as we are ready instead of all at
2394// once
2395void __kmp_fork_barrier(int gtid, int tid) {
2396 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
2397 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2398 kmp_info_t *this_thr = __kmp_threads[gtid];
2399 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
2400#if USE_ITT_BUILD
2401 void *itt_sync_obj = NULL;
2402#endif /* USE_ITT_BUILD */
2403#ifdef KMP_DEBUG
2404 if (team)
2405 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
2406 (team != NULL) ? team->t.t_id : -1, tid));
2407#endif
2408 // th_team pointer only valid for primary thread here
2409 if (KMP_MASTER_TID(tid)) {
2410#if USE_ITT_BUILD && USE_ITT_NOTIFY
2411 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2412 // Create itt barrier object
2413 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
2414 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
2415 }
2416#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2417
2418#ifdef KMP_DEBUG
2419 KMP_DEBUG_ASSERT(team);
2420 kmp_info_t **other_threads = team->t.t_threads;
2421 int i;
2422
2423 // Verify state
2424 KMP_MB();
2425
2426 for (i = 1; i < team->t.t_nproc; ++i) {
2427 KA_TRACE(500,
2428 ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
2429 "== %u.\n",
2430 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
2431 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
2432 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
2433 KMP_DEBUG_ASSERT(
2434 (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
2435 ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
2436 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
2437 }
2438#endif
2439
2440 if (__kmp_tasking_mode != tskm_immediate_exec)
2441 __kmp_task_team_setup(this_thr, team);
2442
2443 /* The primary thread may have changed its blocktime between join barrier
2444 and fork barrier. Copy the blocktime info to the thread, where
2445 __kmp_wait_template() can access it when the team struct is not
2446 guaranteed to exist. */
2447 // See note about the corresponding code in __kmp_join_barrier() being
2448 // performance-critical
2449 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2450#if KMP_USE_MONITOR
2451 this_thr->th.th_team_bt_intervals =
2452 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2453 this_thr->th.th_team_bt_set =
2454 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2455#else
2456 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2457#endif
2458 }
2459 } // primary thread
2460
2461 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
2462 case bp_dist_bar: {
2463 __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2464 TRUE USE_ITT_BUILD_ARG(NULL));
2465 break;
2466 }
2467 case bp_hyper_bar: {
2468 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2469 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2470 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2471 break;
2472 }
2473 case bp_hierarchical_bar: {
2474 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2475 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2476 break;
2477 }
2478 case bp_tree_bar: {
2479 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2480 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2481 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2482 break;
2483 }
2484 default: {
2485 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2486 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2487 }
2488 }
2489
2490#if OMPT_SUPPORT
2491 if (ompt_enabled.enabled &&
2492 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2493 int ds_tid = this_thr->th.th_info.ds.ds_tid;
2494 ompt_data_t *task_data = (team)
2495 ? OMPT_CUR_TASK_DATA(this_thr)
2496 : &(this_thr->th.ompt_thread_info.task_data);
2497 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2498#if OMPT_OPTIONAL
2499 void *codeptr = NULL;
2500 if (KMP_MASTER_TID(ds_tid) &&
2501 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2502 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2503 codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
2504 if (ompt_enabled.ompt_callback_sync_region_wait) {
2505 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2506 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2507 codeptr);
2508 }
2509 if (ompt_enabled.ompt_callback_sync_region) {
2510 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2511 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2512 codeptr);
2513 }
2514#endif
2515 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2516 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2517 ompt_scope_end, NULL, task_data, 0, ds_tid,
2518 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2519 }
2520 }
2521#endif
2522
2523 // Early exit for reaping threads releasing forkjoin barrier
2524 if (TCR_4(__kmp_global.g.g_done)) {
2525 this_thr->th.th_task_team = NULL;
2526
2527#if USE_ITT_BUILD && USE_ITT_NOTIFY
2528 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2529 if (!KMP_MASTER_TID(tid)) {
2530 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2531 if (itt_sync_obj)
2532 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2533 }
2534 }
2535#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2536 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2537 return;
2538 }
2539
2540 /* We can now assume that a valid team structure has been allocated by the
2541 primary thread and propagated to all worker threads. The current thread,
2542 however, may not be part of the team, so we can't blindly assume that the
2543 team pointer is non-null. */
2544 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2545 KMP_DEBUG_ASSERT(team != NULL);
2546 tid = __kmp_tid_from_gtid(gtid);
2547
2548#if KMP_BARRIER_ICV_PULL
2549 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2550 __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2551 implicit task has this data before this function is called. We cannot
2552 modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
2553 thread struct, because it is not always the case that the threads arrays
2554 have been allocated when __kmp_fork_call() is executed. */
2555 {
2557 if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
2558 // Copy the initial ICVs from the primary thread's thread struct to the
2559 // implicit task for this tid.
2560 KA_TRACE(10,
2561 ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2562 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2563 tid, FALSE);
2564 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2565 &team->t.t_threads[0]
2566 ->th.th_bar[bs_forkjoin_barrier]
2567 .bb.th_fixed_icvs);
2568 }
2569 }
2570#endif // KMP_BARRIER_ICV_PULL
2571
2572 if (__kmp_tasking_mode != tskm_immediate_exec) {
2573 __kmp_task_team_sync(this_thr, team);
2574 }
2575
2576#if KMP_AFFINITY_SUPPORTED
2577 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2578 if (proc_bind == proc_bind_intel) {
2579 // Call dynamic affinity settings
2580 if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
2581 __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2582 }
2583 } else if (proc_bind != proc_bind_false) {
2584 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2585 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2586 __kmp_gtid_from_thread(this_thr),
2587 this_thr->th.th_current_place));
2588 } else {
2589 __kmp_affinity_bind_place(gtid);
2590 }
2591 }
2592#endif // KMP_AFFINITY_SUPPORTED
2593 // Perform the display affinity functionality
2594 if (__kmp_display_affinity) {
2595 if (team->t.t_display_affinity
2596#if KMP_AFFINITY_SUPPORTED
2597 || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
2598#endif
2599 ) {
2600 // NULL means use the affinity-format-var ICV
2601 __kmp_aux_display_affinity(gtid, NULL);
2602 this_thr->th.th_prev_num_threads = team->t.t_nproc;
2603 this_thr->th.th_prev_level = team->t.t_level;
2604 }
2605 }
2606 if (!KMP_MASTER_TID(tid))
2607 KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2608
2609#if USE_ITT_BUILD && USE_ITT_NOTIFY
2610 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2611 if (!KMP_MASTER_TID(tid)) {
2612 // Get correct barrier object
2613 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2614 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2615 } // (prepare called inside barrier_release)
2616 }
2617#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2618 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2619 team->t.t_id, tid));
2620}
2621
2622void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2623 kmp_internal_control_t *new_icvs, ident_t *loc) {
2624 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2625
2626 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2628
2629/* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2630 __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2631 implicit task has this data before this function is called. */
2632#if KMP_BARRIER_ICV_PULL
2633 /* Copy the ICVs into th_fixed_icvs in the primary thread's thread struct (which
2634 remains untouched), where all of the worker threads can access them and
2635 make their own copies after the barrier. */
2636 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2637 // allocated at this point
2638 copy_icvs(
2639 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2640 new_icvs);
2641 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2642 team->t.t_threads[0], team));
2643#elif KMP_BARRIER_ICV_PUSH
2644 // The ICVs will be propagated in the fork barrier, so nothing needs to be
2645 // done here.
2646 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2647 team->t.t_threads[0], team));
2648#else
2649 // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
2650 // time.
2651 ngo_load(new_icvs);
2652 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2653 // allocated at this point
2654 for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
2655 // TODO: GEH - pass in better source location info since usually NULL here
2656 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2657 f, team->t.t_threads[f], team));
2658 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2659 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2660 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2661 f, team->t.t_threads[f], team));
2662 }
2663 ngo_sync();
2664#endif // KMP_BARRIER_ICV_PULL
2665}
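// Summary sketch of the three ICV propagation strategies selected above:
//
//   #if KMP_BARRIER_ICV_PULL
//     // The primary thread publishes once into its fork/join th_fixed_icvs;
//     // each worker pulls its own copy in __kmp_fork_barrier():
//     copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
//               &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier]
//                    .bb.th_fixed_icvs);
//   #elif KMP_BARRIER_ICV_PUSH
//     // The ICVs ride along with the fork-barrier release (the ngo store of
//     // the cache line holding th_fixed_icvs and b_go); nothing to do here.
//   #else
//     // Linear O(nthreads) copy performed by the primary thread, as above.
//   #endif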