LLVM OpenMP 22.0.0git
kmp_dispatch.cpp
Go to the documentation of this file.
1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13/* Dynamic scheduling initialization and dispatch.
14 *
15 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16 * it may change values between parallel regions. __kmp_max_nth
17 * is the largest value __kmp_nth may take, 1 is the smallest.
18 */
19
20#include "kmp.h"
21#include "kmp_error.h"
22#include "kmp_i18n.h"
23#include "kmp_itt.h"
24#include "kmp_stats.h"
25#include "kmp_str.h"
26#if KMP_USE_X87CONTROL
27#include <float.h>
28#endif
29#include "kmp_lock.h"
30#include "kmp_dispatch.h"
31#if KMP_USE_HIER_SCHED
32#include "kmp_dispatch_hier.h"
33#endif
34
35#if OMPT_SUPPORT
36#include "ompt-specific.h"
37#endif
38
39/* ------------------------------------------------------------------------ */
40/* ------------------------------------------------------------------------ */
41
42void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43 kmp_info_t *th;
44
45 KMP_DEBUG_ASSERT(gtid_ref);
46
48 th = __kmp_threads[*gtid_ref];
49 if (th->th.th_root->r.r_active &&
50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51#if KMP_USE_DYNAMIC_LOCK
52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53#else
54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55#endif
56 }
57 }
58}
59
60void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61 kmp_info_t *th;
62
64 th = __kmp_threads[*gtid_ref];
65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67 }
68 }
69}
70
71// Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73 bool use_hier = false) {
74 // Pick up the nonmonotonic/monotonic bits from the scheduling type
75 // Nonmonotonic as default for dynamic schedule when no modifier is specified
76 int monotonicity = SCHEDULE_NONMONOTONIC;
77
78 // Let default be monotonic for executables
79 // compiled with OpenMP* 4.5 or less compilers
80 if (loc != NULL && loc->get_openmp_version() < 50)
81 monotonicity = SCHEDULE_MONOTONIC;
82
83 if (use_hier || __kmp_force_monotonic)
84 monotonicity = SCHEDULE_MONOTONIC;
85 else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86 monotonicity = SCHEDULE_NONMONOTONIC;
87 else if (SCHEDULE_HAS_MONOTONIC(schedule))
88 monotonicity = SCHEDULE_MONOTONIC;
89
90 return monotonicity;
91}
92
93#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
// Return floating point number rounded to two decimal places.
// Halves are rounded away from zero, for negative as well as positive
// values (the same sign handling as __kmp_get_round_val).
// The scaled value (num * 100) must be representable as an int.
static inline float __kmp_round_2decimal_val(float num) {
  // Bias in the direction of the sign; adding +0.5 unconditionally would
  // truncate negative values toward zero instead of rounding them.
  const double bias = (num < 0) ? -0.5 : 0.5;
  return (float)((int)(num * 100 + bias)) / 100;
}
// Convert a floating point number to the nearest integer; exact halves are
// rounded away from zero.
static inline int __kmp_get_round_val(float num) {
  if (num < 0) {
    return (int)(num - 0.5);
  }
  return (int)(num + 0.5);
}
101#endif
102
103template <typename T>
104inline void
107 typename traits_t<T>::unsigned_t nchunks, T nproc,
108 typename traits_t<T>::unsigned_t &init,
109 T &small_chunk, T &extras, T &p_extra) {
110
111#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
112 if (pr->flags.use_hybrid) {
113 kmp_info_t *th = __kmp_threads[__kmp_gtid_from_tid((int)id, team)];
115 (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
116 T pchunks = pr->u.p.pchunks;
117 T echunks = nchunks - pchunks;
118 T num_procs_with_pcore = pr->u.p.num_procs_with_pcore;
119 T num_procs_with_ecore = nproc - num_procs_with_pcore;
120 T first_thread_with_ecore = pr->u.p.first_thread_with_ecore;
121 T big_chunk =
122 pchunks / num_procs_with_pcore; // chunks per thread with p-core
123 small_chunk =
124 echunks / num_procs_with_ecore; // chunks per thread with e-core
125
126 extras =
127 (pchunks % num_procs_with_pcore) + (echunks % num_procs_with_ecore);
128
129 p_extra = (big_chunk - small_chunk);
130
131 if (type == KMP_HW_CORE_TYPE_CORE) {
132 if (id < first_thread_with_ecore) {
133 init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
134 } else {
135 init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
136 (id < extras ? id : extras);
137 }
138 } else {
139 if (id == first_thread_with_ecore) {
140 init = id * small_chunk + id * p_extra + (id < extras ? id : extras);
141 } else {
142 init = id * small_chunk + first_thread_with_ecore * p_extra +
143 (id < extras ? id : extras);
144 }
145 }
146 p_extra = (type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
147 return;
148 }
149#endif
150
151 small_chunk = nchunks / nproc; // chunks per thread
152 extras = nchunks % nproc;
153 p_extra = 0;
154 init = id * small_chunk + (id < extras ? id : extras);
155}
156
157#if KMP_STATIC_STEAL_ENABLED
// Values for steal_flag: the possible states of a private per-loop buffer.
enum {
  UNUSED, // 0
  CLAIMED, // 1: owner thread started initialization
  READY, // 2: available for stealing
  THIEF // 3: finished by owner, or claimed by thief
  // Possible state changes:
  //   UNUSED  -> CLAIMED  owner only, sync
  //   UNUSED  -> THIEF    thief only, sync
  //   CLAIMED -> READY    owner only, async
  //   READY   -> THIEF    owner only, async
  //   THIEF   -> READY    owner only, async
  //   THIEF   -> UNUSED   last thread finishing the loop, async
};
171#endif
172
173// Initialize a dispatch_private_info_template<T> buffer for a particular
174// type of schedule,chunk. The loop description is found in lb (lower bound),
175// ub (upper bound), and st (stride). nproc is the number of threads relevant
176// to the scheduling (often the number of threads in a team, but not always if
177// hierarchical scheduling is used). tid is the id of the thread calling
178// the function within the group of nproc threads. It will have a value
179// between 0 and nproc - 1. This is often just the thread id within a team, but
180// is not necessarily the case when using hierarchical scheduling.
181// loc is the source file location of the corresponding loop
182// gtid is the global thread id
// Fills in the private dispatch buffer `pr` for one loop: strips the
// modifier/nomerge/ordered bits from `schedule`, resolves runtime/auto/guided
// schedules to a concrete kind, computes the trip count `tc` from lb/ub/st,
// and then precomputes the schedule-specific fields (pr->u.p.count, lb, ub,
// parm1..parm4) in the switch below. The final schedule kind is stored in
// pr->schedule.
// NOTE(review): the listing's embedded numbering skips lines at several points
// in this function (each marked below); the skipped lines appear to be
// missing from this copy and must be checked against the upstream file.
183template <typename T>
// NOTE(review): the line(s) carrying the function name and its leading
// parameters appear to be missing here; `loc`, `gtid` and `pr` are used in the
// body without a visible declaration.
186 enum sched_type schedule, T lb, T ub,
187 typename traits_t<T>::signed_t st,
188#if USE_ITT_BUILD
189 kmp_uint64 *cur_chunk,
190#endif
191 typename traits_t<T>::signed_t chunk,
192 T nproc, T tid) {
193 typedef typename traits_t<T>::unsigned_t UT;
194 typedef typename traits_t<T>::floating_t DBL;
195
196 int active;
197 T tc;
198 kmp_info_t *th;
199 kmp_team_t *team;
200 int monotonicity;
201 bool use_hier;
202
203#ifdef KMP_DEBUG
204 typedef typename traits_t<T>::signed_t ST;
205 {
206 char *buff;
207 // create format specifiers before the debug output
208 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
209 "pr:%%p lb:%%%s ub:%%%s st:%%%s "
210 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
211 traits_t<T>::spec, traits_t<T>::spec,
212 traits_t<ST>::spec, traits_t<ST>::spec,
213 traits_t<T>::spec, traits_t<T>::spec);
214 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
215 __kmp_str_free(&buff);
216 }
217#endif
218 /* setup data */
219 th = __kmp_threads[gtid];
220 team = th->th.th_team;
221 active = !team->t.t_serialized;
222
223#if USE_ITT_BUILD
224 int itt_need_metadata_reporting =
225 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
226 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
227 team->t.t_active_level == 1;
228#endif
229
230#if KMP_USE_HIER_SCHED
231 use_hier = pr->flags.use_hier;
232#else
233 use_hier = false;
234#endif
235
236 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
237 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
238 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
239
240 /* Pick up the nomerge/ordered bits from the scheduling type */
241 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
242 pr->flags.nomerge = TRUE;
243 schedule =
244 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
245 } else {
246 pr->flags.nomerge = FALSE;
247 }
248 pr->type_size = traits_t<T>::type_size; // remember the size of variables
249 if (kmp_ord_lower & schedule) {
250 pr->flags.ordered = TRUE;
251 schedule =
252 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
253 } else {
254 pr->flags.ordered = FALSE;
255 }
256 // Ordered overrides nonmonotonic
257 if (pr->flags.ordered) {
258 monotonicity = SCHEDULE_MONOTONIC;
259 }
260
261 if (schedule == kmp_sch_static) {
262 schedule = __kmp_static;
263 } else {
264 if (schedule == kmp_sch_runtime) {
265 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
266 // not specified)
267 schedule = team->t.t_sched.r_sched_type;
268 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
269 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
270 if (pr->flags.ordered) // correct monotonicity for ordered loop if needed
271 monotonicity = SCHEDULE_MONOTONIC;
272 // Detail the schedule if needed (global controls are differentiated
273 // appropriately)
274 if (schedule == kmp_sch_guided_chunked) {
275 schedule = __kmp_guided;
276 } else if (schedule == kmp_sch_static) {
277 schedule = __kmp_static;
278 }
279 // Use the chunk size specified by OMP_SCHEDULE (or default if not
280 // specified)
281 chunk = team->t.t_sched.chunk;
282#if USE_ITT_BUILD
283 if (cur_chunk)
284 *cur_chunk = chunk;
285#endif
286#ifdef KMP_DEBUG
287 {
288 char *buff;
289 // create format specifiers before the debug output
290 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
291 "schedule:%%d chunk:%%%s\n",
292 traits_t<ST>::spec);
293 KD_TRACE(10, (buff, gtid, schedule, chunk));
294 __kmp_str_free(&buff);
295 }
296#endif
297 } else {
298 if (schedule == kmp_sch_guided_chunked) {
299 schedule = __kmp_guided;
300 }
301 if (chunk <= 0) {
302 chunk = KMP_DEFAULT_CHUNK;
303 }
304 }
305
306 if (schedule == kmp_sch_auto) {
307 // mapping and differentiation: in the __kmp_do_serial_initialize()
308 schedule = __kmp_auto;
309#ifdef KMP_DEBUG
310 {
311 char *buff;
312 // create format specifiers before the debug output
313 buff = __kmp_str_format(
314 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
315 "schedule:%%d chunk:%%%s\n",
316 traits_t<ST>::spec);
317 KD_TRACE(10, (buff, gtid, schedule, chunk));
318 __kmp_str_free(&buff);
319 }
320#endif
321 }
322#if KMP_STATIC_STEAL_ENABLED
323 // map nonmonotonic:dynamic to static steal
324 if (schedule == kmp_sch_dynamic_chunked) {
325 if (monotonicity == SCHEDULE_NONMONOTONIC)
326 schedule = kmp_sch_static_steal;
327 }
328#endif
329 /* guided analytical not safe for too many threads */
330 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
// NOTE(review): one line appears to be missing before the warning
// (presumably the fallback schedule assignment) — confirm.
332 KMP_WARNING(DispatchManyThreads);
333 }
334 if (schedule == kmp_sch_runtime_simd) {
335 // compiler provides simd_width in the chunk parameter
336 schedule = team->t.t_sched.r_sched_type;
337 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
338 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
339 // Detail the schedule if needed (global controls are differentiated
340 // appropriately)
341 if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
342 schedule == __kmp_static) {
// NOTE(review): the body of this branch appears to be missing from the
// listing — confirm.
344 } else {
345 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
346 schedule = kmp_sch_guided_simd;
347 }
348 chunk = team->t.t_sched.chunk * chunk;
349 }
350#if USE_ITT_BUILD
351 if (cur_chunk)
352 *cur_chunk = chunk;
353#endif
354#ifdef KMP_DEBUG
355 {
356 char *buff;
357 // create format specifiers before the debug output
358 buff = __kmp_str_format(
359 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
360 " chunk:%%%s\n",
361 traits_t<ST>::spec);
362 KD_TRACE(10, (buff, gtid, schedule, chunk));
363 __kmp_str_free(&buff);
364 }
365#endif
366 }
367 pr->u.p.parm1 = chunk;
368 }
369 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
370 "unknown scheduling type");
371
372 pr->u.p.count = 0;
373
// NOTE(review): the enclosing condition of the zero-stride check and the
// remaining arguments of __kmp_error_construct appear to be missing from the
// listing (two closing braces follow a single visible opener) — confirm.
375 if (st == 0) {
376 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
378 }
379 }
380 // compute trip count
381 if (st == 1) { // most common case
382 if (ub >= lb) {
383 tc = ub - lb + 1;
384 } else { // ub < lb
385 tc = 0; // zero-trip
386 }
387 } else if (st < 0) {
388 if (lb >= ub) {
389 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
390 // where the division needs to be unsigned regardless of the result type
391 tc = (UT)(lb - ub) / (-st) + 1;
392 } else { // lb < ub
393 tc = 0; // zero-trip
394 }
395 } else { // st > 0
396 if (ub >= lb) {
397 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
398 // where the division needs to be unsigned regardless of the result type
399 tc = (UT)(ub - lb) / st + 1;
400 } else { // ub < lb
401 tc = 0; // zero-trip
402 }
403 }
404
405#if KMP_STATS_ENABLED
406 if (KMP_MASTER_GTID(gtid)) {
407 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
408 }
409#endif
410
// Record the loop description and trip count in the private buffer; several
// schedule cases below overwrite lb/ub/count with per-thread values.
411 pr->u.p.lb = lb;
412 pr->u.p.ub = ub;
413 pr->u.p.st = st;
414 pr->u.p.tc = tc;
415
416#if KMP_OS_WINDOWS
417 pr->u.p.last_upper = ub + st;
418#endif /* KMP_OS_WINDOWS */
419
420 /* NOTE: only the active parallel region(s) has active ordered sections */
421
422 if (active) {
423 if (pr->flags.ordered) {
424 pr->ordered_bumped = 0;
425 pr->u.p.ordered_lower = 1;
426 pr->u.p.ordered_upper = 0;
427 }
428 }
429
430 switch (schedule) {
431#if KMP_STATIC_STEAL_ENABLED
// NOTE(review): the `case ...: {` label opening this branch appears to be
// missing; the trace message below names kmp_sch_static_steal.
433 T ntc, init = 0;
434
435 KD_TRACE(100,
436 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
437 gtid));
438
// ntc is the number of chunks: tc / chunk rounded up.
439 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
440 if (nproc > 1 && ntc >= nproc) {
441 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
442 T id = tid;
443 T small_chunk, extras, p_extra = 0;
444 kmp_uint32 old = UNUSED;
// Try to move this buffer's steal_flag from UNUSED to CLAIMED; `claimed` is
// false when the flag was no longer UNUSED.
445 int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED);
446 if (traits_t<T>::type_size > 4) {
447 // AC: TODO: check if 16-byte CAS available and use it to
448 // improve performance (probably wait for explicit request
449 // before spending time on this).
450 // For now use dynamically allocated per-private-buffer lock,
451 // free memory in __kmp_dispatch_next when status==0.
452 pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
453 __kmp_init_lock(pr->u.p.steal_lock);
454 }
455
456#if KMP_WEIGHTED_ITERATIONS_SUPPORTED
457 // Iterations are divided in a 60/40 skewed distribution among CORE and
458 // ATOM processors for hybrid systems
459 bool use_hybrid = false;
// NOTE(review): the declaration of `core_type` (assigned and read below)
// appears to be missing from the listing — confirm.
461 T first_thread_with_ecore = 0;
462 T num_procs_with_pcore = 0;
463 T num_procs_with_ecore = 0;
464 T p_ntc = 0, e_ntc = 0;
465 if (__kmp_is_hybrid_cpu() && __kmp_affinity.type != affinity_none &&
466 __kmp_affinity.type != affinity_explicit) {
467 use_hybrid = true;
468 core_type = (kmp_hw_core_type_t)th->th.th_topology_attrs.core_type;
469 if (core_type != KMP_HW_CORE_TYPE_UNKNOWN &&
470 __kmp_first_osid_with_ecore > -1) {
// Count the team's threads per core type and remember the tid of the thread
// whose os_id equals __kmp_first_osid_with_ecore; any other core type
// disables the hybrid distribution.
471 for (int i = 0; i < team->t.t_nproc; ++i) {
472 kmp_hw_core_type_t type = (kmp_hw_core_type_t)team->t.t_threads[i]
473 ->th.th_topology_attrs.core_type;
474 int id = team->t.t_threads[i]->th.th_topology_ids.os_id;
475 if (id == __kmp_first_osid_with_ecore) {
476 first_thread_with_ecore =
477 team->t.t_threads[i]->th.th_info.ds.ds_tid;
478 }
479 if (type == KMP_HW_CORE_TYPE_CORE) {
480 num_procs_with_pcore++;
481 } else if (type == KMP_HW_CORE_TYPE_ATOM) {
482 num_procs_with_ecore++;
483 } else {
484 use_hybrid = false;
485 break;
486 }
487 }
488 }
489 if (num_procs_with_pcore > 0 && num_procs_with_ecore > 0) {
490 float multiplier = 60.0 / 40.0;
491 float p_ratio = (float)num_procs_with_pcore / nproc;
492 float e_ratio = (float)num_procs_with_ecore / nproc;
493 float e_multiplier =
494 (float)1 /
495 (((multiplier * num_procs_with_pcore) / nproc) + e_ratio);
496 float p_multiplier = multiplier * e_multiplier;
497 p_ntc = __kmp_get_round_val(ntc * p_ratio * p_multiplier);
498 if ((int)p_ntc > (int)(ntc * p_ratio * p_multiplier))
499 e_ntc =
500 (int)(__kmp_round_2decimal_val(ntc * e_ratio * e_multiplier));
501 else
502 e_ntc = __kmp_get_round_val(ntc * e_ratio * e_multiplier);
503 KMP_DEBUG_ASSERT(ntc == p_ntc + e_ntc);
504
505 // Use regular static steal if not enough chunks for skewed
506 // distribution
507 use_hybrid = (use_hybrid && (p_ntc >= num_procs_with_pcore &&
508 e_ntc >= num_procs_with_ecore)
509 ? true
510 : false);
511 } else {
512 use_hybrid = false;
513 }
514 }
515 pr->flags.use_hybrid = use_hybrid;
516 pr->u.p.pchunks = p_ntc;
517 pr->u.p.num_procs_with_pcore = num_procs_with_pcore;
518 pr->u.p.first_thread_with_ecore = first_thread_with_ecore;
519
520 if (use_hybrid) {
521 KMP_DEBUG_ASSERT(nproc == num_procs_with_pcore + num_procs_with_ecore);
522 T big_chunk = p_ntc / num_procs_with_pcore;
523 small_chunk = e_ntc / num_procs_with_ecore;
524
525 extras =
526 (p_ntc % num_procs_with_pcore) + (e_ntc % num_procs_with_ecore);
527
528 p_extra = (big_chunk - small_chunk);
529
530 if (core_type == KMP_HW_CORE_TYPE_CORE) {
531 if (id < first_thread_with_ecore) {
532 init =
533 id * small_chunk + id * p_extra + (id < extras ? id : extras);
534 } else {
535 init = id * small_chunk + (id - num_procs_with_ecore) * p_extra +
536 (id < extras ? id : extras);
537 }
538 } else {
539 if (id == first_thread_with_ecore) {
540 init =
541 id * small_chunk + id * p_extra + (id < extras ? id : extras);
542 } else {
543 init = id * small_chunk + first_thread_with_ecore * p_extra +
544 (id < extras ? id : extras);
545 }
546 }
547 p_extra = (core_type == KMP_HW_CORE_TYPE_CORE) ? p_extra : 0;
548 } else
549#endif
550 {
551 small_chunk = ntc / nproc;
552 extras = ntc % nproc;
553 init = id * small_chunk + (id < extras ? id : extras);
554 p_extra = 0;
555 }
// For this schedule count/ub hold chunk indexes: [init, ub) is the range of
// chunks owned by this thread.
556 pr->u.p.count = init;
557 if (claimed) { // are we succeeded in claiming own buffer?
558 pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
559 // Other threads will inspect steal_flag when searching for a victim.
560 // READY means other threads may steal from this thread from now on.
561 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
562 } else {
563 // other thread has stolen whole our range
564 KMP_DEBUG_ASSERT(pr->steal_flag == THIEF);
565 pr->u.p.ub = init; // mark there is no iterations to work on
566 }
567 pr->u.p.parm2 = ntc; // save number of chunks
568 // parm3 is the number of times to attempt stealing which is
569 // nproc (just a heuristics, could be optimized later on).
570 pr->u.p.parm3 = nproc;
571 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
572 break;
573 } else {
574 /* too few chunks: switching to kmp_sch_dynamic_chunked */
575 schedule = kmp_sch_dynamic_chunked;
576 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
577 "kmp_sch_dynamic_chunked\n",
578 gtid));
579 goto dynamic_init;
580 break;
581 } // if
582 } // case
583#endif
// NOTE(review): the `case ...: {` label opening this branch appears to be
// missing; the trace message below names kmp_sch_static_balanced.
585 T init, limit;
586
587 KD_TRACE(
588 100,
589 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
590 gtid));
591
592 if (nproc > 1) {
593 T id = tid;
594
595 if (tc < nproc) {
596 if (id < tc) {
597 init = id;
598 limit = id;
599 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
600 } else {
601 pr->u.p.count = 1; /* means no more chunks to execute */
602 pr->u.p.parm1 = FALSE;
603 break;
604 }
605 } else {
606 T small_chunk = tc / nproc;
607 T extras = tc % nproc;
608 init = id * small_chunk + (id < extras ? id : extras);
609 limit = init + small_chunk - (id < extras ? 0 : 1);
610 pr->u.p.parm1 = (id == nproc - 1);
611 }
612 } else {
613 if (tc > 0) {
614 init = 0;
615 limit = tc - 1;
616 pr->u.p.parm1 = TRUE;
617 } else {
618 // zero trip count
619 pr->u.p.count = 1; /* means no more chunks to execute */
620 pr->u.p.parm1 = FALSE;
621 break;
622 }
623 }
624#if USE_ITT_BUILD
625 // Calculate chunk for metadata report
626 if (itt_need_metadata_reporting)
627 if (cur_chunk)
628 *cur_chunk = limit - init + 1;
629#endif
// Translate the iteration indexes [init, limit] into loop-variable bounds.
630 if (st == 1) {
631 pr->u.p.lb = lb + init;
632 pr->u.p.ub = lb + limit;
633 } else {
634 // calculated upper bound, "ub" is user-defined upper bound
635 T ub_tmp = lb + limit * st;
636 pr->u.p.lb = lb + init * st;
637 // adjust upper bound to "ub" if needed, so that MS lastprivate will match
638 // it exactly
639 if (st > 0) {
640 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
641 } else {
642 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
643 }
644 }
645 if (pr->flags.ordered) {
646 pr->u.p.ordered_lower = init;
647 pr->u.p.ordered_upper = limit;
648 }
649 break;
650 } // case
// NOTE(review): the `case ...: {` label opening this branch appears to be
// missing; the trace message below reads "runtime(simd:static)".
652 // similar to balanced, but chunk adjusted to multiple of simd width
653 T nth = nproc;
654 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
655 " -> falling-through to static_greedy\n",
656 gtid));
657 schedule = kmp_sch_static_greedy;
// Per-thread share rounded up to a multiple of chunk; the mask arithmetic
// presumably relies on chunk being a power of two — TODO confirm.
658 if (nth > 1)
659 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
660 else
661 pr->u.p.parm1 = tc;
662 break;
663 } // case
// NOTE(review): the `case ...: {` label(s) opening this branch appear to be
// missing; the trace message below names kmp_sch_guided_iterative_chunked.
666 KD_TRACE(
667 100,
668 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
669 " case\n",
670 gtid));
671
672 if (nproc > 1) {
673 if ((2L * chunk + 1) * nproc >= tc) {
674 /* chunk size too large, switch to dynamic */
675 schedule = kmp_sch_dynamic_chunked;
676 goto dynamic_init;
677 } else {
678 // when remaining iters become less than parm2 - switch to dynamic
679 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
680 *(double *)&pr->u.p.parm3 =
681 guided_flt_param / (double)nproc; // may occupy parm3 and parm4
682 }
683 } else {
684 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
685 "kmp_sch_static_greedy\n",
686 gtid));
687 schedule = kmp_sch_static_greedy;
688 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
689 KD_TRACE(
690 100,
691 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
692 gtid));
693 pr->u.p.parm1 = tc;
694 } // if
695 } // case
696 break;
// NOTE(review): the `case ...: {` label opening this branch appears to be
// missing; the trace message below names kmp_sch_guided_analytical_chunked.
698 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
699 "kmp_sch_guided_analytical_chunked case\n",
700 gtid));
701
702 if (nproc > 1) {
703 if ((2L * chunk + 1) * nproc >= tc) {
704 /* chunk size too large, switch to dynamic */
705 schedule = kmp_sch_dynamic_chunked;
706 goto dynamic_init;
707 } else {
708 /* commonly used term: (2 nproc - 1)/(2 nproc) */
709 DBL x;
710
711#if KMP_USE_X87CONTROL
712 /* Linux* OS already has 64-bit computation by default for long double,
713 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
714 Windows* OS on IA-32 architecture, we need to set precision to 64-bit
715 instead of the default 53-bit. Even though long double doesn't work
716 on Windows* OS on Intel(R) 64, the resulting lack of precision is not
717 expected to impact the correctness of the algorithm, but this has not
718 been mathematically proven. */
719 // save original FPCW and set precision to 64-bit, as
720 // Windows* OS on IA-32 architecture defaults to 53-bit
721 unsigned int oldFpcw = _control87(0, 0);
722 _control87(_PC_64, _MCW_PC); // 0,0x30000
723#endif
724 /* value used for comparison in solver for cross-over point */
725 KMP_ASSERT(tc > 0);
726 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
727
728 /* crossover point--chunk indexes equal to or greater than
729 this point switch to dynamic-style scheduling */
730 UT cross;
731
732 /* commonly used term: (2 nproc - 1)/(2 nproc) */
733 x = 1.0 - 0.5 / (double)nproc;
734
735#ifdef KMP_DEBUG
736 { // test natural alignment
737 struct _test_a {
738 char a;
739 union {
740 char b;
741 DBL d;
742 };
743 } t;
744 ptrdiff_t natural_alignment =
745 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
746 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
747 // long)natural_alignment );
// NOTE(review): the opening of the alignment assertion appears to be missing;
// only the tail of its expression remains — confirm.
749 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
750 }
751#endif // KMP_DEBUG
752
753 /* save the term in thread private dispatch structure */
754 *(DBL *)&pr->u.p.parm3 = x;
755
756 /* solve for the crossover point to the nearest integer i for which C_i
757 <= chunk */
758 {
759 UT left, right, mid;
760 long double p;
761
762 /* estimate initial upper and lower bound */
763
764 /* doesn't matter what value right is as long as it is positive, but
765 it affects performance of the solver */
766 right = 229;
767 p = __kmp_pow<UT>(x, right);
768 if (p > target) {
// Squaring p while doubling right keeps p equal to x^right.
769 do {
770 p *= p;
771 right <<= 1;
772 } while (p > target && right < (1 << 27));
773 /* lower bound is previous (failed) estimate of upper bound */
774 left = right >> 1;
775 } else {
776 left = 0;
777 }
778
779 /* bisection root-finding method */
780 while (left + 1 < right) {
781 mid = (left + right) / 2;
782 if (__kmp_pow<UT>(x, mid) > target) {
783 left = mid;
784 } else {
785 right = mid;
786 }
787 } // while
788 cross = right;
789 }
790 /* assert sanity of computed crossover point */
791 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
792 __kmp_pow<UT>(x, cross) <= target);
793
794 /* save the crossover point in thread private dispatch structure */
795 pr->u.p.parm2 = cross;
796
797// C75803
798#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
799#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
800#else
801#define GUIDED_ANALYTICAL_WORKAROUND (x)
802#endif
803 /* dynamic-style scheduling offset */
804 pr->u.p.count = tc -
// NOTE(review): the head of the call whose arguments follow appears to be
// missing from the listing — confirm.
806 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
807 cross * chunk;
808#if KMP_USE_X87CONTROL
809 // restore FPCW
810 _control87(oldFpcw, _MCW_PC);
811#endif
812 } // if
813 } else {
814 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
815 "kmp_sch_static_greedy\n",
816 gtid));
817 schedule = kmp_sch_static_greedy;
818 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
819 pr->u.p.parm1 = tc;
820 } // if
821 } // case
822 break;
// NOTE(review): the `case ...:` label opening this branch appears to be
// missing; the trace message below names kmp_sch_static_greedy.
824 KD_TRACE(
825 100,
826 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
827 gtid));
828 pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
829 break;
// NOTE(review): the `case ...:` label(s) preceding dynamic_init appear to be
// missing; the trace message below names kmp_sch_static_chunked and
// kmp_sch_dynamic_chunked.
832 dynamic_init:
833 if (tc == 0)
834 break;
// Clamp the chunk size stored in parm1 to the range [1, tc].
835 if (pr->u.p.parm1 <= 0)
836 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
837 else if (pr->u.p.parm1 > tc)
838 pr->u.p.parm1 = tc;
839 // Store the total number of chunks to prevent integer overflow during
840 // bounds calculations in the get next chunk routine.
841 pr->u.p.parm2 = (tc / pr->u.p.parm1) + (tc % pr->u.p.parm1 ? 1 : 0);
842 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
843 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
844 gtid));
845 break;
846 case kmp_sch_trapezoidal: {
847 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
848
849 T parm1, parm2, parm3, parm4;
850 KD_TRACE(100,
851 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
852 gtid));
853
854 parm1 = chunk;
855
856 /* F : size of the first cycle */
857 parm2 = (tc / (2 * nproc));
858
859 if (parm2 < 1) {
860 parm2 = 1;
861 }
862
863 /* L : size of the last cycle. Make sure the last cycle is not larger
864 than the first cycle. */
865 if (parm1 < 1) {
866 parm1 = 1;
867 } else if (parm1 > parm2) {
868 parm1 = parm2;
869 }
870
871 /* N : number of cycles */
872 parm3 = (parm2 + parm1);
873 parm3 = (2 * tc + parm3 - 1) / parm3;
874
875 if (parm3 < 2) {
876 parm3 = 2;
877 }
878
879 /* sigma : decreasing incr of the trapezoid */
880 parm4 = (parm3 - 1);
881 parm4 = (parm2 - parm1) / parm4;
882
883 // pointless check, because parm4 >= 0 always
884 // if ( parm4 < 0 ) {
885 // parm4 = 0;
886 //}
887
888 pr->u.p.parm1 = parm1;
889 pr->u.p.parm2 = parm2;
890 pr->u.p.parm3 = parm3;
891 pr->u.p.parm4 = parm4;
892 } // case
893 break;
894
895 default: {
896 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
897 KMP_HNT(GetNewerLibrary), // Hint
898 __kmp_msg_null // Variadic argument list terminator
899 );
900 } break;
901 } // switch
// Publish the schedule kind that was finally selected above.
902 pr->schedule = schedule;
903}
904
905#if KMP_USE_HIER_SCHED
906template <typename T>
907inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
908 typename traits_t<T>::signed_t st);
909template <>
910inline void
911__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
912 kmp_int32 ub, kmp_int32 st) {
915 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
916}
917template <>
918inline void
919__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
920 kmp_uint32 ub, kmp_int32 st) {
923 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
924}
925template <>
926inline void
927__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
928 kmp_int64 ub, kmp_int64 st) {
931 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
932}
933template <>
934inline void
935__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
936 kmp_uint64 ub, kmp_int64 st) {
939 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
940}
941
942// free all the hierarchy scheduling memory associated with the team
944 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
945 for (int i = 0; i < num_disp_buff; ++i) {
946 // type does not matter here so use kmp_int32
947 auto sh =
948 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
949 &team->t.t_disp_buffer[i]);
950 if (sh->hier) {
951 sh->hier->deallocate();
952 __kmp_free(sh->hier);
953 }
954 }
955}
956#endif
957
958// UT - unsigned flavor of T, ST - signed flavor of T,
959// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
960template <typename T>
961static void
962__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
963 T ub, typename traits_t<T>::signed_t st,
964 typename traits_t<T>::signed_t chunk, int push_ws) {
965 typedef typename traits_t<T>::unsigned_t UT;
966
967 int active;
968 kmp_info_t *th;
969 kmp_team_t *team;
970 kmp_uint32 my_buffer_index;
973
975 sizeof(dispatch_private_info));
977 sizeof(dispatch_shared_info));
979
982
984
985#if INCLUDE_SSC_MARKS
986 SSC_MARK_DISPATCH_INIT();
987#endif
988#ifdef KMP_DEBUG
989 typedef typename traits_t<T>::signed_t ST;
990 {
991 char *buff;
992 // create format specifiers before the debug output
993 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
994 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
995 traits_t<ST>::spec, traits_t<T>::spec,
996 traits_t<T>::spec, traits_t<ST>::spec);
997 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
998 __kmp_str_free(&buff);
999 }
1000#endif
1001 /* setup data */
1002 th = __kmp_threads[gtid];
1003 team = th->th.th_team;
1004 active = !team->t.t_serialized;
1005 th->th.th_ident = loc;
1006
1007 // Any half-decent optimizer will remove this test when the blocks are empty
1008 // since the macros expand to nothing
1009 // when statistics are disabled.
1010 if (schedule == __kmp_static) {
1011 KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
1012 } else {
1013 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
1014 }
1015
1016#if KMP_USE_HIER_SCHED
1017 // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
1018 // Hierarchical scheduling does not work with ordered, so if ordered is
1019 // detected, then revert back to threaded scheduling.
1020 bool ordered;
1021 enum sched_type my_sched = schedule;
1022 my_buffer_index = th->th.th_dispatch->th_disp_index;
1023 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1024 &th->th.th_dispatch
1025 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1026 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
1027 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
1028 my_sched =
1029 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
1030 ordered = (kmp_ord_lower & my_sched);
1031 if (pr->flags.use_hier) {
1032 if (ordered) {
1033 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
1034 "Disabling hierarchical scheduling.\n",
1035 gtid));
1036 pr->flags.use_hier = FALSE;
1037 }
1038 }
1039 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
1040 // Don't use hierarchical for ordered parallel loops and don't
1041 // use the runtime hierarchy if one was specified in the program
1042 if (!ordered && !pr->flags.use_hier)
1043 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
1044 }
1045#endif // KMP_USE_HIER_SCHED
1046
1047#if USE_ITT_BUILD
1048 kmp_uint64 cur_chunk = chunk;
1049 int itt_need_metadata_reporting =
1050 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
1051 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
1052 team->t.t_active_level == 1;
1053#endif
1054 if (!active) {
1055 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1056 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1057 } else {
1058 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1059 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1060
1061 my_buffer_index = th->th.th_dispatch->th_disp_index++;
1062
1063 /* What happens when number of threads changes, need to resize buffer? */
1064 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1065 &th->th.th_dispatch
1066 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1067 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
1068 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
1069 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
1070 my_buffer_index));
1071 if (sh->buffer_index != my_buffer_index) { // too many loops in progress?
1072 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d"
1073 " sh->buffer_index:%d\n",
1074 gtid, my_buffer_index, sh->buffer_index));
1075 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
1077 // Note: KMP_WAIT() cannot be used there: buffer index and
1078 // my_buffer_index are *always* 32-bit integers.
1079 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
1080 "sh->buffer_index:%d\n",
1081 gtid, my_buffer_index, sh->buffer_index));
1082 }
1083 }
1084
1085 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
1086#if USE_ITT_BUILD
1087 &cur_chunk,
1088#endif
1089 chunk, (T)th->th.th_team_nproc,
1090 (T)th->th.th_info.ds.ds_tid);
1091 if (active) {
1092 if (pr->flags.ordered == 0) {
1093 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
1094 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
1095 } else {
1096 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
1097 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
1098 }
1099 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
1100 th->th.th_dispatch->th_dispatch_sh_current =
1102#if USE_ITT_BUILD
1103 if (pr->flags.ordered) {
1104 __kmp_itt_ordered_init(gtid);
1105 }
1106 // Report loop metadata
1107 if (itt_need_metadata_reporting) {
1108 // Only report metadata by primary thread of active team at level 1
1109 kmp_uint64 schedtype = 0;
1110 switch (schedule) {
1112 case kmp_sch_static_balanced: // Chunk is calculated in the switch above
1113 break;
1115 cur_chunk = pr->u.p.parm1;
1116 break;
1118 schedtype = 1;
1119 break;
1123 schedtype = 2;
1124 break;
1125 default:
1126 // Should we put this case under "static"?
1127 // case kmp_sch_static_steal:
1128 schedtype = 3;
1129 break;
1130 }
1131 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
1132 }
1133#if KMP_USE_HIER_SCHED
1134 if (pr->flags.use_hier) {
1135 pr->u.p.count = 0;
1136 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
1137 }
1138#endif // KMP_USE_HIER_SCHED
1139#endif /* USE_ITT_BUILD */
1140 }
1141
1142#ifdef KMP_DEBUG
1143 {
1144 char *buff;
1145 // create format specifiers before the debug output
1146 buff = __kmp_str_format(
1147 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
1148 "lb:%%%s ub:%%%s"
1149 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
1150 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1151 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
1152 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1153 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
1154 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
1155 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
1156 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1157 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1158 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
1159 __kmp_str_free(&buff);
1160 }
1161#endif
1162#if OMPT_SUPPORT && OMPT_OPTIONAL
1163 if (ompt_enabled.ompt_callback_work) {
1164 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1166 ompt_callbacks.ompt_callback(ompt_callback_work)(
1167 ompt_get_work_schedule(pr->schedule), ompt_scope_begin,
1168 &(team_info->parallel_data), &(task_info->task_data), pr->u.p.tc,
1169 OMPT_LOAD_RETURN_ADDRESS(gtid));
1170 }
1171#endif
1172 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
1173}
1174
1175/* For ordered loops, either __kmp_dispatch_finish() should be called after
1176 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1177 * every chunk of iterations. If the ordered section(s) were not executed
1178 * for this iteration (or every iteration in this chunk), we need to set the
1179 * ordered iteration counters so that the next thread can proceed. */
1180template <typename UT>
1181static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1182 typedef typename traits_t<UT>::signed_t ST;
1184 kmp_info_t *th = __kmp_threads[gtid];
1185
1186 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1187 if (!th->th.th_team->t.t_serialized) {
1188
1190 reinterpret_cast<dispatch_private_info_template<UT> *>(
1191 th->th.th_dispatch->th_dispatch_pr_current);
1193 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1194 th->th.th_dispatch->th_dispatch_sh_current);
1195 KMP_DEBUG_ASSERT(pr);
1196 KMP_DEBUG_ASSERT(sh);
1197 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1198 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1199
1200 if (pr->ordered_bumped) {
1201 KD_TRACE(
1202 1000,
1203 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1204 gtid));
1205 pr->ordered_bumped = 0;
1206 } else {
1207 UT lower = pr->u.p.ordered_lower;
1208
1209#ifdef KMP_DEBUG
1210 {
1211 char *buff;
1212 // create format specifiers before the debug output
1213 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1214 "ordered_iteration:%%%s lower:%%%s\n",
1215 traits_t<UT>::spec, traits_t<UT>::spec);
1216 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1217 __kmp_str_free(&buff);
1218 }
1219#endif
1220
1221 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1223 KMP_MB(); /* is this necessary? */
1224#ifdef KMP_DEBUG
1225 {
1226 char *buff;
1227 // create format specifiers before the debug output
1228 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1229 "ordered_iteration:%%%s lower:%%%s\n",
1230 traits_t<UT>::spec, traits_t<UT>::spec);
1231 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1232 __kmp_str_free(&buff);
1233 }
1234#endif
1235
1236 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1237 } // if
1238 } // if
1239 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1240}
1241
#ifdef KMP_GOMP_COMPAT

/* Per-chunk completion hook for ordered dispatch loops (compiled only when
 * KMP_GOMP_COMPAT is defined).
 *
 * UT   - unsigned integer type of the loop's iteration counters.
 * gtid - global thread id of the calling thread; indexes __kmp_threads.
 * loc  - source location of the loop; not used by this function.
 *
 * Does nothing (apart from tracing) when the team is serialized. Otherwise
 * the chunk size is computed as ordered_upper - ordered_lower + 1:
 *  - if pr->ordered_bumped equals the chunk size, only the private counter
 *    is cleared;
 *  - otherwise the thread waits on the shared ordered_iteration counter
 *    against ordered_lower, clears ordered_bumped, and atomically adds the
 *    remainder (chunk size minus ordered_bumped) to the shared counter. */
template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  __kmp_assert_valid_gtid(gtid);
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    // Private (per-thread) and shared (per-team) descriptors of the loop
    // currently being dispatched by this thread.
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1; // number of iterations in this chunk

    if (pr->ordered_bumped == inc) {
      // ordered_bumped already accounts for the whole chunk; the shared
      // counter is left untouched.
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
    } else {
      // Only the part of the chunk not covered by ordered_bumped still has
      // to be added to the shared counter.
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      // Block until the shared counter has reached the start of this chunk
      // (NOTE(review): __kmp_ge is presumably a ">=" predicate -- confirm).
      __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
                     __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
      // TODO: check whether inc should be unsigned or signed. It is declared
      // as UT here but converted to ST when passed to test_then_add<ST> below.
#ifdef KMP_DEBUG
      {
        char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      // Advance the shared counter past the remaining iterations of the chunk.
      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */
1321
1322template <typename T>
1326 kmp_int32 *p_last, T *p_lb, T *p_ub,
1327 typename traits_t<T>::signed_t *p_st, T nproc,
1328 T tid) {
1329 typedef typename traits_t<T>::unsigned_t UT;
1330 typedef typename traits_t<T>::signed_t ST;
1331 typedef typename traits_t<T>::floating_t DBL;
1332 int status = 0;
1333 bool last = false;
1334 T start;
1335 ST incr;
1336 UT limit, trip, init;
1337 kmp_info_t *th = __kmp_threads[gtid];
1338 kmp_team_t *team = th->th.th_team;
1339
1340 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1341 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1342 KMP_DEBUG_ASSERT(pr);
1343 KMP_DEBUG_ASSERT(sh);
1344 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1345#ifdef KMP_DEBUG
1346 {
1347 char *buff;
1348 // create format specifiers before the debug output
1349 buff =
1350 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1351 "sh:%%p nproc:%%%s tid:%%%s\n",
1352 traits_t<T>::spec, traits_t<T>::spec);
1353 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1354 __kmp_str_free(&buff);
1355 }
1356#endif
1357
1358 // zero trip count
1359 if (pr->u.p.tc == 0) {
1360 KD_TRACE(10,
1361 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1362 "zero status:%d\n",
1363 gtid, status));
1364 return 0;
1365 }
1366
1367 switch (pr->schedule) {
1368#if KMP_STATIC_STEAL_ENABLED
1369 case kmp_sch_static_steal: {
1370 T chunk = pr->u.p.parm1;
1371 UT nchunks = pr->u.p.parm2;
1372 KD_TRACE(100,
1373 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1374 gtid));
1375
1376 trip = pr->u.p.tc - 1;
1377
1378 if (traits_t<T>::type_size > 4) {
1379 // use lock for 8-byte induction variable.
1380 // TODO (optional): check presence and use 16-byte CAS
1381 kmp_lock_t *lck = pr->u.p.steal_lock;
1382 KMP_DEBUG_ASSERT(lck != NULL);
1383 if (pr->u.p.count < (UT)pr->u.p.ub) {
1384 KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1385 __kmp_acquire_lock(lck, gtid);
1386 // try to get own chunk of iterations
1387 init = (pr->u.p.count)++;
1388 status = (init < (UT)pr->u.p.ub);
1389 __kmp_release_lock(lck, gtid);
1390 } else {
1391 status = 0; // no own chunks
1392 }
1393 if (!status) { // try to steal
1394 kmp_lock_t *lckv; // victim buffer's lock
1395 T while_limit = pr->u.p.parm3;
1396 T while_index = 0;
1397 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1398 __kmp_dispatch_num_buffers; // current loop index
1399 // note: victim thread can potentially execute another loop
1400 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1401 while ((!status) && (while_limit != ++while_index)) {
1403 T remaining;
1404 T victimId = pr->u.p.parm4;
1405 T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1406 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1407 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1409 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1410 oldVictimId != victimId) {
1411 victimId = (victimId + 1) % nproc;
1412 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1413 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1415 }
1416 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1417 continue; // try once more (nproc attempts in total)
1418 }
1419 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1420 kmp_uint32 old = UNUSED;
1421 // try to steal whole range from inactive victim
1422 status = v->steal_flag.compare_exchange_strong(old, THIEF);
1423 if (status) {
1424 // initialize self buffer with victim's whole range of chunks
1425 T id = victimId;
1426 T small_chunk = 0, extras = 0, p_extra = 0;
1427 __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1428 init, small_chunk, extras,
1429 p_extra);
1430 __kmp_acquire_lock(lck, gtid);
1431 pr->u.p.count = init + 1; // exclude one we execute immediately
1432 pr->u.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
1433 __kmp_release_lock(lck, gtid);
1434 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1435 // no need to reinitialize other thread invariants: lb, st, etc.
1436#ifdef KMP_DEBUG
1437 {
1438 char *buff;
1439 // create format specifiers before the debug output
1440 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1441 "stolen chunks from T#%%d, "
1442 "count:%%%s ub:%%%s\n",
1443 traits_t<UT>::spec, traits_t<T>::spec);
1444 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1445 __kmp_str_free(&buff);
1446 }
1447#endif
1448 // activate non-empty buffer and let others steal from us
1449 if (pr->u.p.count < (UT)pr->u.p.ub)
1450 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1451 break;
1452 }
1453 }
1454 if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1455 v->u.p.count >= (UT)v->u.p.ub) {
1456 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1457 continue; // no chunks to steal, try next victim
1458 }
1459 lckv = v->u.p.steal_lock;
1460 KMP_ASSERT(lckv != NULL);
1461 __kmp_acquire_lock(lckv, gtid);
1462 limit = v->u.p.ub; // keep initial ub
1463 if (v->u.p.count >= limit) {
1464 __kmp_release_lock(lckv, gtid);
1465 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid
1466 continue; // no chunks to steal, try next victim
1467 }
1468
1469 // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
1470 // TODO: is this heuristics good enough??
1471 remaining = limit - v->u.p.count;
1472 if (remaining > 7) {
1473 // steal 1/4 of remaining
1474 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1475 init = (v->u.p.ub -= (remaining >> 2));
1476 } else {
1477 // steal 1 chunk of 1..7 remaining
1478 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1479 init = (v->u.p.ub -= 1);
1480 }
1481 __kmp_release_lock(lckv, gtid);
1482#ifdef KMP_DEBUG
1483 {
1484 char *buff;
1485 // create format specifiers before the debug output
1486 buff = __kmp_str_format(
1487 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1488 "count:%%%s ub:%%%s\n",
1489 traits_t<UT>::spec, traits_t<UT>::spec);
1490 KD_TRACE(10, (buff, gtid, victimId, init, limit));
1491 __kmp_str_free(&buff);
1492 }
1493#endif
1494 KMP_DEBUG_ASSERT(init + 1 <= limit);
1495 pr->u.p.parm4 = victimId; // remember victim to steal from
1496 status = 1;
1497 // now update own count and ub with stolen range excluding init chunk
1498 __kmp_acquire_lock(lck, gtid);
1499 pr->u.p.count = init + 1;
1500 pr->u.p.ub = limit;
1501 __kmp_release_lock(lck, gtid);
1502 // activate non-empty buffer and let others steal from us
1503 if (init + 1 < limit)
1504 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1505 } // while (search for victim)
1506 } // if (try to find victim and steal)
1507 } else {
1508 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1509 // as all operations on pair (count, ub) must be done atomically
1510 typedef union {
1511 struct {
1512 UT count;
1513 T ub;
1514 } p;
1515 kmp_int64 b;
1516 } union_i4;
1517 union_i4 vold, vnew;
1518 if (pr->u.p.count < (UT)pr->u.p.ub) {
1519 KMP_DEBUG_ASSERT(pr->steal_flag == READY);
1520 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1521 vnew.b = vold.b;
1522 vnew.p.count++; // get chunk from head of self range
1524 (volatile kmp_int64 *)&pr->u.p.count,
1525 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1526 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1527 KMP_CPU_PAUSE();
1528 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1529 vnew.b = vold.b;
1530 vnew.p.count++;
1531 }
1532 init = vold.p.count;
1533 status = (init < (UT)vold.p.ub);
1534 } else {
1535 status = 0; // no own chunks
1536 }
1537 if (!status) { // try to steal
1538 T while_limit = pr->u.p.parm3;
1539 T while_index = 0;
1540 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1541 __kmp_dispatch_num_buffers; // current loop index
1542 // note: victim thread can potentially execute another loop
1543 KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive
1544 while ((!status) && (while_limit != ++while_index)) {
1546 T remaining;
1547 T victimId = pr->u.p.parm4;
1548 T oldVictimId = victimId ? victimId - 1 : nproc - 1;
1549 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1550 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1552 while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) &&
1553 oldVictimId != victimId) {
1554 victimId = (victimId + 1) % nproc;
1555 v = reinterpret_cast<dispatch_private_info_template<T> *>(
1556 &team->t.t_dispatch[victimId].th_disp_buffer[idx]);
1558 }
1559 if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) {
1560 continue; // try once more (nproc attempts in total)
1561 }
1562 if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) {
1563 kmp_uint32 old = UNUSED;
1564 // try to steal whole range from inactive victim
1565 status = v->steal_flag.compare_exchange_strong(old, THIEF);
1566 if (status) {
1567 // initialize self buffer with victim's whole range of chunks
1568 T id = victimId;
1569 T small_chunk = 0, extras = 0, p_extra = 0;
1570 __kmp_initialize_self_buffer<T>(team, id, pr, nchunks, nproc,
1571 init, small_chunk, extras,
1572 p_extra);
1573 vnew.p.count = init + 1;
1574 vnew.p.ub = init + small_chunk + p_extra + (id < extras ? 1 : 0);
1575 // write pair (count, ub) at once atomically
1576#if KMP_ARCH_X86
1577 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b);
1578#else
1579 *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b;
1580#endif
1581 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
1582 // no need to initialize other thread invariants: lb, st, etc.
1583#ifdef KMP_DEBUG
1584 {
1585 char *buff;
1586 // create format specifiers before the debug output
1587 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1588 "stolen chunks from T#%%d, "
1589 "count:%%%s ub:%%%s\n",
1590 traits_t<UT>::spec, traits_t<T>::spec);
1591 KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub));
1592 __kmp_str_free(&buff);
1593 }
1594#endif
1595 // activate non-empty buffer and let others steal from us
1596 if (pr->u.p.count < (UT)pr->u.p.ub)
1597 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1598 break;
1599 }
1600 }
1601 while (1) { // CAS loop with check if victim still has enough chunks
1602 // many threads may be stealing concurrently from same victim
1603 vold.b = *(volatile kmp_int64 *)(&v->u.p.count);
1604 if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY ||
1605 vold.p.count >= (UT)vold.p.ub) {
1606 pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id
1607 break; // no chunks to steal, try next victim
1608 }
1609 vnew.b = vold.b;
1610 remaining = vold.p.ub - vold.p.count;
1611 // try to steal 1/4 of remaining
1612 // TODO: is this heuristics good enough??
1613 if (remaining > 7) {
1614 vnew.p.ub -= remaining >> 2; // steal from tail of victim's range
1615 } else {
1616 vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining
1617 }
1618 KMP_DEBUG_ASSERT(vnew.p.ub * (UT)chunk <= trip);
1620 (volatile kmp_int64 *)&v->u.p.count,
1621 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1622 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1623 // stealing succeeded
1624#ifdef KMP_DEBUG
1625 {
1626 char *buff;
1627 // create format specifiers before the debug output
1628 buff = __kmp_str_format(
1629 "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, "
1630 "count:%%%s ub:%%%s\n",
1631 traits_t<T>::spec, traits_t<T>::spec);
1632 KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub));
1633 __kmp_str_free(&buff);
1634 }
1635#endif
1636 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1637 vold.p.ub - vnew.p.ub);
1638 status = 1;
1639 pr->u.p.parm4 = victimId; // keep victim id
1640 // now update own count and ub
1641 init = vnew.p.ub;
1642 vold.p.count = init + 1;
1643#if KMP_ARCH_X86
1644 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1645#else
1646 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1647#endif
1648 // activate non-empty buffer and let others steal from us
1649 if (vold.p.count < (UT)vold.p.ub)
1650 KMP_ATOMIC_ST_REL(&pr->steal_flag, READY);
1651 break;
1652 } // if (check CAS result)
1653 KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
1654 } // while (try to steal from particular victim)
1655 } // while (search for victim)
1656 } // if (try to find victim and steal)
1657 } // if (4-byte induction variable)
1658 if (!status) {
1659 *p_lb = 0;
1660 *p_ub = 0;
1661 if (p_st != NULL)
1662 *p_st = 0;
1663 } else {
1664 start = pr->u.p.lb;
1665 init *= chunk;
1666 limit = chunk + init - 1;
1667 incr = pr->u.p.st;
1668 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1669
1670 KMP_DEBUG_ASSERT(init <= trip);
1671 // keep track of done chunks for possible early exit from stealing
1672 // TODO: count executed chunks locally with rare update of shared location
1673 // test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1674 if ((last = (limit >= trip)) != 0)
1675 limit = trip;
1676 if (p_st != NULL)
1677 *p_st = incr;
1678
1679 if (incr == 1) {
1680 *p_lb = start + init;
1681 *p_ub = start + limit;
1682 } else {
1683 *p_lb = start + init * incr;
1684 *p_ub = start + limit * incr;
1685 }
1686 } // if
1687 break;
1688 } // case
1689#endif // KMP_STATIC_STEAL_ENABLED
1691 KD_TRACE(
1692 10,
1693 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1694 gtid));
1695 /* check if thread has any iteration to do */
1696 if ((status = !pr->u.p.count) != 0) {
1697 pr->u.p.count = 1;
1698 *p_lb = pr->u.p.lb;
1699 *p_ub = pr->u.p.ub;
1700 last = (pr->u.p.parm1 != 0);
1701 if (p_st != NULL)
1702 *p_st = pr->u.p.st;
1703 } else { /* no iterations to do */
1704 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1705 }
1706 } // case
1707 break;
1708 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1709 merged here */
1711 T parm1;
1712
1713 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1714 "kmp_sch_static_[affinity|chunked] case\n",
1715 gtid));
1716 parm1 = pr->u.p.parm1;
1717
1718 trip = pr->u.p.tc - 1;
1719 init = parm1 * (pr->u.p.count + tid);
1720
1721 if ((status = (init <= trip)) != 0) {
1722 start = pr->u.p.lb;
1723 incr = pr->u.p.st;
1724 limit = parm1 + init - 1;
1725
1726 if ((last = (limit >= trip)) != 0)
1727 limit = trip;
1728
1729 if (p_st != NULL)
1730 *p_st = incr;
1731
1732 pr->u.p.count += nproc;
1733
1734 if (incr == 1) {
1735 *p_lb = start + init;
1736 *p_ub = start + limit;
1737 } else {
1738 *p_lb = start + init * incr;
1739 *p_ub = start + limit * incr;
1740 }
1741
1742 if (pr->flags.ordered) {
1743 pr->u.p.ordered_lower = init;
1744 pr->u.p.ordered_upper = limit;
1745 } // if
1746 } // if
1747 } // case
1748 break;
1749
1751 UT chunk_number;
1752 UT chunk_size = pr->u.p.parm1;
1753 UT nchunks = pr->u.p.parm2;
1754
1755 KD_TRACE(
1756 100,
1757 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1758 gtid));
1759
1760 chunk_number = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1761 status = (chunk_number < nchunks);
1762 if (!status) {
1763 *p_lb = 0;
1764 *p_ub = 0;
1765 if (p_st != NULL)
1766 *p_st = 0;
1767 } else {
1768 init = chunk_size * chunk_number;
1769 trip = pr->u.p.tc - 1;
1770 start = pr->u.p.lb;
1771 incr = pr->u.p.st;
1772
1773 if ((last = (trip - init < (UT)chunk_size)))
1774 limit = trip;
1775 else
1776 limit = chunk_size + init - 1;
1777
1778 if (p_st != NULL)
1779 *p_st = incr;
1780
1781 if (incr == 1) {
1782 *p_lb = start + init;
1783 *p_ub = start + limit;
1784 } else {
1785 *p_lb = start + init * incr;
1786 *p_ub = start + limit * incr;
1787 }
1788
1789 if (pr->flags.ordered) {
1790 pr->u.p.ordered_lower = init;
1791 pr->u.p.ordered_upper = limit;
1792 } // if
1793 } // if
1794 } // case
1795 break;
1796
1798 T chunkspec = pr->u.p.parm1;
1799 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1800 "iterative case\n",
1801 gtid));
1802 trip = pr->u.p.tc;
1803 // Start atomic part of calculations
1804 while (1) {
1805 ST remaining; // signed, because can be < 0
1806 init = sh->u.s.iteration; // shared value
1807 remaining = trip - init;
1808 if (remaining <= 0) { // AC: need to compare with 0 first
1809 // nothing to do, don't try atomic op
1810 status = 0;
1811 break;
1812 }
1813 if ((T)remaining <
1814 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1815 // use dynamic-style schedule
1816 // atomically increment iterations, get old value
1817 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1818 (ST)chunkspec);
1819 remaining = trip - init;
1820 if (remaining <= 0) {
1821 status = 0; // all iterations got by other threads
1822 } else {
1823 // got some iterations to work on
1824 status = 1;
1825 if ((T)remaining > chunkspec) {
1826 limit = init + chunkspec - 1;
1827 } else {
1828 last = true; // the last chunk
1829 limit = init + remaining - 1;
1830 } // if
1831 } // if
1832 break;
1833 } // if
1834 limit = init + (UT)((double)remaining *
1835 *(double *)&pr->u.p.parm3); // divide by K*nproc
1836 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1837 (ST)init, (ST)limit)) {
1838 // CAS was successful, chunk obtained
1839 status = 1;
1840 --limit;
1841 break;
1842 } // if
1843 } // while
1844 if (status != 0) {
1845 start = pr->u.p.lb;
1846 incr = pr->u.p.st;
1847 if (p_st != NULL)
1848 *p_st = incr;
1849 *p_lb = start + init * incr;
1850 *p_ub = start + limit * incr;
1851 if (pr->flags.ordered) {
1852 pr->u.p.ordered_lower = init;
1853 pr->u.p.ordered_upper = limit;
1854 } // if
1855 } else {
1856 *p_lb = 0;
1857 *p_ub = 0;
1858 if (p_st != NULL)
1859 *p_st = 0;
1860 } // if
1861 } // case
1862 break;
1863
1864 case kmp_sch_guided_simd: {
1865 // same as iterative but curr-chunk adjusted to be multiple of given
1866 // chunk
1867 T chunk = pr->u.p.parm1;
1868 KD_TRACE(100,
1869 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1870 gtid));
1871 trip = pr->u.p.tc;
1872 // Start atomic part of calculations
1873 while (1) {
1874 ST remaining; // signed, because can be < 0
1875 init = sh->u.s.iteration; // shared value
1876 remaining = trip - init;
1877 if (remaining <= 0) { // AC: need to compare with 0 first
1878 status = 0; // nothing to do, don't try atomic op
1879 break;
1880 }
1881 KMP_DEBUG_ASSERT(chunk && init % chunk == 0);
1882 // compare with K*nproc*(chunk+1), K=2 by default
1883 if ((T)remaining < pr->u.p.parm2) {
1884 // use dynamic-style schedule
1885 // atomically increment iterations, get old value
1886 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1887 (ST)chunk);
1888 remaining = trip - init;
1889 if (remaining <= 0) {
1890 status = 0; // all iterations got by other threads
1891 } else {
1892 // got some iterations to work on
1893 status = 1;
1894 if ((T)remaining > chunk) {
1895 limit = init + chunk - 1;
1896 } else {
1897 last = true; // the last chunk
1898 limit = init + remaining - 1;
1899 } // if
1900 } // if
1901 break;
1902 } // if
1903 // divide by K*nproc
1904 UT span;
1905 __kmp_type_convert((double)remaining * (*(double *)&pr->u.p.parm3),
1906 &span);
1907 UT rem = span % chunk;
1908 if (rem) // adjust so that span%chunk == 0
1909 span += chunk - rem;
1910 limit = init + span;
1911 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1912 (ST)init, (ST)limit)) {
1913 // CAS was successful, chunk obtained
1914 status = 1;
1915 --limit;
1916 break;
1917 } // if
1918 } // while
1919 if (status != 0) {
1920 start = pr->u.p.lb;
1921 incr = pr->u.p.st;
1922 if (p_st != NULL)
1923 *p_st = incr;
1924 *p_lb = start + init * incr;
1925 *p_ub = start + limit * incr;
1926 if (pr->flags.ordered) {
1927 pr->u.p.ordered_lower = init;
1928 pr->u.p.ordered_upper = limit;
1929 } // if
1930 } else {
1931 *p_lb = 0;
1932 *p_ub = 0;
1933 if (p_st != NULL)
1934 *p_st = 0;
1935 } // if
1936 } // case
1937 break;
1938
1940 T chunkspec = pr->u.p.parm1;
1941 UT chunkIdx;
1942#if KMP_USE_X87CONTROL
1943 /* for storing original FPCW value for Windows* OS on
1944 IA-32 architecture 8-byte version */
1945 unsigned int oldFpcw;
1946 unsigned int fpcwSet = 0;
1947#endif
1948 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1949 "kmp_sch_guided_analytical_chunked case\n",
1950 gtid));
1951
1952 trip = pr->u.p.tc;
1953
1954 KMP_DEBUG_ASSERT(nproc > 1);
1955 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1956
1957 while (1) { /* this while loop is a safeguard against unexpected zero
1958 chunk sizes */
1959 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1960 if (chunkIdx >= (UT)pr->u.p.parm2) {
1961 --trip;
1962 /* use dynamic-style scheduling */
1963 init = chunkIdx * chunkspec + pr->u.p.count;
1964 /* need to verify init > 0 in case of overflow in the above
1965 * calculation */
1966 if ((status = (init > 0 && init <= trip)) != 0) {
1967 limit = init + chunkspec - 1;
1968
1969 if ((last = (limit >= trip)) != 0)
1970 limit = trip;
1971 }
1972 break;
1973 } else {
1974/* use exponential-style scheduling */
1975/* The following check is to workaround the lack of long double precision on
1976 Windows* OS.
1977 This check works around the possible effect that init != 0 for chunkIdx == 0.
1978 */
1979#if KMP_USE_X87CONTROL
1980 /* If we haven't already done so, save original
1981 FPCW and set precision to 64-bit, as Windows* OS
1982 on IA-32 architecture defaults to 53-bit */
1983 if (!fpcwSet) {
1984 oldFpcw = _control87(0, 0);
1985 _control87(_PC_64, _MCW_PC);
1986 fpcwSet = 0x30000;
1987 }
1988#endif
1989 if (chunkIdx) {
1991 trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1993 init = trip - init;
1994 } else
1995 init = 0;
1997 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1998 KMP_ASSERT(init <= limit);
1999 if (init < limit) {
2000 KMP_DEBUG_ASSERT(limit <= trip);
2001 --limit;
2002 status = 1;
2003 break;
2004 } // if
2005 } // if
2006 } // while (1)
2007#if KMP_USE_X87CONTROL
2008 /* restore FPCW if necessary
2009 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2010 */
2011 if (fpcwSet && (oldFpcw & fpcwSet))
2012 _control87(oldFpcw, _MCW_PC);
2013#endif
2014 if (status != 0) {
2015 start = pr->u.p.lb;
2016 incr = pr->u.p.st;
2017 if (p_st != NULL)
2018 *p_st = incr;
2019 *p_lb = start + init * incr;
2020 *p_ub = start + limit * incr;
2021 if (pr->flags.ordered) {
2022 pr->u.p.ordered_lower = init;
2023 pr->u.p.ordered_upper = limit;
2024 }
2025 } else {
2026 *p_lb = 0;
2027 *p_ub = 0;
2028 if (p_st != NULL)
2029 *p_st = 0;
2030 }
2031 } // case
2032 break;
2033
2034 case kmp_sch_trapezoidal: {
2035 UT index;
2036 T parm2 = pr->u.p.parm2;
2037 T parm3 = pr->u.p.parm3;
2038 T parm4 = pr->u.p.parm4;
2039 KD_TRACE(100,
2040 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
2041 gtid));
2042
2043 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2044
2045 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2046 trip = pr->u.p.tc - 1;
2047
2048 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2049 *p_lb = 0;
2050 *p_ub = 0;
2051 if (p_st != NULL)
2052 *p_st = 0;
2053 } else {
2054 start = pr->u.p.lb;
2055 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2056 incr = pr->u.p.st;
2057
2058 if ((last = (limit >= trip)) != 0)
2059 limit = trip;
2060
2061 if (p_st != NULL)
2062 *p_st = incr;
2063
2064 if (incr == 1) {
2065 *p_lb = start + init;
2066 *p_ub = start + limit;
2067 } else {
2068 *p_lb = start + init * incr;
2069 *p_ub = start + limit * incr;
2070 }
2071
2072 if (pr->flags.ordered) {
2073 pr->u.p.ordered_lower = init;
2074 pr->u.p.ordered_upper = limit;
2075 } // if
2076 } // if
2077 } // case
2078 break;
2079 default: {
2080 status = 0; // to avoid complaints on uninitialized variable use
2081 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
2082 KMP_HNT(GetNewerLibrary), // Hint
2083 __kmp_msg_null // Variadic argument list terminator
2084 );
2085 } break;
2086 } // switch
2087 if (p_last)
2088 *p_last = last;
2089#ifdef KMP_DEBUG
2090 if (pr->flags.ordered) {
2091 char *buff;
2092 // create format specifiers before the debug output
2093 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
2094 "ordered_lower:%%%s ordered_upper:%%%s\n",
2095 traits_t<UT>::spec, traits_t<UT>::spec);
2096 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
2097 __kmp_str_free(&buff);
2098 }
2099 {
2100 char *buff;
2101 // create format specifiers before the debug output
2102 buff = __kmp_str_format(
2103 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
2104 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
2105 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2106 KMP_DEBUG_ASSERT(p_last);
2107 KMP_DEBUG_ASSERT(p_st);
2108 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
2109 __kmp_str_free(&buff);
2110 }
2111#endif
2112 return status;
2113}
2114
/* Macros used on the exit paths of __kmp_dispatch_next().

   OMPT_LOOP_END
     If status is 0 (no more work), tell OMPT the loop is over by invoking the
     ompt_callback_work callback with ompt_scope_end. In some cases
     kmp_dispatch_fini() is not called. Reads `status`, `pr` and `codeptr`
     from the expansion site.

   OMPT_LOOP_DISPATCH(lb, ub, st, status)
     If a chunk was handed out (status != 0) and the ompt_callback_dispatch
     callback is enabled, describe the chunk [lb, ub] with stride st in an
     ompt_dispatch_chunk_t and pass it to that callback.

   Both macros expand to a single do { ... } while (0) statement, so they
   behave like an ordinary statement when followed by a semicolon, even as
   the unbraced body of an if/else. Without OMPT support they expand to
   nothing. */
#if OMPT_SUPPORT && OMPT_OPTIONAL
#define OMPT_LOOP_END                                                          \
  do {                                                                         \
    if (status == 0 && ompt_enabled.ompt_callback_work) {                      \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
          ompt_get_work_schedule(pr->schedule), ompt_scope_end,                \
          &(team_info->parallel_data), &(task_info->task_data), 0, codeptr);   \
    }                                                                          \
  } while (0)
#define OMPT_LOOP_DISPATCH(lb, ub, st, status)                                 \
  do {                                                                         \
    if (ompt_enabled.ompt_callback_dispatch && (status)) {                     \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
      ompt_dispatch_chunk_t chunk;                                             \
      ompt_data_t instance = ompt_data_none;                                   \
      OMPT_GET_DISPATCH_CHUNK(chunk, lb, ub, st);                              \
      instance.ptr = &chunk;                                                   \
      ompt_callbacks.ompt_callback(ompt_callback_dispatch)(                    \
          &(team_info->parallel_data), &(task_info->task_data),                \
          ompt_dispatch_ws_loop_chunk, instance);                              \
    }                                                                          \
  } while (0)
// TODO: implement count
#else
#define OMPT_LOOP_END // no-op
#define OMPT_LOOP_DISPATCH(lb, ub, st, status) // no-op
#endif
2146
#if KMP_STATS_ENABLED
/* Statistics hook for the exit paths of __kmp_dispatch_next(): records in
   OMP_loop_dynamic_iterations how many iterations the chunk just returned
   through *p_lb / *p_ub contains, using the stride stored in pr->u.p.st.
   When status is 0 (no chunk was handed out) it records 0 and pops the
   partitioned timer instead. Reads `status`, `pr`, `p_lb` and `p_ub` from the
   expansion site. */
#define KMP_STATS_LOOP_END                                                     \
  {                                                                            \
    const kmp_int64 kmp_stats_lower = (kmp_int64)(*p_lb);                      \
    const kmp_int64 kmp_stats_upper = (kmp_int64)(*p_ub);                      \
    const kmp_int64 kmp_stats_stride = (kmp_int64)(pr->u.p.st);                \
    kmp_int64 kmp_stats_iters = 0;                                             \
    if (status == 0) {                                                         \
      KMP_POP_PARTITIONED_TIMER();                                             \
    } else if (kmp_stats_stride == 1) {                                        \
      if (kmp_stats_upper >= kmp_stats_lower) {                                \
        kmp_stats_iters = kmp_stats_upper - kmp_stats_lower + 1;               \
      }                                                                        \
    } else if (kmp_stats_stride < 0) {                                         \
      if (kmp_stats_lower >= kmp_stats_upper) {                                \
        kmp_stats_iters =                                                      \
            (kmp_stats_lower - kmp_stats_upper) / (-kmp_stats_stride) + 1;     \
      }                                                                        \
    } else if (kmp_stats_upper >= kmp_stats_lower) {                           \
      kmp_stats_iters =                                                        \
          (kmp_stats_upper - kmp_stats_lower) / kmp_stats_stride + 1;          \
    }                                                                          \
    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, kmp_stats_iters);             \
  }
#else
#define KMP_STATS_LOOP_END /* Nothing */
#endif
2178
2179template <typename T>
2180static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
2181 T *p_lb, T *p_ub,
2182 typename traits_t<T>::signed_t *p_st
2183#if OMPT_SUPPORT && OMPT_OPTIONAL
2184 ,
2185 void *codeptr
2186#endif
2187) {
2188
2189 typedef typename traits_t<T>::unsigned_t UT;
2190 typedef typename traits_t<T>::signed_t ST;
2191 // This is potentially slightly misleading, schedule(runtime) will appear here
2192 // even if the actual runtime schedule is static. (Which points out a
2193 // disadvantage of schedule(runtime): even when static scheduling is used it
2194 // costs more than a compile time choice to use static scheduling would.)
2195 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
2196
2197 int status;
2200 kmp_info_t *th = __kmp_threads[gtid];
2201 kmp_team_t *team = th->th.th_team;
2202
2203 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
2204 KD_TRACE(
2205 1000,
2206 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
2207 gtid, p_lb, p_ub, p_st, p_last));
2208
2209 if (team->t.t_serialized) {
2210 /* NOTE: serialize this dispatch because we are not at the active level */
2211 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2212 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
2213 KMP_DEBUG_ASSERT(pr);
2214
2215 if ((status = (pr->u.p.tc != 0)) == 0) {
2216 *p_lb = 0;
2217 *p_ub = 0;
2218 // if ( p_last != NULL )
2219 // *p_last = 0;
2220 if (p_st != NULL)
2221 *p_st = 0;
2223 if (pr->pushed_ws != ct_none) {
2224 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2225 }
2226 }
2227 } else if (pr->flags.nomerge) {
2228 kmp_int32 last;
2229 T start;
2230 UT limit, trip, init;
2231 ST incr;
2232 T chunk = pr->u.p.parm1;
2233
2234 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
2235 gtid));
2236
2237 init = chunk * pr->u.p.count++;
2238 trip = pr->u.p.tc - 1;
2239
2240 if ((status = (init <= trip)) == 0) {
2241 *p_lb = 0;
2242 *p_ub = 0;
2243 // if ( p_last != NULL )
2244 // *p_last = 0;
2245 if (p_st != NULL)
2246 *p_st = 0;
2248 if (pr->pushed_ws != ct_none) {
2249 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2250 }
2251 }
2252 } else {
2253 start = pr->u.p.lb;
2254 limit = chunk + init - 1;
2255 incr = pr->u.p.st;
2256
2257 if ((last = (limit >= trip)) != 0) {
2258 limit = trip;
2259#if KMP_OS_WINDOWS
2260 pr->u.p.last_upper = pr->u.p.ub;
2261#endif /* KMP_OS_WINDOWS */
2262 }
2263 if (p_last != NULL)
2264 *p_last = last;
2265 if (p_st != NULL)
2266 *p_st = incr;
2267 if (incr == 1) {
2268 *p_lb = start + init;
2269 *p_ub = start + limit;
2270 } else {
2271 *p_lb = start + init * incr;
2272 *p_ub = start + limit * incr;
2273 }
2274
2275 if (pr->flags.ordered) {
2276 pr->u.p.ordered_lower = init;
2277 pr->u.p.ordered_upper = limit;
2278#ifdef KMP_DEBUG
2279 {
2280 char *buff;
2281 // create format specifiers before the debug output
2282 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2283 "ordered_lower:%%%s ordered_upper:%%%s\n",
2284 traits_t<UT>::spec, traits_t<UT>::spec);
2285 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2286 pr->u.p.ordered_upper));
2287 __kmp_str_free(&buff);
2288 }
2289#endif
2290 } // if
2291 } // if
2292 } else {
2293 pr->u.p.tc = 0;
2294 *p_lb = pr->u.p.lb;
2295 *p_ub = pr->u.p.ub;
2296#if KMP_OS_WINDOWS
2297 pr->u.p.last_upper = *p_ub;
2298#endif /* KMP_OS_WINDOWS */
2299 if (p_last != NULL)
2300 *p_last = TRUE;
2301 if (p_st != NULL)
2302 *p_st = pr->u.p.st;
2303 } // if
2304#ifdef KMP_DEBUG
2305 {
2306 char *buff;
2307 // create format specifiers before the debug output
2308 buff = __kmp_str_format(
2309 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2310 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2311 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2312 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2313 (p_last ? *p_last : 0), status));
2314 __kmp_str_free(&buff);
2315 }
2316#endif
2317#if INCLUDE_SSC_MARKS
2318 SSC_MARK_DISPATCH_NEXT();
2319#endif
2320 OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2323 return status;
2324 } else {
2325 kmp_int32 last = 0;
2327
2328 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2329 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2330
2331 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2332 th->th.th_dispatch->th_dispatch_pr_current);
2333 KMP_DEBUG_ASSERT(pr);
2334 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2335 th->th.th_dispatch->th_dispatch_sh_current);
2336 KMP_DEBUG_ASSERT(sh);
2337
2338#if KMP_USE_HIER_SCHED
2339 if (pr->flags.use_hier)
2340 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2341 else
2342#endif // KMP_USE_HIER_SCHED
2343 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2344 p_st, th->th.th_team_nproc,
2345 th->th.th_info.ds.ds_tid);
2346 // status == 0: no more iterations to execute
2347 if (status == 0) {
2348 ST num_done;
2349 num_done = test_then_inc<ST>(&sh->u.s.num_done);
2350#ifdef KMP_DEBUG
2351 {
2352 char *buff;
2353 // create format specifiers before the debug output
2354 buff = __kmp_str_format(
2355 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2356 traits_t<ST>::spec);
2357 KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2358 __kmp_str_free(&buff);
2359 }
2360#endif
2361
2362#if KMP_USE_HIER_SCHED
2363 pr->flags.use_hier = FALSE;
2364#endif
2365 if (num_done == th->th.th_team_nproc - 1) {
2366#if KMP_STATIC_STEAL_ENABLED
2367 if (pr->schedule == kmp_sch_static_steal) {
2368 int i;
2369 int idx = (th->th.th_dispatch->th_disp_index - 1) %
2370 __kmp_dispatch_num_buffers; // current loop index
2371 // loop complete, safe to destroy locks used for stealing
2372 for (i = 0; i < th->th.th_team_nproc; ++i) {
2374 reinterpret_cast<dispatch_private_info_template<T> *>(
2375 &team->t.t_dispatch[i].th_disp_buffer[idx]);
2376 KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive
2377 KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED);
2378 if (traits_t<T>::type_size > 4) {
2379 // destroy locks used for stealing
2380 kmp_lock_t *lck = buf->u.p.steal_lock;
2381 KMP_ASSERT(lck != NULL);
2383 __kmp_free(lck);
2384 buf->u.p.steal_lock = NULL;
2385 }
2386 }
2387 }
2388#endif
2389 /* NOTE: release shared buffer to be reused */
2390
2391 KMP_MB(); /* Flush all pending memory write invalidates. */
2392
2393 sh->u.s.num_done = 0;
2394 sh->u.s.iteration = 0;
2395
2396 /* TODO replace with general release procedure? */
2397 if (pr->flags.ordered) {
2398 sh->u.s.ordered_iteration = 0;
2399 }
2400
2401 KMP_MB(); /* Flush all pending memory write invalidates. */
2402
2404 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2405 gtid, sh->buffer_index));
2406
2407 KMP_MB(); /* Flush all pending memory write invalidates. */
2408
2409 } // if
2411 if (pr->pushed_ws != ct_none) {
2412 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2413 }
2414 }
2415
2416 th->th.th_dispatch->th_deo_fcn = NULL;
2417 th->th.th_dispatch->th_dxo_fcn = NULL;
2418 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2419 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2420 } // if (status == 0)
2421#if KMP_OS_WINDOWS
2422 else if (last) {
2423 pr->u.p.last_upper = pr->u.p.ub;
2424 }
2425#endif /* KMP_OS_WINDOWS */
2426 if (p_last != NULL && status != 0)
2427 *p_last = last;
2428 } // if
2429
2430#ifdef KMP_DEBUG
2431 {
2432 char *buff;
2433 // create format specifiers before the debug output
2434 buff = __kmp_str_format(
2435 "__kmp_dispatch_next: T#%%d normal case: "
2436 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2437 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2438 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2439 (p_last ? *p_last : 0), status));
2440 __kmp_str_free(&buff);
2441 }
2442#endif
2443#if INCLUDE_SSC_MARKS
2444 SSC_MARK_DISPATCH_NEXT();
2445#endif
2446 OMPT_LOOP_DISPATCH(*p_lb, *p_ub, pr->u.p.st, status);
2449 return status;
2450}
2451
2452/*!
2453@ingroup WORK_SHARING
2454@param loc source location information
2455 @param gtid global thread number
2456@return Zero if the parallel region is not active and this thread should execute
2457all sections, non-zero otherwise.
2458
2459Beginning of sections construct.
2460There are no implicit barriers in the "sections" calls, rather the compiler
2461should introduce an explicit barrier if it is required.
2462
2463This implementation is based on __kmp_dispatch_init, using same constructs for
2464shared data (we can't have sections nested directly in omp for loop, there
2465should be a parallel region in between)
2466*/
2468
2469 int active;
2470 kmp_info_t *th;
2471 kmp_team_t *team;
2472 kmp_uint32 my_buffer_index;
2474
2476
2480
2481 /* setup data */
2482 th = __kmp_threads[gtid];
2483 team = th->th.th_team;
2484 active = !team->t.t_serialized;
2485 th->th.th_ident = loc;
2486
2487 KMP_COUNT_BLOCK(OMP_SECTIONS);
2488 KD_TRACE(10, ("__kmpc_sections: called by T#%d\n", gtid));
2489
2490 if (active) {
2491 // Setup sections in the same way as dynamic scheduled loops.
2492 // We need one shared data: which section is to execute next.
2493 // (in case parallel is not active, all sections will be executed on the
2494 // same thread)
2495 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2496 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2497
2498 my_buffer_index = th->th.th_dispatch->th_disp_index++;
2499
2500 // reuse shared data structures from dynamic sched loops:
2501 sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2502 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
2503 KD_TRACE(10, ("__kmpc_sections_init: T#%d my_buffer_index:%d\n", gtid,
2504 my_buffer_index));
2505
2506 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
2507 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
2508
2509 KD_TRACE(100, ("__kmpc_sections_init: T#%d before wait: my_buffer_index:%d "
2510 "sh->buffer_index:%d\n",
2511 gtid, my_buffer_index, sh->buffer_index));
2512 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
2514 // Note: KMP_WAIT() cannot be used there: buffer index and
2515 // my_buffer_index are *always* 32-bit integers.
2516 KMP_MB();
2517 KD_TRACE(100, ("__kmpc_sections_init: T#%d after wait: my_buffer_index:%d "
2518 "sh->buffer_index:%d\n",
2519 gtid, my_buffer_index, sh->buffer_index));
2520
2521 th->th.th_dispatch->th_dispatch_pr_current =
2522 nullptr; // sections construct doesn't need private data
2523 th->th.th_dispatch->th_dispatch_sh_current =
2525 }
2526
2527#if OMPT_SUPPORT && OMPT_OPTIONAL
2528 if (ompt_enabled.ompt_callback_work) {
2529 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2531 ompt_callbacks.ompt_callback(ompt_callback_work)(
2532 ompt_work_sections, ompt_scope_begin, &(team_info->parallel_data),
2533 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2534 }
2535#endif
2536 KMP_PUSH_PARTITIONED_TIMER(OMP_sections);
2537
2538 return active;
2539}
2540
2541/*!
2542@ingroup WORK_SHARING
2543@param loc source location information
2543 @param gtid global thread number
2545@param numberOfSections number of sections in the 'sections' construct
2546 @return the index, in the range [0, numberOfSections), of the section to execute
2547 next on this thread. Any value outside that range (numberOfSections or greater)
2548 means there is nothing left to execute on this thread.
2549*/
2550
2552 kmp_int32 numberOfSections) {
2553
2554 KMP_TIME_PARTITIONED_BLOCK(OMP_sections_overhead);
2555
2556 kmp_info_t *th = __kmp_threads[gtid];
2557#ifdef KMP_DEBUG
2558 kmp_team_t *team = th->th.th_team;
2559#endif
2560
2561 KD_TRACE(1000, ("__kmp_dispatch_next: T#%d; number of sections:%d\n", gtid,
2562 numberOfSections));
2563
2564 // For serialized case we should not call this function:
2565 KMP_DEBUG_ASSERT(!team->t.t_serialized);
2566
2568
2569 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2570 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2571
2572 KMP_DEBUG_ASSERT(!(th->th.th_dispatch->th_dispatch_pr_current));
2573 sh = reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
2574 th->th.th_dispatch->th_dispatch_sh_current);
2575 KMP_DEBUG_ASSERT(sh);
2576
2577 kmp_int32 sectionIndex = 0;
2578 bool moreSectionsToExecute = true;
2579
2580 // Find section to execute:
2581 sectionIndex = test_then_inc<kmp_int32>((kmp_int32 *)&sh->u.s.iteration);
2582 if (sectionIndex >= numberOfSections) {
2583 moreSectionsToExecute = false;
2584 }
2585
2586 // status == 0: no more sections to execute;
2587 // OMPTODO: __kmpc_end_sections could be bypassed?
2588 if (!moreSectionsToExecute) {
2589 kmp_int32 num_done;
2590
2591 num_done = test_then_inc<kmp_int32>((kmp_int32 *)(&sh->u.s.num_done));
2592
2593 if (num_done == th->th.th_team_nproc - 1) {
2594 /* NOTE: release this buffer to be reused */
2595
2596 KMP_MB(); /* Flush all pending memory write invalidates. */
2597
2598 sh->u.s.num_done = 0;
2599 sh->u.s.iteration = 0;
2600
2601 KMP_MB(); /* Flush all pending memory write invalidates. */
2602
2604 KD_TRACE(100, ("__kmpc_next_section: T#%d change buffer_index:%d\n", gtid,
2605 sh->buffer_index));
2606
2607 KMP_MB(); /* Flush all pending memory write invalidates. */
2608
2609 } // if
2610
2611 th->th.th_dispatch->th_deo_fcn = NULL;
2612 th->th.th_dispatch->th_dxo_fcn = NULL;
2613 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2614 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2615
2616#if OMPT_SUPPORT && OMPT_OPTIONAL
2617 if (ompt_enabled.ompt_callback_dispatch) {
2618 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2620 ompt_data_t instance = ompt_data_none;
2622 ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
2623 &(team_info->parallel_data), &(task_info->task_data),
2624 ompt_dispatch_section, instance);
2625 }
2626#endif
2627 }
2628
2629 return sectionIndex;
2630}
2631
2632/*!
2633@ingroup WORK_SHARING
2634@param loc source location information
2635 @param gtid global thread number
2636
2637End of "sections" construct.
2638Don't need to wait here: barrier is added separately when needed.
2639*/
2641
2642 kmp_info_t *th = __kmp_threads[gtid];
2643 int active = !th->th.th_team->t.t_serialized;
2644
2645 KD_TRACE(100, ("__kmpc_end_sections: T#%d called\n", gtid));
2646
2647 if (!active) {
2648 // In active case call finalization is done in __kmpc_next_section
2649#if OMPT_SUPPORT && OMPT_OPTIONAL
2650 if (ompt_enabled.ompt_callback_work) {
2651 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
2653 ompt_callbacks.ompt_callback(ompt_callback_work)(
2654 ompt_work_sections, ompt_scope_end, &(team_info->parallel_data),
2655 &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
2656 }
2657#endif
2658 }
2659
2661 KD_TRACE(100, ("__kmpc_end_sections: T#%d returned\n", gtid));
2662}
2663
2664template <typename T>
2666 kmp_int32 *plastiter, T *plower, T *pupper,
2667 typename traits_t<T>::signed_t incr) {
2668 typedef typename traits_t<T>::unsigned_t UT;
2669 kmp_uint32 team_id;
2670 kmp_uint32 nteams;
2671 UT trip_count;
2672 kmp_team_t *team;
2673 kmp_info_t *th;
2674
2675 KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2676 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2677#ifdef KMP_DEBUG
2678 typedef typename traits_t<T>::signed_t ST;
2679 {
2680 char *buff;
2681 // create format specifiers before the debug output
2682 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2683 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2684 traits_t<T>::spec, traits_t<T>::spec,
2685 traits_t<ST>::spec, traits_t<T>::spec);
2686 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2687 __kmp_str_free(&buff);
2688 }
2689#endif
2690
2692 if (incr == 0) {
2693 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2694 loc);
2695 }
2696 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2697 // The loop is illegal.
2698 // Some zero-trip loops maintained by compiler, e.g.:
2699 // for(i=10;i<0;++i) // lower >= upper - run-time check
2700 // for(i=0;i>10;--i) // lower <= upper - run-time check
2701 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2702 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2703 // Compiler does not check the following illegal loops:
2704 // for(i=0;i<10;i+=incr) // where incr<0
2705 // for(i=10;i>0;i-=incr) // where incr<0
2706 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2707 }
2708 }
2710 th = __kmp_threads[gtid];
2711 team = th->th.th_team;
2712 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2713 nteams = th->th.th_teams_size.nteams;
2714 team_id = team->t.t_master_tid;
2715 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2716
2717 // compute global trip count
2718 if (incr == 1) {
2719 trip_count = *pupper - *plower + 1;
2720 } else if (incr == -1) {
2721 trip_count = *plower - *pupper + 1;
2722 } else if (incr > 0) {
2723 // upper-lower can exceed the limit of signed type
2724 trip_count = (UT)(*pupper - *plower) / incr + 1;
2725 } else {
2726 trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2727 }
2728
2729 if (trip_count <= nteams) {
2732 __kmp_static ==
2733 kmp_sch_static_balanced); // Unknown static scheduling type.
2734 // only some teams get single iteration, others get nothing
2735 if (team_id < trip_count) {
2736 *pupper = *plower = *plower + team_id * incr;
2737 } else {
2738 *plower = *pupper + incr; // zero-trip loop
2739 }
2740 if (plastiter != NULL)
2741 *plastiter = (team_id == trip_count - 1);
2742 } else {
2744 UT chunk = trip_count / nteams;
2745 UT extras = trip_count % nteams;
2746 *plower +=
2747 incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2748 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2749 if (plastiter != NULL)
2750 *plastiter = (team_id == nteams - 1);
2751 } else {
2752 T chunk_inc_count =
2753 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2754 T upper = *pupper;
2756 // Unknown static scheduling type.
2757 *plower += team_id * chunk_inc_count;
2758 *pupper = *plower + chunk_inc_count - incr;
2759 // Check/correct bounds if needed
2760 if (incr > 0) {
2761 if (*pupper < *plower)
2762 *pupper = traits_t<T>::max_value;
2763 if (plastiter != NULL)
2764 *plastiter = *plower <= upper && *pupper > upper - incr;
2765 if (*pupper > upper)
2766 *pupper = upper; // tracker C73258
2767 } else {
2768 if (*pupper > *plower)
2769 *pupper = traits_t<T>::min_value;
2770 if (plastiter != NULL)
2771 *plastiter = *plower >= upper && *pupper < upper - incr;
2772 if (*pupper < upper)
2773 *pupper = upper; // tracker C73258
2774 }
2775 }
2776 }
2777}
2778
2779//-----------------------------------------------------------------------------
2780// Dispatch routines
2781// Transfer call to template< type T >
2782// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2783// T lb, T ub, ST st, ST chunk )
2784extern "C" {
2785
2786/*!
2787@ingroup WORK_SHARING
2788@{
2789@param loc Source location
2790@param gtid Global thread id
2791@param schedule Schedule type
2792@param lb Lower bound
2793@param ub Upper bound
2794@param st Step (or increment if you prefer)
2795@param chunk The chunk size to block with
2796
2797This function prepares the runtime to start a dynamically scheduled for loop,
2798saving the loop arguments.
2799These functions are all identical apart from the types of the arguments.
2800*/
2801
2803 enum sched_type schedule, kmp_int32 lb,
2804 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2806#if OMPT_SUPPORT && OMPT_OPTIONAL
2807 OMPT_STORE_RETURN_ADDRESS(gtid);
2808#endif
2809 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2810}
2811/*!
2812See @ref __kmpc_dispatch_init_4
2813*/
2815 enum sched_type schedule, kmp_uint32 lb,
2816 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2818#if OMPT_SUPPORT && OMPT_OPTIONAL
2819 OMPT_STORE_RETURN_ADDRESS(gtid);
2820#endif
2821 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2822}
2823
2824/*!
2825See @ref __kmpc_dispatch_init_4
2826*/
2828 enum sched_type schedule, kmp_int64 lb,
2829 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2831#if OMPT_SUPPORT && OMPT_OPTIONAL
2832 OMPT_STORE_RETURN_ADDRESS(gtid);
2833#endif
2834 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2835}
2836
2837/*!
2838See @ref __kmpc_dispatch_init_4
2839*/
2841 enum sched_type schedule, kmp_uint64 lb,
2842 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2844#if OMPT_SUPPORT && OMPT_OPTIONAL
2845 OMPT_STORE_RETURN_ADDRESS(gtid);
2846#endif
2847 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2848}
2849
2850/*!
2851See @ref __kmpc_dispatch_init_4
2852
2853 Unlike the __kmpc_dispatch_init set of functions, these functions are called
2854 for the composite "distribute parallel for" construct. Thus, before regular
2855 iteration dispatching, we need to compute the per-team iteration space.
2856
2857These functions are all identical apart from the types of the arguments.
2858*/
2860 enum sched_type schedule, kmp_int32 *p_last,
2861 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2862 kmp_int32 chunk) {
2864#if OMPT_SUPPORT && OMPT_OPTIONAL
2865 OMPT_STORE_RETURN_ADDRESS(gtid);
2866#endif
2867 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2868 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2869}
2870
2872 enum sched_type schedule, kmp_int32 *p_last,
2873 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2874 kmp_int32 chunk) {
2876#if OMPT_SUPPORT && OMPT_OPTIONAL
2877 OMPT_STORE_RETURN_ADDRESS(gtid);
2878#endif
2879 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2880 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2881}
2882
2884 enum sched_type schedule, kmp_int32 *p_last,
2885 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2886 kmp_int64 chunk) {
2888#if OMPT_SUPPORT && OMPT_OPTIONAL
2889 OMPT_STORE_RETURN_ADDRESS(gtid);
2890#endif
2891 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2892 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2893}
2894
2896 enum sched_type schedule, kmp_int32 *p_last,
2897 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2898 kmp_int64 chunk) {
2900#if OMPT_SUPPORT && OMPT_OPTIONAL
2901 OMPT_STORE_RETURN_ADDRESS(gtid);
2902#endif
2903 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2904 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2905}
2906
2907/*!
2908@param loc Source code location
2909@param gtid Global thread id
2910@param p_last Pointer to a flag set to one if this is the last chunk or zero
2911otherwise
2912@param p_lb Pointer to the lower bound for the next chunk of work
2913@param p_ub Pointer to the upper bound for the next chunk of work
2914@param p_st Pointer to the stride for the next chunk of work
2915@return one if there is work to be done, zero otherwise
2916
2917Get the next dynamically allocated chunk of work for this thread.
2918If there is no more work, then the lb,ub and stride need not be modified.
2919*/
2921 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2922#if OMPT_SUPPORT && OMPT_OPTIONAL
2923 OMPT_STORE_RETURN_ADDRESS(gtid);
2924#endif
2925 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2926#if OMPT_SUPPORT && OMPT_OPTIONAL
2927 ,
2928 OMPT_LOAD_RETURN_ADDRESS(gtid)
2929#endif
2930 );
2931}
2932
2933/*!
2934See @ref __kmpc_dispatch_next_4
2935*/
2937 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2938 kmp_int32 *p_st) {
2939#if OMPT_SUPPORT && OMPT_OPTIONAL
2940 OMPT_STORE_RETURN_ADDRESS(gtid);
2941#endif
2942 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2943#if OMPT_SUPPORT && OMPT_OPTIONAL
2944 ,
2945 OMPT_LOAD_RETURN_ADDRESS(gtid)
2946#endif
2947 );
2948}
2949
2950/*!
2951See @ref __kmpc_dispatch_next_4
2952*/
2954 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2955#if OMPT_SUPPORT && OMPT_OPTIONAL
2956 OMPT_STORE_RETURN_ADDRESS(gtid);
2957#endif
2958 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2959#if OMPT_SUPPORT && OMPT_OPTIONAL
2960 ,
2961 OMPT_LOAD_RETURN_ADDRESS(gtid)
2962#endif
2963 );
2964}
2965
2966/*!
2967See @ref __kmpc_dispatch_next_4
2968*/
2970 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2971 kmp_int64 *p_st) {
2972#if OMPT_SUPPORT && OMPT_OPTIONAL
2973 OMPT_STORE_RETURN_ADDRESS(gtid);
2974#endif
2975 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2976#if OMPT_SUPPORT && OMPT_OPTIONAL
2977 ,
2978 OMPT_LOAD_RETURN_ADDRESS(gtid)
2979#endif
2980 );
2981}
2982
2983/*!
2984@param loc Source code location
2985@param gtid Global thread id
2986
2987Mark the end of a dynamic loop.
2988*/
2992
2993/*!
2994See @ref __kmpc_dispatch_fini_4
2995*/
2999
3000/*!
3001See @ref __kmpc_dispatch_fini_4
3002*/
3006
3007/*!
3008See @ref __kmpc_dispatch_fini_4
3009*/
3013
3014/*!
3015See @ref __kmpc_dispatch_deinit
3016*/
3018/*! @} */
3019
3020//-----------------------------------------------------------------------------
3021// Non-template routines from kmp_dispatch.cpp used in other sources
3022
3026
3030
3034
3038
3042
3046 void *obj // Higher-level synchronization object, or NULL.
3047) {
3048 // note: we may not belong to a team at this point
3049 volatile kmp_uint32 *spin = spinner;
3051 kmp_uint32 spins;
3052 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
3053 kmp_uint32 r;
3054 kmp_uint64 time;
3055
3056 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
3057 KMP_INIT_YIELD(spins);
3058 KMP_INIT_BACKOFF(time);
3059 // main wait spin loop
3060 while (!f(r = TCR_4(*spin), check)) {
3062 /* GEH - remove this since it was accidentally introduced when kmp_wait was
3063 split. It causes problems with infinite recursion because of exit lock */
3064 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
3065 __kmp_abort_thread(); */
3066 KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3067 }
3069 return r;
3070}
3071
3073 kmp_uint32 (*pred)(void *, kmp_uint32),
3074 void *obj // Higher-level synchronization object, or NULL.
3075) {
3076 // note: we may not belong to a team at this point
3077 void *spin = spinner;
3079 kmp_uint32 spins;
3080 kmp_uint32 (*f)(void *, kmp_uint32) = pred;
3081 kmp_uint64 time;
3082
3083 KMP_FSYNC_SPIN_INIT(obj, spin);
3084 KMP_INIT_YIELD(spins);
3085 KMP_INIT_BACKOFF(time);
3086 // main wait spin loop
3087 while (!f(spin, check)) {
3089 /* if we have waited a bit, or are oversubscribed, yield */
3090 /* pause is in the following code */
3091 KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3092 }
3094}
3095
3096} // extern "C"
3097
3098#ifdef KMP_GOMP_COMPAT
3099
3100void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
3101 enum sched_type schedule, kmp_int32 lb,
3102 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
3103 int push_ws) {
3104 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
3105 push_ws);
3106}
3107
3108void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
3109 enum sched_type schedule, kmp_uint32 lb,
3110 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
3111 int push_ws) {
3112 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
3113 push_ws);
3114}
3115
3116void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
3117 enum sched_type schedule, kmp_int64 lb,
3118 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
3119 int push_ws) {
3120 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
3121 push_ws);
3122}
3123
3124void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
3125 enum sched_type schedule, kmp_uint64 lb,
3126 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
3127 int push_ws) {
3128 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
3129 push_ws);
3130}
3131
3132void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
3133 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
3134}
3135
3136void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
3137 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
3138}
3139
3140void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
3141 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
3142}
3143
3144void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
3145 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
3146}
3147
3148#endif /* KMP_GOMP_COMPAT */
3149
3150/* ------------------------------------------------------------------------ */
char buf[BUFFER_SIZE]
void * target(void *task)
int64_t kmp_int64
Definition common.h:10
void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk)
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st)
sched_type
Describes the loop schedule to be used for a parallel for loop.
Definition kmp.h:353
void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk)
void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid)
void __kmpc_dispatch_deinit(ident_t *loc, kmp_int32 gtid)
See __kmpc_dispatch_deinit.
kmp_int32 __kmpc_next_section(ident_t *loc, kmp_int32 gtid, kmp_int32 numberOfSections)
void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk)
void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk)
See __kmpc_dispatch_init_4.
int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st)
See __kmpc_dispatch_next_4.
void __kmpc_end_sections(ident_t *loc, kmp_int32 gtid)
int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st)
See __kmpc_dispatch_next_4.
void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid)
See __kmpc_dispatch_fini_4.
kmp_int32 __kmpc_sections_init(ident_t *loc, kmp_int32 gtid)
int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st)
See __kmpc_dispatch_next_4.
void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid)
See __kmpc_dispatch_fini_4.
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk)
void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk)
See __kmpc_dispatch_init_4.
void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk)
See __kmpc_dispatch_init_4.
void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid)
See __kmpc_dispatch_fini_4.
void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk)
See __kmpc_dispatch_init_4.
@ kmp_sch_runtime_simd
runtime with chunk adjustment
Definition kmp.h:375
@ kmp_sch_auto
auto
Definition kmp.h:360
@ kmp_sch_static
static unspecialized
Definition kmp.h:356
@ kmp_sch_guided_simd
guided with chunk adjustment
Definition kmp.h:374
@ kmp_sch_runtime
Definition kmp.h:359
@ kmp_sch_guided_chunked
guided unspecialized
Definition kmp.h:358
@ kmp_sch_dynamic_chunked
Definition kmp.h:357
@ kmp_sch_guided_analytical_chunked
Definition kmp.h:368
@ kmp_sch_static_balanced
Definition kmp.h:365
@ kmp_sch_static_greedy
Definition kmp.h:364
@ kmp_sch_lower
lower bound for unordered values
Definition kmp.h:354
@ kmp_sch_static_chunked
Definition kmp.h:355
@ kmp_sch_trapezoidal
Definition kmp.h:361
@ kmp_nm_upper
upper bound for nomerge values
Definition kmp.h:425
@ kmp_ord_lower
lower bound for ordered values, must be power of 2
Definition kmp.h:380
@ kmp_sch_guided_iterative_chunked
Definition kmp.h:367
@ kmp_sch_static_balanced_chunked
Definition kmp.h:373
@ kmp_sch_upper
upper bound for unordered values
Definition kmp.h:378
@ kmp_nm_lower
lower bound for nomerge values
Definition kmp.h:398
@ kmp_sch_static_steal
Definition kmp.h:370
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance * instance
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t count
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain __itt_id ITT_FORMAT p const __itt_domain __itt_id __itt_timestamp __itt_timestamp ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain ITT_FORMAT p const __itt_domain __itt_string_handle unsigned long long value
void const char const char int ITT_FORMAT __itt_group_sync p
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain __itt_id ITT_FORMAT p const __itt_domain __itt_id __itt_timestamp __itt_timestamp ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain ITT_FORMAT p const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_id __itt_string_handle __itt_metadata_type size_t void ITT_FORMAT p const __itt_domain __itt_id __itt_string_handle const wchar_t size_t ITT_FORMAT lu const __itt_domain __itt_id __itt_relation __itt_id ITT_FORMAT p const wchar_t int ITT_FORMAT __itt_group_mark d int
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain __itt_id ITT_FORMAT p const __itt_domain __itt_id __itt_timestamp __itt_timestamp ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain ITT_FORMAT p const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_id __itt_string_handle __itt_metadata_type type
#define __kmp_free(ptr)
Definition kmp.h:3759
#define KMP_CPU_PAUSE()
Definition kmp.h:1597
#define KMP_DEFAULT_CHUNK
Definition kmp.h:1317
@ ct_ordered_in_pdo
Definition kmp.h:1703
@ ct_none
Definition kmp.h:1695
@ ct_pdo_ordered
Definition kmp.h:1698
@ ct_pdo
Definition kmp.h:1697
#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time)
Definition kmp.h:1673
struct dispatch_shared_info dispatch_shared_info_t
struct KMP_ALIGN_CACHE dispatch_private_info dispatch_private_info_t
#define SCHEDULE_MONOTONIC
Definition kmp.h:459
#define SCHEDULE_HAS_MONOTONIC(s)
Definition kmp.h:449
enum sched_type __kmp_auto
enum sched_type __kmp_static
int __kmp_force_monotonic
kmp_info_t ** __kmp_threads
int __kmp_dispatch_num_buffers
#define SCHEDULE_WITHOUT_MODIFIERS(s)
Definition kmp.h:446
union kmp_team kmp_team_t
Definition kmp.h:254
#define KMP_INIT_YIELD(count)
Definition kmp.h:1600
#define KMP_MASTER_GTID(gtid)
Definition kmp.h:1347
void __kmp_parallel_initialize(void)
#define KMP_INIT_BACKOFF(time)
Definition kmp.h:1603
volatile int __kmp_init_parallel
#define __kmp_allocate(size)
Definition kmp.h:3757
#define TRUE
Definition kmp.h:1353
#define FALSE
Definition kmp.h:1352
static bool __kmp_is_hybrid_cpu()
Definition kmp.h:3371
int __kmp_env_consistency_check
#define SCHEDULE_HAS_NONMONOTONIC(s)
Definition kmp.h:450
static int __kmp_gtid_from_tid(int tid, const kmp_team_t *team)
Definition kmp.h:3627
enum sched_type __kmp_guided
void __kmp_resume_if_soft_paused()
static void __kmp_assert_valid_gtid(kmp_int32 gtid)
Definition kmp.h:3647
kmp_hw_core_type_t
Definition kmp.h:621
@ KMP_HW_CORE_TYPE_UNKNOWN
Definition kmp.h:622
volatile int __kmp_init_serial
static void __kmp_type_convert(T1 src, T2 *dest)
Definition kmp.h:4887
union KMP_ALIGN_CACHE kmp_info kmp_info_t
#define SCHEDULE_NONMONOTONIC
Definition kmp.h:458
KMP_ARCH_X86 KMP_ARCH_X86 long double
KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86 KMP_ARCH_X86<<, 2i, 1, KMP_ARCH_X86) ATOMIC_CMPXCHG(fixed2, shr, kmp_int16, 16, > KMP_ARCH_X86 KMP_ARCH_X86 kmp_uint32
#define KE_TRACE(d, x)
Definition kmp_debug.h:161
#define KMP_ASSERT(cond)
Definition kmp_debug.h:59
#define KMP_BUILD_ASSERT(expr)
Definition kmp_debug.h:26
#define KD_TRACE(d, x)
Definition kmp_debug.h:160
#define KMP_DEBUG_ASSERT(cond)
Definition kmp_debug.h:61
#define KMP_ASSERT2(cond, msg)
Definition kmp_debug.h:60
unsigned long long kmp_uint64
kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker)
#define GUIDED_ANALYTICAL_WORKAROUND
#define OMPT_LOOP_END
int __kmp_dispatch_next_algorithm(int gtid, dispatch_private_info_template< T > *pr, dispatch_shared_info_template< T > volatile *sh, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st, T nproc, T tid)
kmp_uint32 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker, kmp_uint32(*pred)(kmp_uint32, kmp_uint32), void *obj)
void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker, kmp_uint32(*pred)(void *, kmp_uint32), void *obj)
#define KMP_STATS_LOOP_END
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st)
kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker)
void __kmp_initialize_self_buffer(kmp_team_t *team, T id, dispatch_private_info_template< T > *pr, typename traits_t< T >::unsigned_t nchunks, T nproc, typename traits_t< T >::unsigned_t &init, T &small_chunk, T &extras, T &p_extra)
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref)
kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker)
static void __kmp_dispatch_finish(int gtid, ident_t *loc)
kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker)
static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid, kmp_int32 *plastiter, T *plower, T *pupper, typename traits_t< T >::signed_t incr)
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid, dispatch_private_info_template< T > *pr, enum sched_type schedule, T lb, T ub, typename traits_t< T >::signed_t st, typename traits_t< T >::signed_t chunk, T nproc, T tid)
kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker)
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref)
#define OMPT_LOOP_DISPATCH(lb, ub, st, status)
static void __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb, T ub, typename traits_t< T >::signed_t st, typename traits_t< T >::signed_t chunk, int push_ws)
static int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule, bool use_hier=false)
static __inline traits_t< T >::unsigned_t __kmp_dispatch_guided_remaining(T tc, typename traits_t< T >::floating_t base, typename traits_t< T >::unsigned_t idx)
static const int guided_int_param
__forceinline kmp_int32 test_then_inc< kmp_int32 >(volatile kmp_int32 *p)
kmp_uint32 __kmp_eq(T value, T checker)
void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref)
static UT __kmp_wait(volatile UT *spinner, UT checker, kmp_uint32(*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj))
static __forceinline T test_then_inc_acq(volatile T *p)
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s)
static __forceinline T test_then_inc(volatile T *p)
static __forceinline long double __kmp_pow(long double x, UT y)
static const double guided_flt_param
static __forceinline T test_then_add(volatile T *p, T d)
void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref)
kmp_uint32 __kmp_ge(T value, T checker)
void __kmp_dispatch_init_hierarchy(ident_t *loc, int n, kmp_hier_layer_e *new_layers, enum sched_type *new_scheds, typename traits_t< T >::signed_t *new_chunks, T lb, T ub, typename traits_t< T >::signed_t st)
kmp_hier_sched_env_t __kmp_hier_scheds
void __kmp_dispatch_free_hierarchies(kmp_team_t *team)
void __kmp_push_sync(int gtid, enum cons_type ct, ident_t const *ident, kmp_user_lock_p lck)
enum cons_type __kmp_pop_workshare(int gtid, enum cons_type ct, ident_t const *ident)
void __kmp_error_construct(kmp_i18n_id_t id, enum cons_type ct, ident_t const *ident)
void __kmp_pop_sync(int gtid, enum cons_type ct, ident_t const *ident)
static volatile kmp_i18n_cat_status_t status
Definition kmp_i18n.cpp:48
kmp_msg_t __kmp_msg_null
Definition kmp_i18n.cpp:36
void __kmp_fatal(kmp_msg_t message,...)
Definition kmp_i18n.cpp:875
#define KMP_WARNING(...)
Definition kmp_i18n.h:144
#define KMP_MSG(...)
Definition kmp_i18n.h:121
#define KMP_HNT(...)
Definition kmp_i18n.h:122
#define KMP_FSYNC_SPIN_ACQUIRED(obj)
Definition kmp_itt.h:339
#define KMP_FSYNC_SPIN_PREPARE(obj)
Definition kmp_itt.h:338
#define USE_ITT_BUILD_ARG(x)
Definition kmp_itt.h:346
#define KMP_FSYNC_SPIN_INIT(obj, spin)
Definition kmp_itt.h:337
static int __kmp_acquire_lock(kmp_lock_t *lck, kmp_int32 gtid)
Definition kmp_lock.h:559
static void __kmp_init_lock(kmp_lock_t *lck)
Definition kmp_lock.h:571
static void __kmp_destroy_lock(kmp_lock_t *lck)
Definition kmp_lock.h:575
static void __kmp_release_lock(kmp_lock_t *lck, kmp_int32 gtid)
Definition kmp_lock.h:567
kmp_ticket_lock_t kmp_lock_t
Definition kmp_lock.h:555
#define KMP_COMPARE_AND_STORE_REL64(p, cv, sv)
Definition kmp_os.h:860
#define KMP_ATOMIC_ST_REL(p, v)
Definition kmp_os.h:1261
#define RCAST(type, var)
Definition kmp_os.h:292
#define KMP_XCHG_FIXED64(p, v)
Definition kmp_os.h:880
#define KMP_ATOMIC_LD_ACQ(p)
Definition kmp_os.h:1259
#define KMP_ATOMIC_ST_RLX(p, v)
Definition kmp_os.h:1262
#define VOLATILE_CAST(x)
Definition kmp_os.h:1190
#define CCAST(type, var)
Definition kmp_os.h:291
#define KMP_MB()
Definition kmp_os.h:1066
#define TCR_4(a)
Definition kmp_os.h:1137
#define KMP_ATOMIC_LD_RLX(p)
Definition kmp_os.h:1260
Functions for collecting statistics.
#define KMP_COUNT_VALUE(n, v)
Definition kmp_stats.h:1000
#define KMP_PUSH_PARTITIONED_TIMER(name)
Definition kmp_stats.h:1014
#define KMP_POP_PARTITIONED_TIMER()
Definition kmp_stats.h:1015
#define KMP_COUNT_DEVELOPER_VALUE(n, v)
Definition kmp_stats.h:1006
#define KMP_TIME_PARTITIONED_BLOCK(name)
Definition kmp_stats.h:1013
#define KMP_COUNT_BLOCK(n)
Definition kmp_stats.h:1001
char * __kmp_str_format(char const *format,...)
Definition kmp_str.cpp:448
void __kmp_str_free(char **str)
Definition kmp_str.cpp:494
#define i
Definition kmp_stub.cpp:87
#define ST
int a
Definition check.py:1
int32_t kmp_int32
const int chunk_size
omp_lock_t lck
Definition omp_lock.c:7
void init(int &A, int val)
ompt_callbacks_active_t ompt_enabled
ompt_callbacks_internal_t ompt_callbacks
#define OMPT_GET_RETURN_ADDRESS(level)
ompt_team_info_t * __ompt_get_teaminfo(int depth, int *size)
ompt_task_info_t * __ompt_get_task_info_object(int depth)
static id loc
static int checker
std::atomic< kmp_uint32 > steal_flag
union KMP_ALIGN_CACHE dispatch_private_info_template::private_info_tmpl u
union dispatch_shared_info_template::shared_info_tmpl u
volatile kmp_uint32 buffer_index
unsigned ordered
Definition kmp.h:1909
unsigned use_hier
Definition kmp.h:1912
unsigned use_hybrid
Definition kmp.h:1913
unsigned nomerge
Definition kmp.h:1910
ompt_data_t task_data
ompt_data_t parallel_data
dispatch_private_infoXX_template< T > p
dispatch_shared_infoXX_template< UT > s
kmp_base_team_t t
Definition kmp.h:3237