1/*
2 * kmp_affinity.h -- header for affinity management
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef KMP_AFFINITY_H
14#define KMP_AFFINITY_H
15
16#include "kmp.h"
17#include "kmp_os.h"
18#include <limits>
19
20#if KMP_AFFINITY_SUPPORTED
21#if KMP_USE_HWLOC
22class KMPHwlocAffinity : public KMPAffinity {
23public:
24 class Mask : public KMPAffinity::Mask {
25 hwloc_cpuset_t mask;
26
27 public:
28 Mask() {
29 mask = hwloc_bitmap_alloc();
30 this->zero();
31 }
32 Mask(const Mask &other) = delete;
33 Mask &operator=(const Mask &other) = delete;
34 ~Mask() { hwloc_bitmap_free(mask); }
35 void set(int i) override { hwloc_bitmap_set(mask, i); }
36 bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
37 void clear(int i) override { hwloc_bitmap_clr(mask, i); }
38 void zero() override { hwloc_bitmap_zero(mask); }
39 bool empty() const override { return hwloc_bitmap_iszero(mask); }
40 void copy(const KMPAffinity::Mask *src) override {
41 const Mask *convert = static_cast<const Mask *>(src);
42 hwloc_bitmap_copy(mask, convert->mask);
43 }
44 void bitwise_and(const KMPAffinity::Mask *rhs) override {
45 const Mask *convert = static_cast<const Mask *>(rhs);
46 hwloc_bitmap_and(mask, mask, convert->mask);
47 }
48 void bitwise_or(const KMPAffinity::Mask *rhs) override {
49 const Mask *convert = static_cast<const Mask *>(rhs);
50 hwloc_bitmap_or(mask, mask, convert->mask);
51 }
52 void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
53 bool is_equal(const KMPAffinity::Mask *rhs) const override {
54 const Mask *convert = static_cast<const Mask *>(rhs);
55 return hwloc_bitmap_isequal(mask, convert->mask);
56 }
57 int begin() const override { return hwloc_bitmap_first(mask); }
58 int end() const override { return -1; }
59 int next(int previous) const override {
60 return hwloc_bitmap_next(mask, previous);
61 }
62 int get_system_affinity(bool abort_on_error) override {
63 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
64 "Illegal get affinity operation when not capable");
65 long retval =
66 hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
67 if (retval >= 0) {
68 return 0;
69 }
70 int error = errno;
71 if (abort_on_error) {
72 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
73 KMP_ERR(error), __kmp_msg_null);
74 }
75 return error;
76 }
77 int set_system_affinity(bool abort_on_error) const override {
78 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
79 "Illegal set affinity operation when not capable");
80 long retval =
81 hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
82 if (retval >= 0) {
83 return 0;
84 }
85 int error = errno;
86 if (abort_on_error) {
87 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
88 KMP_ERR(error), __kmp_msg_null);
89 }
90 return error;
91 }
92#if KMP_OS_WINDOWS
93 int set_process_affinity(bool abort_on_error) const override {
94 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
95 "Illegal set process affinity operation when not capable");
96 int error = 0;
97 const hwloc_topology_support *support =
98 hwloc_topology_get_support(__kmp_hwloc_topology);
99 if (support->cpubind->set_proc_cpubind) {
100 int retval;
101 retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
102 HWLOC_CPUBIND_PROCESS);
103 if (retval >= 0)
104 return 0;
105 error = errno;
106 if (abort_on_error)
107 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
108 KMP_ERR(error), __kmp_msg_null);
109 }
110 return error;
111 }
112#endif
113 int get_proc_group() const override {
114 int group = -1;
115#if KMP_OS_WINDOWS
116 if (__kmp_num_proc_groups == 1) {
117 return 1;
118 }
119 for (int i = 0; i < __kmp_num_proc_groups; i++) {
120 // On Windows, the long type is always 32 bits
121 unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
122 unsigned long second_32_bits =
123 hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
124 if (first_32_bits == 0 && second_32_bits == 0) {
125 continue;
126 }
127 if (group >= 0) {
128 return -1;
129 }
130 group = i;
131 }
132#endif /* KMP_OS_WINDOWS */
133 return group;
134 }
135 };
136 void determine_capable(const char *var) override {
137 const hwloc_topology_support *topology_support;
138 if (__kmp_hwloc_topology == NULL) {
139 if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
140 __kmp_hwloc_error = TRUE;
141 if (__kmp_affinity.flags.verbose) {
142 KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
143 }
144 }
145 if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
146 __kmp_hwloc_error = TRUE;
147 if (__kmp_affinity.flags.verbose) {
148 KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
149 }
150 }
151 }
152 topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
153 // Is the system capable of setting/getting this thread's affinity?
154 // Also, is topology discovery possible? (pu indicates ability to discover
155 // processing units). And finally, were there no errors when calling any
156 // hwloc_* API functions?
157 if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
158 topology_support->cpubind->get_thisthread_cpubind &&
159 topology_support->discovery->pu && !__kmp_hwloc_error) {
160 // enables affinity according to KMP_AFFINITY_CAPABLE() macro
161 KMP_AFFINITY_ENABLE(TRUE);
162 } else {
163 // indicate that hwloc didn't work and disable affinity
164 __kmp_hwloc_error = TRUE;
165 KMP_AFFINITY_DISABLE();
166 }
167 }
168 void bind_thread(int which) override {
169 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
170 "Illegal set affinity operation when not capable");
171 KMPAffinity::Mask *mask;
172 KMP_CPU_ALLOC_ON_STACK(mask);
173 KMP_CPU_ZERO(mask);
174 KMP_CPU_SET(which, mask);
175 __kmp_set_system_affinity(mask, TRUE);
176 KMP_CPU_FREE_FROM_STACK(mask);
177 }
178 KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
179 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
180 KMPAffinity::Mask *allocate_mask_array(int num) override {
181 return new Mask[num];
182 }
183 void deallocate_mask_array(KMPAffinity::Mask *array) override {
184 Mask *hwloc_array = static_cast<Mask *>(array);
185 delete[] hwloc_array;
186 }
187 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
188 int index) override {
189 Mask *hwloc_array = static_cast<Mask *>(array);
190 return &(hwloc_array[index]);
191 }
192 api_type get_api_type() const override { return HWLOC; }
193};
194#endif /* KMP_USE_HWLOC */
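
The backends in this header are only ever driven through the abstract KMPAffinity interface (declared elsewhere in the runtime), so callers never need to know whether hwloc or a native OS mask sits underneath. A minimal editorial sketch of that usage, assuming `backend` points at whichever concrete object the runtime selected and using only the methods overridden above (the helper name is invented):

void example_pin_self(KMPAffinity *backend, int proc) {
  KMPAffinity::Mask *mask = backend->allocate_mask();
  mask->zero();
  mask->set(proc); // request a single logical CPU
  mask->set_system_affinity(/*abort_on_error=*/true);
  // The iteration protocol used throughout the runtime: begin()/end()/next().
  for (int i = mask->begin(); i != mask->end(); i = mask->next(i))
    ; // visits each set bit (here only 'proc')
  backend->deallocate_mask(mask);
}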
195
196#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
197 KMP_OS_AIX
198#if KMP_OS_LINUX
199/* On some of the older OSes that we build on, these constants aren't present
200 in <asm/unistd.h>, which is #included from <sys/syscall.h>. They must be the
201 same on all systems of the same arch where they are defined, and they cannot
202 change: they are set in stone forever. */
203#include <sys/syscall.h>
204#if KMP_ARCH_X86 || KMP_ARCH_ARM
205#ifndef __NR_sched_setaffinity
206#define __NR_sched_setaffinity 241
207#elif __NR_sched_setaffinity != 241
208#error Wrong code for setaffinity system call.
209#endif /* __NR_sched_setaffinity */
210#ifndef __NR_sched_getaffinity
211#define __NR_sched_getaffinity 242
212#elif __NR_sched_getaffinity != 242
213#error Wrong code for getaffinity system call.
214#endif /* __NR_sched_getaffinity */
215#elif KMP_ARCH_AARCH64
216#ifndef __NR_sched_setaffinity
217#define __NR_sched_setaffinity 122
218#elif __NR_sched_setaffinity != 122
219#error Wrong code for setaffinity system call.
220#endif /* __NR_sched_setaffinity */
221#ifndef __NR_sched_getaffinity
222#define __NR_sched_getaffinity 123
223#elif __NR_sched_getaffinity != 123
224#error Wrong code for getaffinity system call.
225#endif /* __NR_sched_getaffinity */
226#elif KMP_ARCH_X86_64
227#ifndef __NR_sched_setaffinity
228#define __NR_sched_setaffinity 203
229#elif __NR_sched_setaffinity != 203
230#error Wrong code for setaffinity system call.
231#endif /* __NR_sched_setaffinity */
232#ifndef __NR_sched_getaffinity
233#define __NR_sched_getaffinity 204
234#elif __NR_sched_getaffinity != 204
235#error Wrong code for getaffinity system call.
236#endif /* __NR_sched_getaffinity */
237#elif KMP_ARCH_PPC64
238#ifndef __NR_sched_setaffinity
239#define __NR_sched_setaffinity 222
240#elif __NR_sched_setaffinity != 222
241#error Wrong code for setaffinity system call.
242#endif /* __NR_sched_setaffinity */
243#ifndef __NR_sched_getaffinity
244#define __NR_sched_getaffinity 223
245#elif __NR_sched_getaffinity != 223
246#error Wrong code for getaffinity system call.
247#endif /* __NR_sched_getaffinity */
248#elif KMP_ARCH_MIPS
249#ifndef __NR_sched_setaffinity
250#define __NR_sched_setaffinity 4239
251#elif __NR_sched_setaffinity != 4239
252#error Wrong code for setaffinity system call.
253#endif /* __NR_sched_setaffinity */
254#ifndef __NR_sched_getaffinity
255#define __NR_sched_getaffinity 4240
256#elif __NR_sched_getaffinity != 4240
257#error Wrong code for getaffinity system call.
258#endif /* __NR_sched_getaffinity */
259#elif KMP_ARCH_MIPS64
260#ifndef __NR_sched_setaffinity
261#define __NR_sched_setaffinity 5195
262#elif __NR_sched_setaffinity != 5195
263#error Wrong code for setaffinity system call.
264#endif /* __NR_sched_setaffinity */
265#ifndef __NR_sched_getaffinity
266#define __NR_sched_getaffinity 5196
267#elif __NR_sched_getaffinity != 5196
268#error Wrong code for getaffinity system call.
269#endif /* __NR_sched_getaffinity */
270#elif KMP_ARCH_LOONGARCH64
271#ifndef __NR_sched_setaffinity
272#define __NR_sched_setaffinity 122
273#elif __NR_sched_setaffinity != 122
274#error Wrong code for setaffinity system call.
275#endif /* __NR_sched_setaffinity */
276#ifndef __NR_sched_getaffinity
277#define __NR_sched_getaffinity 123
278#elif __NR_sched_getaffinity != 123
279#error Wrong code for getaffinity system call.
280#endif /* __NR_sched_getaffinity */
281#elif KMP_ARCH_RISCV64
282#ifndef __NR_sched_setaffinity
283#define __NR_sched_setaffinity 122
284#elif __NR_sched_setaffinity != 122
285#error Wrong code for setaffinity system call.
286#endif /* __NR_sched_setaffinity */
287#ifndef __NR_sched_getaffinity
288#define __NR_sched_getaffinity 123
289#elif __NR_sched_getaffinity != 123
290#error Wrong code for getaffinity system call.
291#endif /* __NR_sched_getaffinity */
292#elif KMP_ARCH_VE
293#ifndef __NR_sched_setaffinity
294#define __NR_sched_setaffinity 203
295#elif __NR_sched_setaffinity != 203
296#error Wrong code for setaffinity system call.
297#endif /* __NR_sched_setaffinity */
298#ifndef __NR_sched_getaffinity
299#define __NR_sched_getaffinity 204
300#elif __NR_sched_getaffinity != 204
301#error Wrong code for getaffinity system call.
302#endif /* __NR_sched_getaffinity */
303#elif KMP_ARCH_S390X
304#ifndef __NR_sched_setaffinity
305#define __NR_sched_setaffinity 239
306#elif __NR_sched_setaffinity != 239
307#error Wrong code for setaffinity system call.
308#endif /* __NR_sched_setaffinity */
309#ifndef __NR_sched_getaffinity
310#define __NR_sched_getaffinity 240
311#elif __NR_sched_getaffinity != 240
312#error Wrong code for getaffinity system call.
313#endif /* __NR_sched_getaffinity */
314#else
315#error Unknown or unsupported architecture
316#endif /* KMP_ARCH_* */
317#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
318#include <pthread.h>
319#include <pthread_np.h>
320#elif KMP_OS_NETBSD
321#include <pthread.h>
322#include <sched.h>
323#elif KMP_OS_AIX
324#include <sys/dr.h>
325#include <sys/rset.h>
326#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
327#define GET_NUMBER_SMT_SETS 0x0004
328extern "C" int syssmt(int flags, int, int, int *);
329#endif
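
On Linux, the Mask implementation below issues sched_getaffinity/sched_setaffinity directly through syscall() using the __NR_* numbers pinned above. A standalone, hedged sketch of that call shape; the real runtime sizes the buffer with __kmp_affin_mask_size rather than a fixed cpu_set_t, and example_query_affinity is an invented name:

#include <errno.h>
#include <sched.h>       // cpu_set_t
#include <sys/syscall.h> // __NR_sched_getaffinity
#include <unistd.h>      // syscall()

static int example_query_affinity(cpu_set_t *set) {
  // Thread id 0 means "the calling thread"; a non-negative return is success.
  long rc = syscall(__NR_sched_getaffinity, 0, sizeof(cpu_set_t), set);
  return rc >= 0 ? 0 : errno;
}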
330class KMPNativeAffinity : public KMPAffinity {
331 class Mask : public KMPAffinity::Mask {
332 typedef unsigned long mask_t;
333 typedef decltype(__kmp_affin_mask_size) mask_size_type;
334 static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
335 static const mask_t ONE = 1;
336 mask_size_type get_num_mask_types() const {
337 return __kmp_affin_mask_size / sizeof(mask_t);
338 }
339
340 public:
341 mask_t *mask;
342 Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
343 ~Mask() {
344 if (mask)
345 __kmp_free(mask);
346 }
347 void set(int i) override {
348 mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
349 }
350 bool is_set(int i) const override {
351 return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
352 }
353 void clear(int i) override {
354 mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
355 }
356 void zero() override {
357 mask_size_type e = get_num_mask_types();
358 for (mask_size_type i = 0; i < e; ++i)
359 mask[i] = (mask_t)0;
360 }
361 bool empty() const override {
362 mask_size_type e = get_num_mask_types();
363 for (mask_size_type i = 0; i < e; ++i)
364 if (mask[i] != (mask_t)0)
365 return false;
366 return true;
367 }
368 void copy(const KMPAffinity::Mask *src) override {
369 const Mask *convert = static_cast<const Mask *>(src);
370 mask_size_type e = get_num_mask_types();
371 for (mask_size_type i = 0; i < e; ++i)
372 mask[i] = convert->mask[i];
373 }
374 void bitwise_and(const KMPAffinity::Mask *rhs) override {
375 const Mask *convert = static_cast<const Mask *>(rhs);
376 mask_size_type e = get_num_mask_types();
377 for (mask_size_type i = 0; i < e; ++i)
378 mask[i] &= convert->mask[i];
379 }
380 void bitwise_or(const KMPAffinity::Mask *rhs) override {
381 const Mask *convert = static_cast<const Mask *>(rhs);
382 mask_size_type e = get_num_mask_types();
383 for (mask_size_type i = 0; i < e; ++i)
384 mask[i] |= convert->mask[i];
385 }
386 void bitwise_not() override {
387 mask_size_type e = get_num_mask_types();
388 for (mask_size_type i = 0; i < e; ++i)
389 mask[i] = ~(mask[i]);
390 }
391 bool is_equal(const KMPAffinity::Mask *rhs) const override {
392 const Mask *convert = static_cast<const Mask *>(rhs);
393 mask_size_type e = get_num_mask_types();
394 for (mask_size_type i = 0; i < e; ++i)
395 if (mask[i] != convert->mask[i])
396 return false;
397 return true;
398 }
399 int begin() const override {
400 int retval = 0;
401 while (retval < end() && !is_set(retval))
402 ++retval;
403 return retval;
404 }
405 int end() const override {
406 int e;
407 __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
408 return e;
409 }
410 int next(int previous) const override {
411 int retval = previous + 1;
412 while (retval < end() && !is_set(retval))
413 ++retval;
414 return retval;
415 }
416#if KMP_OS_AIX
417 // On AIX, we don't have a way to get the CPU(s) a thread is bound to.
418 // This routine is only used to get the full mask.
419 int get_system_affinity(bool abort_on_error) override {
420 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
421 "Illegal get affinity operation when not capable");
422
423 (void)abort_on_error;
424
425 // Set the mask with all CPUs that are available.
426 for (int i = 0; i < __kmp_xproc; ++i)
427 KMP_CPU_SET(i, this);
428 return 0;
429 }
430 int set_system_affinity(bool abort_on_error) const override {
431 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
432
433 "Illegal set affinity operation when not capable");
434
435 int location;
436 int gtid = __kmp_entry_gtid();
437 int tid = thread_self();
438
439 // Unbind the thread if it was bound to any processors before so that
440 // we can bind the thread to the CPUs specified by the mask and not others.
441 int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);
442
443 // On AIX, the bindprocessor() system call can only bind to a single CPU,
444 // not to a set of CPUs.
445 KMP_CPU_SET_ITERATE(location, this) {
446 if (KMP_CPU_ISSET(location, this)) {
447 retval = bindprocessor(BINDTHREAD, tid, location);
448 if (retval == -1 && errno == 1) {
449 rsid_t rsid;
450 rsethandle_t rsh;
451 // Put something in rsh to prevent compiler warning
452 // about uninitialized use
453 rsh = rs_alloc(RS_EMPTY);
454 rsid.at_pid = getpid();
455 if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
456 retval = ra_detachrset(R_PROCESS, rsid, 0);
457 retval = bindprocessor(BINDTHREAD, tid, location);
458 }
459 }
460 if (retval == 0) {
461 KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
462 "T#%d to cpu=%d.\n",
463 gtid, location));
464 continue;
465 }
466 int error = errno;
467 if (abort_on_error) {
468 __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
469 KMP_ERR(error), __kmp_msg_null);
470 KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
471 "T#%d to cpu=%d, errno=%d.\n",
472 gtid, location, error));
473 return error;
474 }
475 }
476 }
477 return 0;
478 }
479#else // !KMP_OS_AIX
480 int get_system_affinity(bool abort_on_error) override {
481 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
482 "Illegal get affinity operation when not capable");
483#if KMP_OS_LINUX
484 long retval =
485 syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
486#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
487 int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
488 reinterpret_cast<cpuset_t *>(mask));
489 int retval = (r == 0 ? 0 : -1);
490#endif
491 if (retval >= 0) {
492 return 0;
493 }
494 int error = errno;
495 if (abort_on_error) {
496 __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
497 KMP_ERR(error), __kmp_msg_null);
498 }
499 return error;
500 }
501 int set_system_affinity(bool abort_on_error) const override {
502 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
503 "Illegal set affinity operation when not capable");
504#if KMP_OS_LINUX
505 long retval =
506 syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
507#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
508 int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
509 reinterpret_cast<cpuset_t *>(mask));
510 int retval = (r == 0 ? 0 : -1);
511#endif
512 if (retval >= 0) {
513 return 0;
514 }
515 int error = errno;
516 if (abort_on_error) {
517 __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
518 KMP_ERR(error), __kmp_msg_null);
519 }
520 return error;
521 }
522#endif // KMP_OS_AIX
523 };
524 void determine_capable(const char *env_var) override {
525 __kmp_affinity_determine_capable(env_var);
526 }
527 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
528 KMPAffinity::Mask *allocate_mask() override {
529 KMPNativeAffinity::Mask *retval = new Mask();
530 return retval;
531 }
532 void deallocate_mask(KMPAffinity::Mask *m) override {
533 KMPNativeAffinity::Mask *native_mask =
534 static_cast<KMPNativeAffinity::Mask *>(m);
535 delete native_mask;
536 }
537 KMPAffinity::Mask *allocate_mask_array(int num) override {
538 return new Mask[num];
539 }
540 void deallocate_mask_array(KMPAffinity::Mask *array) override {
541 Mask *linux_array = static_cast<Mask *>(array);
542 delete[] linux_array;
543 }
544 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
545 int index) override {
546 Mask *linux_array = static_cast<Mask *>(array);
547 return &(linux_array[index]);
548 }
549 api_type get_api_type() const override { return NATIVE_OS; }
550};
551#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
552 || KMP_OS_AIX */
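
The native Mask above stores CPUs as bits packed into an array of unsigned long words, and set()/is_set()/clear() locate a CPU with a divide and modulo by BITS_PER_MASK_T. A small standalone illustration of that arithmetic (helper names invented for the example):

#include <climits>

typedef unsigned long mask_t;
static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;

// With 64-bit words, logical CPU 70 lives in word 70 / 64 == 1 at bit
// 70 % 64 == 6, exactly the index math Mask::set() performs.
static inline void example_set(mask_t *words, int i) {
  words[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
}
static inline bool example_is_set(const mask_t *words, int i) {
  return (words[i / BITS_PER_MASK_T] >> (i % BITS_PER_MASK_T)) & 1;
}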
553
554#if KMP_OS_WINDOWS
555class KMPNativeAffinity : public KMPAffinity {
556 class Mask : public KMPAffinity::Mask {
557 typedef ULONG_PTR mask_t;
558 static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
559 mask_t *mask;
560
561 public:
562 Mask() {
563 mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
564 }
565 ~Mask() {
566 if (mask)
567 __kmp_free(mask);
568 }
569 void set(int i) override {
570 mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
571 }
572 bool is_set(int i) const override {
573 return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
574 }
575 void clear(int i) override {
576 mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
577 }
578 void zero() override {
579 for (int i = 0; i < __kmp_num_proc_groups; ++i)
580 mask[i] = 0;
581 }
582 bool empty() const override {
583 for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
584 if (mask[i])
585 return false;
586 return true;
587 }
588 void copy(const KMPAffinity::Mask *src) override {
589 const Mask *convert = static_cast<const Mask *>(src);
590 for (int i = 0; i < __kmp_num_proc_groups; ++i)
591 mask[i] = convert->mask[i];
592 }
593 void bitwise_and(const KMPAffinity::Mask *rhs) override {
594 const Mask *convert = static_cast<const Mask *>(rhs);
595 for (int i = 0; i < __kmp_num_proc_groups; ++i)
596 mask[i] &= convert->mask[i];
597 }
598 void bitwise_or(const KMPAffinity::Mask *rhs) override {
599 const Mask *convert = static_cast<const Mask *>(rhs);
600 for (int i = 0; i < __kmp_num_proc_groups; ++i)
601 mask[i] |= convert->mask[i];
602 }
603 void bitwise_not() override {
604 for (int i = 0; i < __kmp_num_proc_groups; ++i)
605 mask[i] = ~(mask[i]);
606 }
607 bool is_equal(const KMPAffinity::Mask *rhs) const override {
608 const Mask *convert = static_cast<const Mask *>(rhs);
609 for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
610 if (mask[i] != convert->mask[i])
611 return false;
612 return true;
613 }
614 int begin() const override {
615 int retval = 0;
616 while (retval < end() && !is_set(retval))
617 ++retval;
618 return retval;
619 }
620 int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
621 int next(int previous) const override {
622 int retval = previous + 1;
623 while (retval < end() && !is_set(retval))
624 ++retval;
625 return retval;
626 }
627 int set_process_affinity(bool abort_on_error) const override {
628 if (__kmp_num_proc_groups <= 1) {
629 if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
630 DWORD error = GetLastError();
631 if (abort_on_error) {
632 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
633 __kmp_msg_null);
634 }
635 return error;
636 }
637 }
638 return 0;
639 }
640 int set_system_affinity(bool abort_on_error) const override {
641 if (__kmp_num_proc_groups > 1) {
642 // Check for a valid mask.
643 GROUP_AFFINITY ga;
644 int group = get_proc_group();
645 if (group < 0) {
646 if (abort_on_error) {
647 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
648 }
649 return -1;
650 }
651 // Transform the bit vector into a GROUP_AFFINITY struct
652 // and make the system call to set affinity.
653 ga.Group = group;
654 ga.Mask = mask[group];
655 ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
656
657 KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
658 if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
659 DWORD error = GetLastError();
660 if (abort_on_error) {
661 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
662 __kmp_msg_null);
663 }
664 return error;
665 }
666 } else {
667 if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
668 DWORD error = GetLastError();
669 if (abort_on_error) {
670 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
671 __kmp_msg_null);
672 }
673 return error;
674 }
675 }
676 return 0;
677 }
678 int get_system_affinity(bool abort_on_error) override {
679 if (__kmp_num_proc_groups > 1) {
680 this->zero();
681 GROUP_AFFINITY ga;
682 KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
683 if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
684 DWORD error = GetLastError();
685 if (abort_on_error) {
686 __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
687 KMP_ERR(error), __kmp_msg_null);
688 }
689 return error;
690 }
691 if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
692 (ga.Mask == 0)) {
693 return -1;
694 }
695 mask[ga.Group] = ga.Mask;
696 } else {
697 mask_t newMask, sysMask, retval;
698 if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
699 DWORD error = GetLastError();
700 if (abort_on_error) {
701 __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
702 KMP_ERR(error), __kmp_msg_null);
703 }
704 return error;
705 }
706 retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
707 if (!retval) {
708 DWORD error = GetLastError();
709 if (abort_on_error) {
710 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
711 KMP_ERR(error), __kmp_msg_null);
712 }
713 return error;
714 }
715 newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
716 if (!newMask) {
717 DWORD error = GetLastError();
718 if (abort_on_error) {
719 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
720 KMP_ERR(error), __kmp_msg_null);
721 }
722 }
723 *mask = retval;
724 }
725 return 0;
726 }
727 int get_proc_group() const override {
728 int group = -1;
729 if (__kmp_num_proc_groups == 1) {
730 return 1;
731 }
732 for (int i = 0; i < __kmp_num_proc_groups; i++) {
733 if (mask[i] == 0)
734 continue;
735 if (group >= 0)
736 return -1;
737 group = i;
738 }
739 return group;
740 }
741 };
742 void determine_capable(const char *env_var) override {
743 __kmp_affinity_determine_capable(env_var);
744 }
745 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
746 KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
747 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
748 KMPAffinity::Mask *allocate_mask_array(int num) override {
749 return new Mask[num];
750 }
751 void deallocate_mask_array(KMPAffinity::Mask *array) override {
752 Mask *windows_array = static_cast<Mask *>(array);
753 delete[] windows_array;
754 }
755 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
756 int index) override {
757 Mask *windows_array = static_cast<Mask *>(array);
758 return &(windows_array[index]);
759 }
760 api_type get_api_type() const override { return NATIVE_OS; }
761};
762#endif /* KMP_OS_WINDOWS */
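
When more than one Windows processor group exists, set_system_affinity() above can only honor masks confined to a single group, because a thread's GROUP_AFFINITY names exactly one group; get_proc_group() encodes that rule. A compact editorial restatement with illustrative types:

static int example_group_of(const unsigned long long *per_group_words,
                            int num_groups) {
  int group = -1;
  for (int i = 0; i < num_groups; ++i) {
    if (per_group_words[i] == 0)
      continue; // no CPUs requested in this group
    if (group >= 0)
      return -1; // mask spans two or more groups: not representable
    group = i;
  }
  return group; // the single covered group, or -1 for an empty mask
}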
763#endif /* KMP_AFFINITY_SUPPORTED */
764
765// Describe an attribute for a level in the machine topology
766struct kmp_hw_attr_t {
767 int core_type : 8;
768 int core_eff : 8;
769 unsigned valid : 1;
770 unsigned reserved : 15;
771
772 static const int UNKNOWN_CORE_EFF = -1;
773
774 kmp_hw_attr_t()
775 : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
776 valid(0), reserved(0) {}
777 void set_core_type(kmp_hw_core_type_t type) {
778 valid = 1;
779 core_type = type;
780 }
781 void set_core_eff(int eff) {
782 valid = 1;
783 core_eff = eff;
784 }
785 kmp_hw_core_type_t get_core_type() const {
786 return (kmp_hw_core_type_t)core_type;
787 }
788 int get_core_eff() const { return core_eff; }
789 bool is_core_type_valid() const {
790 return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
791 }
792 bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
793 operator bool() const { return valid; }
794 void clear() {
795 core_type = KMP_HW_CORE_TYPE_UNKNOWN;
796 core_eff = UNKNOWN_CORE_EFF;
797 valid = 0;
798 }
799 bool contains(const kmp_hw_attr_t &other) const {
800 if (!valid && !other.valid)
801 return true;
802 if (valid && other.valid) {
803 if (other.is_core_type_valid()) {
804 if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
805 return false;
806 }
807 if (other.is_core_eff_valid()) {
808 if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
809 return false;
810 }
811 return true;
812 }
813 return false;
814 }
815#if KMP_AFFINITY_SUPPORTED
816 bool contains(const kmp_affinity_attrs_t &attr) const {
817 if (!valid && !attr.valid)
818 return true;
819 if (valid && attr.valid) {
820 if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
821 return (is_core_type_valid() &&
822 (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
823 if (attr.core_eff != UNKNOWN_CORE_EFF)
824 return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
825 return true;
826 }
827 return false;
828 }
829#endif // KMP_AFFINITY_SUPPORTED
830 bool operator==(const kmp_hw_attr_t &rhs) const {
831 return (rhs.valid == valid && rhs.core_eff == core_eff &&
832 rhs.core_type == core_type);
833 }
834 bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
835};
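
A short illustrative sketch (not from the header) of how kmp_hw_attr_t matching reads: an attribute acts as a filter, and contains() succeeds when every field the filter pins down agrees with the core's attribute. The scenario below only exercises the core-efficiency field:

kmp_hw_attr_t core_attr; // as attached to a core in the topology
core_attr.set_core_eff(1);

kmp_hw_attr_t filter; // e.g. derived from a KMP_HW_SUBSET attribute
filter.set_core_eff(1);
bool selected = core_attr.contains(filter); // true: efficiency 1 matches

filter.set_core_eff(0);
selected = core_attr.contains(filter); // false: efficiencies differ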
836
837#if KMP_AFFINITY_SUPPORTED
838KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
839#endif
840
841class kmp_hw_thread_t {
842public:
843 static const int UNKNOWN_ID = -1;
844 static const int MULTIPLE_ID = -2;
845 static int compare_ids(const void *a, const void *b);
846 static int compare_compact(const void *a, const void *b);
847 int ids[KMP_HW_LAST];
848 int sub_ids[KMP_HW_LAST];
849 bool leader;
850 int os_id;
852 kmp_hw_attr_t attrs;
853
854 void print() const;
855 void clear() {
856 for (int i = 0; i < (int)KMP_HW_LAST; ++i)
857 ids[i] = UNKNOWN_ID;
858 leader = false;
859 attrs.clear();
860 }
861};
862
863class kmp_topology_t {
864
865 struct flags_t {
866 int uniform : 1;
867 int reserved : 31;
868 };
869
870 int depth;
871
872 // The following arrays are all 'depth' long. They are allocated to hold
873 // up to KMP_HW_LAST objects if needed, so layers can be added without
874 // reallocating any array.
875
876 // Ordered array of the types in the topology
877 kmp_hw_t *types;
878
879 // Keep quick topology ratios, for non-uniform topologies,
880 // this ratio holds the max number of itemAs per itemB
881 // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
882 int *ratio;
883
884 // Storage containing the absolute number of each topology layer
885 int *count;
886
887 // The number of core efficiencies. This is only useful for hybrid
888 // topologies. Core efficiencies will range from 0 to num efficiencies - 1
889 int num_core_efficiencies;
890 int num_core_types;
891 kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
892
893 // The hardware threads array
894 // hw_threads is num_hw_threads long
895 // Each hw_thread's ids and sub_ids are depth deep
896 int num_hw_threads;
897 kmp_hw_thread_t *hw_threads;
898
899 // Equivalence hash where the key is the hardware topology item
900 // and the value is the equivalent hardware topology type in the
901 // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
902 // known equivalence for the topology type
903 kmp_hw_t equivalent[KMP_HW_LAST];
904
905 // Flags describing the topology
906 flags_t flags;
907
908 // Compact value used during sort_compact()
909 int compact;
910
911#if KMP_GROUP_AFFINITY
912 // Insert topology information about Windows Processor groups
913 void _insert_windows_proc_groups();
914#endif
915
916 // Count each item & get the num x's per y
917 // e.g., get the number of cores and the number of threads per core
918 // for each (x, y) in (KMP_HW_* , KMP_HW_*)
919 void _gather_enumeration_information();
920
921 // Remove layers that don't add information to the topology.
922 // This is done by having the layer take on the id = UNKNOWN_ID (-1)
923 void _remove_radix1_layers();
924
925 // Find out if the topology is uniform
926 void _discover_uniformity();
927
928 // Set all the sub_ids for each hardware thread
929 void _set_sub_ids();
930
931 // Set global affinity variables describing the number of threads per
932 // core, the number of packages, the number of cores per package, and
933 // the number of cores.
934 void _set_globals();
935
936 // Set the last level cache equivalent type
937 void _set_last_level_cache();
938
939 // Return the number of cores with a particular attribute, 'attr'.
940 // If 'find_all' is true, then find all cores on the machine, otherwise find
941 // all cores per the layer 'above'
942 int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
943 bool find_all = false) const;
944
945public:
946 // Force use of allocate()/deallocate()
947 kmp_topology_t() = delete;
948 kmp_topology_t(const kmp_topology_t &t) = delete;
949 kmp_topology_t(kmp_topology_t &&t) = delete;
950 kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
951 kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
952
953 static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
954 static void deallocate(kmp_topology_t *);
955
956 // Functions used in create_map() routines
957 kmp_hw_thread_t &at(int index) {
958 KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
959 return hw_threads[index];
960 }
961 const kmp_hw_thread_t &at(int index) const {
962 KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
963 return hw_threads[index];
964 }
965 int get_num_hw_threads() const { return num_hw_threads; }
966 void sort_ids() {
967 qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
968 kmp_hw_thread_t::compare_ids);
969 }
970
971 // Insert a new topology layer after allocation
972 void insert_layer(kmp_hw_t type, const int *ids);
973
974 // Check if the hardware ids are unique. If they are, return true;
975 // otherwise return false.
976 bool check_ids() const;
977
978 // Function to call after the create_map() routine
979 void canonicalize();
980 void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
981
982// Functions used after canonicalize() called
983
984#if KMP_AFFINITY_SUPPORTED
985 // Set the granularity for affinity settings
986 void set_granularity(kmp_affinity_t &stgs) const;
987 bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
988 bool restrict_to_mask(const kmp_affin_mask_t *mask);
989 bool filter_hw_subset();
990#endif
991 bool is_uniform() const { return flags.uniform; }
992 // Tell whether a type is a valid type in the topology
993 // returns KMP_HW_UNKNOWN when there is no equivalent type
994 kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
995 if (type == KMP_HW_UNKNOWN)
996 return KMP_HW_UNKNOWN;
997 return equivalent[type];
998 }
999 // Set type1 = type2
1000 void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
1001 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
1002 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
1003 kmp_hw_t real_type2 = equivalent[type2];
1004 if (real_type2 == KMP_HW_UNKNOWN)
1005 real_type2 = type2;
1006 equivalent[type1] = real_type2;
1007 // This loop is required since any of the types may have been set to
1008 // be equivalent to type1. They all must be checked and reset to type2.
1009 KMP_FOREACH_HW_TYPE(type) {
1010 if (equivalent[type] == type1) {
1011 equivalent[type] = real_type2;
1012 }
1013 }
1014 }
1015 // Calculate number of types corresponding to level1
1016 // per types corresponding to level2 (e.g., number of threads per core)
1017 int calculate_ratio(int level1, int level2) const {
1018 KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
1019 KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
1020 int r = 1;
1021 for (int level = level1; level > level2; --level)
1022 r *= ratio[level];
1023 return r;
1024 }
1025 int get_ratio(int level) const {
1026 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1027 return ratio[level];
1028 }
1029 int get_depth() const { return depth; };
1030 kmp_hw_t get_type(int level) const {
1031 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1032 return types[level];
1033 }
1034 int get_level(kmp_hw_t type) const {
1035 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
1036 int eq_type = equivalent[type];
1037 if (eq_type == KMP_HW_UNKNOWN)
1038 return -1;
1039 for (int i = 0; i < depth; ++i)
1040 if (types[i] == eq_type)
1041 return i;
1042 return -1;
1043 }
1044 int get_count(int level) const {
1045 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1046 return count[level];
1047 }
1048 // Return the total number of cores with attribute 'attr'
1049 int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
1050 return _get_ncores_with_attr(attr, -1, true);
1051 }
1052 // Return the number of cores with attribute
1053 // 'attr' per topology level 'above'
1054 int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
1055 return _get_ncores_with_attr(attr, above, false);
1056 }
1057
1058#if KMP_AFFINITY_SUPPORTED
1059 friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
1060 void sort_compact(kmp_affinity_t &affinity) {
1061 compact = affinity.compact;
1062 qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
1063 kmp_hw_thread_t::compare_compact);
1064 }
1065#endif
1066 void print(const char *env_var = "KMP_AFFINITY") const;
1067 void dump() const;
1068};
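
A worked example (editorial) of the ratio bookkeeping described in the comments above: for [ 4 packages | 6 cores / package | 2 threads / core ], ratio is {4, 6, 2} and count is {4, 24, 48}. calculate_ratio(level1, level2) multiplies ratio[level] for every level deeper than level2 up to and including level1, so threads per package can be computed as sketched here, assuming __kmp_topology has been built and canonicalized:

int pkg_level = __kmp_topology->get_level(KMP_HW_SOCKET);
int thr_level = __kmp_topology->get_level(KMP_HW_THREAD);
// r = ratio[thread level] * ratio[core level] = 2 * 6 = 12
int threads_per_pkg = __kmp_topology->calculate_ratio(thr_level, pkg_level);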
1069extern kmp_topology_t *__kmp_topology;
1070
1071class kmp_hw_subset_t {
1072 const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
1073
1074public:
1075 // Describe a machine topology item in KMP_HW_SUBSET
1076 struct item_t {
1077 kmp_hw_t type;
1078 int num_attrs;
1079 int num[MAX_ATTRS];
1080 int offset[MAX_ATTRS];
1081 kmp_hw_attr_t attr[MAX_ATTRS];
1082 };
1083 // Put parenthesis around max to avoid accidental use of Windows max macro.
1084 const static int USE_ALL = (std::numeric_limits<int>::max)();
1085
1086private:
1087 int depth;
1088 int capacity;
1089 item_t *items;
1090 kmp_uint64 set;
1091 bool absolute;
1092 // The set must be able to handle up to KMP_HW_LAST number of layers
1093 KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
1094 // Sorting the KMP_HW_SUBSET items to follow topology order
1095 // All unknown topology types will be at the beginning of the subset
1096 static int hw_subset_compare(const void *i1, const void *i2) {
1097 kmp_hw_t type1 = ((const item_t *)i1)->type;
1098 kmp_hw_t type2 = ((const item_t *)i2)->type;
1099 int level1 = __kmp_topology->get_level(type1);
1100 int level2 = __kmp_topology->get_level(type2);
1101 return level1 - level2;
1102 }
1103
1104public:
1105 // Force use of allocate()/deallocate()
1106 kmp_hw_subset_t() = delete;
1107 kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
1108 kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
1109 kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
1110 kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
1111
1112 static kmp_hw_subset_t *allocate() {
1113 int initial_capacity = 5;
1114 kmp_hw_subset_t *retval =
1115 (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
1116 retval->depth = 0;
1117 retval->capacity = initial_capacity;
1118 retval->set = 0ull;
1119 retval->absolute = false;
1120 retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
1121 return retval;
1122 }
1123 static void deallocate(kmp_hw_subset_t *subset) {
1124 __kmp_free(subset->items);
1125 __kmp_free(subset);
1126 }
1127 void set_absolute() { absolute = true; }
1128 bool is_absolute() const { return absolute; }
1129 void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
1130 for (int i = 0; i < depth; ++i) {
1131 // Found an existing item for this layer type
1132 // Add the num, offset, and attr to this item
1133 if (items[i].type == type) {
1134 int idx = items[i].num_attrs++;
1135 if ((size_t)idx >= MAX_ATTRS)
1136 return;
1137 items[i].num[idx] = num;
1138 items[i].offset[idx] = offset;
1139 items[i].attr[idx] = attr;
1140 return;
1141 }
1142 }
1143 if (depth == capacity - 1) {
1144 capacity *= 2;
1145 item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
1146 for (int i = 0; i < depth; ++i)
1147 new_items[i] = items[i];
1148 __kmp_free(items);
1149 items = new_items;
1150 }
1151 items[depth].num_attrs = 1;
1152 items[depth].type = type;
1153 items[depth].num[0] = num;
1154 items[depth].offset[0] = offset;
1155 items[depth].attr[0] = attr;
1156 depth++;
1157 set |= (1ull << type);
1158 }
1159 int get_depth() const { return depth; }
1160 const item_t &at(int index) const {
1161 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1162 return items[index];
1163 }
1164 item_t &at(int index) {
1165 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1166 return items[index];
1167 }
1168 void remove(int index) {
1169 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1170 set &= ~(1ull << items[index].type);
1171 for (int j = index + 1; j < depth; ++j) {
1172 items[j - 1] = items[j];
1173 }
1174 depth--;
1175 }
1176 void sort() {
1177 KMP_DEBUG_ASSERT(__kmp_topology);
1178 qsort(items, depth, sizeof(item_t), hw_subset_compare);
1179 }
1180 bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
1181
1182 // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
1183 // This means putting each of {sockets, cores, threads} in the topology if
1184 // they are not specified:
1185 // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
1186 // e.g., 3module => *s,3module,*c,*t
1187 // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
1188 // are expecting the traditional sockets/cores/threads topology. For newer
1189 // hardware, there can be intervening layers like dies/tiles/modules
1190 // (usually corresponding to a cache level). So when a user asks for
1191 // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
1192 // should get 12 hardware threads across 6 cores and effectively ignore the
1193 // module layer.
1194 void canonicalize(const kmp_topology_t *top) {
1195 // Layers to target for KMP_HW_SUBSET canonicalization
1196 kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1197
1198 // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
1199 if (is_absolute())
1200 return;
1201
1202 // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
1203 // topology doesn't have these layers
1204 for (kmp_hw_t type : targeted)
1205 if (top->get_level(type) == KMP_HW_UNKNOWN)
1206 return;
1207
1208 // Put targeted layers in topology if they do not exist
1209 for (kmp_hw_t type : targeted) {
1210 bool found = false;
1211 for (int i = 0; i < get_depth(); ++i) {
1212 if (top->get_equivalent_type(items[i].type) == type) {
1213 found = true;
1214 break;
1215 }
1216 }
1217 if (!found) {
1218 push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
1219 }
1220 }
1221 sort();
1222 // Set as an absolute topology that only targets the targeted layers
1223 set_absolute();
1224 }
1225 void dump() const {
1226 printf("**********************\n");
1227 printf("*** kmp_hw_subset: ***\n");
1228 printf("* depth: %d\n", depth);
1229 printf("* items:\n");
1230 for (int i = 0; i < depth; ++i) {
1231 printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1232 for (int j = 0; j < items[i].num_attrs; ++j) {
1233 printf(" num: %d, offset: %d, attr: ", items[i].num[j],
1234 items[i].offset[j]);
1235 if (!items[i].attr[j]) {
1236 printf(" (none)\n");
1237 } else {
1238 printf(
1239 " core_type = %s, core_eff = %d\n",
1240 __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1241 items[i].attr[j].get_core_eff());
1242 }
1243 }
1244 }
1245 printf("* set: 0x%llx\n", set);
1246 printf("* absolute: %d\n", absolute);
1247 printf("**********************\n");
1248 }
1249};
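
An editorial sketch of the canonicalization documented above, using only the public kmp_hw_subset_t methods; the scenario (KMP_HW_SUBSET=2c on a socket/core/thread machine) is illustrative:

kmp_hw_subset_t *subset = kmp_hw_subset_t::allocate();
// The parser produced a single CORE item, i.e. "2c".
subset->push_back(2, KMP_HW_CORE, /*offset=*/0, kmp_hw_attr_t{});
// Fill in the unspecified socket and thread layers with USE_ALL and mark the
// subset absolute: conceptually "*s,2c,*t".
subset->canonicalize(__kmp_topology);
kmp_hw_subset_t::deallocate(subset);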
1251
1252/* A structure for holding machine-specific hierarchy info to be computed once
1253 at init. This structure represents a mapping of threads to the actual machine
1254 hierarchy, or to our best guess at what the hierarchy might be, for the
1255 purpose of performing an efficient barrier. In the worst case, when there is
1256 no machine hierarchy information, it produces a tree suitable for a barrier,
1257 similar to the tree used in the hyper barrier. */
1258class hierarchy_info {
1259public:
1260 /* Good default values for number of leaves and branching factor, given no
1261 affinity information. Behaves a bit like hyper barrier. */
1262 static const kmp_uint32 maxLeaves = 4;
1263 static const kmp_uint32 minBranch = 4;
1264 /** Number of levels in the hierarchy. Typical levels are threads/core,
1265 cores/package or socket, packages/node, nodes/machine, etc. We don't want
1266 to get specific with nomenclature. When the machine is oversubscribed we
1267 add levels to duplicate the hierarchy, doubling the thread capacity of the
1268 hierarchy each time we add a level. */
1269 kmp_uint32 maxLevels;
1270
1271 /** This is specifically the depth of the machine configuration hierarchy, in
1272 terms of the number of levels along the longest path from root to any
1273 leaf. It corresponds to the number of entries in numPerLevel if we exclude
1274 all but one trailing 1. */
1275 kmp_uint32 depth;
1276 kmp_uint32 base_num_threads;
1277 enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1278 volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1279 // 2=initialization in progress
1280 volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1281
1282 /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
1283 the parent of a node at level i has. For example, if we have a machine
1284 with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
1285 {2, 4, 4, 1, 1}. All empty levels are set to 1. */
1286 kmp_uint32 *numPerLevel;
1287 kmp_uint32 *skipPerLevel;
1288
1289 void deriveLevels() {
1290 int hier_depth = __kmp_topology->get_depth();
1291 for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1292 numPerLevel[level] = __kmp_topology->get_ratio(i);
1293 }
1294 }
1295
1296 hierarchy_info()
1297 : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1298
1299 void fini() {
1300 if (!uninitialized && numPerLevel) {
1301 __kmp_free(numPerLevel);
1302 numPerLevel = NULL;
1303 uninitialized = not_initialized;
1304 }
1305 }
1306
1307 void init(int num_addrs) {
1308 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1309 &uninitialized, not_initialized, initializing);
1310 if (bool_result == 0) { // Wait for initialization
1311 while (TCR_1(uninitialized) != initialized)
1312 KMP_CPU_PAUSE();
1313 return;
1314 }
1315 KMP_DEBUG_ASSERT(bool_result == 1);
1316
1317 /* Explicitly initialize the data fields here to prevent use of a dirty
1318 value observed when the static library is re-initialized multiple times
1319 (e.g. when a non-OpenMP thread repeatedly launches/joins a thread that
1320 uses OpenMP). */
1321 depth = 1;
1322 resizing = 0;
1323 maxLevels = 7;
1324 numPerLevel =
1325 (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1326 skipPerLevel = &(numPerLevel[maxLevels]);
1327 for (kmp_uint32 i = 0; i < maxLevels;
1328 ++i) { // init numPerLevel[*] to 1 item per level
1329 numPerLevel[i] = 1;
1330 skipPerLevel[i] = 1;
1331 }
1332
1333 // Sort table by physical ID
1334 if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1335 deriveLevels();
1336 } else {
1337 numPerLevel[0] = maxLeaves;
1338 numPerLevel[1] = num_addrs / maxLeaves;
1339 if (num_addrs % maxLeaves)
1340 numPerLevel[1]++;
1341 }
1342
1343 base_num_threads = num_addrs;
1344 for (int i = maxLevels - 1; i >= 0;
1345 --i) // count non-empty levels to get depth
1346 if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1347 depth++;
1348
1349 kmp_uint32 branch = minBranch;
1350 if (numPerLevel[0] == 1)
1351 branch = num_addrs / maxLeaves;
1352 if (branch < minBranch)
1353 branch = minBranch;
1354 for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1355 while (numPerLevel[d] > branch ||
1356 (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1357 if (numPerLevel[d] & 1)
1358 numPerLevel[d]++;
1359 numPerLevel[d] = numPerLevel[d] >> 1;
1360 if (numPerLevel[d + 1] == 1)
1361 depth++;
1362 numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1363 }
1364 if (numPerLevel[0] == 1) {
1365 branch = branch >> 1;
1366 if (branch < 4)
1367 branch = minBranch;
1368 }
1369 }
1370
1371 for (kmp_uint32 i = 1; i < depth; ++i)
1372 skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1373 // Fill in hierarchy in the case of oversubscription
1374 for (kmp_uint32 i = depth; i < maxLevels; ++i)
1375 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1376
1377 uninitialized = initialized; // One writer
1378 }
1379
1380 // Resize the hierarchy if nproc changes to something larger than before
1381 void resize(kmp_uint32 nproc) {
1382 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1383 while (bool_result == 0) { // someone else is trying to resize
1384 KMP_CPU_PAUSE();
1385 if (nproc <= base_num_threads) // happy with other thread's resize
1386 return;
1387 else // try to resize
1388 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1389 }
1390 KMP_DEBUG_ASSERT(bool_result != 0);
1391 if (nproc <= base_num_threads)
1392 return; // happy with other thread's resize
1393
1394 // Calculate new maxLevels
1395 kmp_uint32 old_sz = skipPerLevel[depth - 1];
1396 kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1397 // First see if old maxLevels is enough to contain new size
1398 for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1399 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1400 numPerLevel[i - 1] *= 2;
1401 old_sz *= 2;
1402 depth++;
1403 }
1404 if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1405 while (nproc > old_sz) {
1406 old_sz *= 2;
1407 incs++;
1408 depth++;
1409 }
1410 maxLevels += incs;
1411
1412 // Resize arrays
1413 kmp_uint32 *old_numPerLevel = numPerLevel;
1414 kmp_uint32 *old_skipPerLevel = skipPerLevel;
1415 numPerLevel = skipPerLevel = NULL;
1416 numPerLevel =
1417 (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1418 skipPerLevel = &(numPerLevel[maxLevels]);
1419
1420 // Copy old elements from old arrays
1421 for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1422 // init numPerLevel[*] to 1 item per level
1423 numPerLevel[i] = old_numPerLevel[i];
1424 skipPerLevel[i] = old_skipPerLevel[i];
1425 }
1426
1427 // Init new elements in arrays to 1
1428 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1429 // init numPerLevel[*] to 1 item per level
1430 numPerLevel[i] = 1;
1431 skipPerLevel[i] = 1;
1432 }
1433
1434 // Free old arrays
1435 __kmp_free(old_numPerLevel);
1436 }
1437
1438 // Fill in oversubscription levels of hierarchy
1439 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1440 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1441
1442 base_num_threads = nproc;
1443 resizing = 0; // One writer
1444 }
1445};
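
A worked example (editorial) of what init() builds for the machine described in the comments above (4 packages, 4 cores/package, 2 HT/core): numPerLevel starts as {2, 4, 4, 1, 1, ...} and depth is counted as 4, so skipPerLevel grows as a running product up to the machine depth and then doubles for the oversubscription levels:

kmp_uint32 numPerLevel[7] = {2, 4, 4, 1, 1, 1, 1};
kmp_uint32 skipPerLevel[7] = {1, 1, 1, 1, 1, 1, 1};
kmp_uint32 depth = 4; // non-1 levels plus one trailing 1, as counted in init()
for (kmp_uint32 i = 1; i < depth; ++i)
  skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1]; // 1, 2, 8, 32
for (kmp_uint32 i = depth; i < 7; ++i)
  skipPerLevel[i] = 2 * skipPerLevel[i - 1]; // 64, 128, 256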
1446#endif // KMP_AFFINITY_H