LLVM OpenMP 19.0.0git
kmp_affinity.h
1/*
2 * kmp_affinity.h -- header for affinity management
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef KMP_AFFINITY_H
14#define KMP_AFFINITY_H
15
16#include "kmp.h"
17#include "kmp_os.h"
18#include <limits>
19
20#if KMP_AFFINITY_SUPPORTED
21#if KMP_USE_HWLOC
22class KMPHwlocAffinity : public KMPAffinity {
23public:
24 class Mask : public KMPAffinity::Mask {
25 hwloc_cpuset_t mask;
26
27 public:
28 Mask() {
29 mask = hwloc_bitmap_alloc();
30 this->zero();
31 }
32 ~Mask() { hwloc_bitmap_free(mask); }
33 void set(int i) override { hwloc_bitmap_set(mask, i); }
34 bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
35 void clear(int i) override { hwloc_bitmap_clr(mask, i); }
36 void zero() override { hwloc_bitmap_zero(mask); }
37 bool empty() const override { return hwloc_bitmap_iszero(mask); }
38 void copy(const KMPAffinity::Mask *src) override {
39 const Mask *convert = static_cast<const Mask *>(src);
40 hwloc_bitmap_copy(mask, convert->mask);
41 }
42 void bitwise_and(const KMPAffinity::Mask *rhs) override {
43 const Mask *convert = static_cast<const Mask *>(rhs);
44 hwloc_bitmap_and(mask, mask, convert->mask);
45 }
46 void bitwise_or(const KMPAffinity::Mask *rhs) override {
47 const Mask *convert = static_cast<const Mask *>(rhs);
48 hwloc_bitmap_or(mask, mask, convert->mask);
49 }
50 void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
51 bool is_equal(const KMPAffinity::Mask *rhs) const override {
52 const Mask *convert = static_cast<const Mask *>(rhs);
53 return hwloc_bitmap_isequal(mask, convert->mask);
54 }
55 int begin() const override { return hwloc_bitmap_first(mask); }
56 int end() const override { return -1; }
57 int next(int previous) const override {
58 return hwloc_bitmap_next(mask, previous);
59 }
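 // Illustrative usage sketch (added for exposition): begin()/end()/next()
 // are intended to drive a forward loop over the set bits, e.g.
 //   for (int i = m->begin(); i != m->end(); i = m->next(i)) { /* use i */ }
 // end() returns -1, which matches what hwloc_bitmap_next() yields once no
 // further bit is set, so the loop terminates naturally.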
60 int get_system_affinity(bool abort_on_error) override {
61 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
62 "Illegal get affinity operation when not capable");
63 long retval =
64 hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
65 if (retval >= 0) {
66 return 0;
67 }
68 int error = errno;
69 if (abort_on_error) {
70 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
71 KMP_ERR(error), __kmp_msg_null);
72 }
73 return error;
74 }
75 int set_system_affinity(bool abort_on_error) const override {
76 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
77 "Illegal set affinity operation when not capable");
78 long retval =
79 hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
80 if (retval >= 0) {
81 return 0;
82 }
83 int error = errno;
84 if (abort_on_error) {
85 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
86 KMP_ERR(error), __kmp_msg_null);
87 }
88 return error;
89 }
90#if KMP_OS_WINDOWS
91 int set_process_affinity(bool abort_on_error) const override {
92 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
93 "Illegal set process affinity operation when not capable");
94 int error = 0;
95 const hwloc_topology_support *support =
96 hwloc_topology_get_support(__kmp_hwloc_topology);
97 if (support->cpubind->set_proc_cpubind) {
98 int retval;
99 retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
100 HWLOC_CPUBIND_PROCESS);
101 if (retval >= 0)
102 return 0;
103 error = errno;
104 if (abort_on_error)
105 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
106 KMP_ERR(error), __kmp_msg_null);
107 }
108 return error;
109 }
110#endif
111 int get_proc_group() const override {
112 int group = -1;
113#if KMP_OS_WINDOWS
114 if (__kmp_num_proc_groups == 1) {
115 return 1;
116 }
117 for (int i = 0; i < __kmp_num_proc_groups; i++) {
118 // On Windows, the long type is always 32 bits
119 unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
120 unsigned long second_32_bits =
121 hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
122 if (first_32_bits == 0 && second_32_bits == 0) {
123 continue;
124 }
125 if (group >= 0) {
126 return -1;
127 }
128 group = i;
129 }
130#endif /* KMP_OS_WINDOWS */
131 return group;
132 }
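 // Worked example (added for exposition): a Windows processor group covers
 // up to 64 logical CPUs, and the bitmap is read here in 32-bit ulongs, so
 // group i maps to ulongs 2*i and 2*i+1. A mask with only CPU 70 set is
 // nonzero in ulong 2 (bits 64-95), so the loop reports group 1; a mask that
 // spans two groups makes the function return -1.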
133 };
134 void determine_capable(const char *var) override {
135 const hwloc_topology_support *topology_support;
136 if (__kmp_hwloc_topology == NULL) {
137 if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
138 __kmp_hwloc_error = TRUE;
139 if (__kmp_affinity.flags.verbose) {
140 KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
141 }
142 }
143 if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
144 __kmp_hwloc_error = TRUE;
145 if (__kmp_affinity.flags.verbose) {
146 KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
147 }
148 }
149 }
150 topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
151 // Is the system capable of setting/getting this thread's affinity?
152 // Also, is topology discovery possible? (pu indicates ability to discover
153 // processing units). And finally, were there no errors when calling any
154 // hwloc_* API functions?
155 if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
156 topology_support->cpubind->get_thisthread_cpubind &&
157 topology_support->discovery->pu && !__kmp_hwloc_error) {
158 // enables affinity according to KMP_AFFINITY_CAPABLE() macro
159 KMP_AFFINITY_ENABLE(TRUE);
160 } else {
161 // indicate that hwloc didn't work and disable affinity
162 __kmp_hwloc_error = TRUE;
163 KMP_AFFINITY_DISABLE();
164 }
165 }
166 void bind_thread(int which) override {
167 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
168 "Illegal set affinity operation when not capable");
169 KMPAffinity::Mask *mask;
170 KMP_CPU_ALLOC_ON_STACK(mask);
171 KMP_CPU_ZERO(mask);
172 KMP_CPU_SET(which, mask);
173 __kmp_set_system_affinity(mask, TRUE);
174 KMP_CPU_FREE_FROM_STACK(mask);
175 }
176 KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
177 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
178 KMPAffinity::Mask *allocate_mask_array(int num) override {
179 return new Mask[num];
180 }
181 void deallocate_mask_array(KMPAffinity::Mask *array) override {
182 Mask *hwloc_array = static_cast<Mask *>(array);
183 delete[] hwloc_array;
184 }
185 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
186 int index) override {
187 Mask *hwloc_array = static_cast<Mask *>(array);
188 return &(hwloc_array[index]);
189 }
190 api_type get_api_type() const override { return HWLOC; }
191};
192#endif /* KMP_USE_HWLOC */
193
194#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
195 KMP_OS_AIX
196#if KMP_OS_LINUX
197/* On some of the older OSes that we build on, these constants aren't present
198 in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
199 all systems of the same arch where they are defined, and they cannot
200 change; they are set in stone forever. */
201#include <sys/syscall.h>
202#if KMP_ARCH_X86 || KMP_ARCH_ARM
203#ifndef __NR_sched_setaffinity
204#define __NR_sched_setaffinity 241
205#elif __NR_sched_setaffinity != 241
206#error Wrong code for setaffinity system call.
207#endif /* __NR_sched_setaffinity */
208#ifndef __NR_sched_getaffinity
209#define __NR_sched_getaffinity 242
210#elif __NR_sched_getaffinity != 242
211#error Wrong code for getaffinity system call.
212#endif /* __NR_sched_getaffinity */
213#elif KMP_ARCH_AARCH64
214#ifndef __NR_sched_setaffinity
215#define __NR_sched_setaffinity 122
216#elif __NR_sched_setaffinity != 122
217#error Wrong code for setaffinity system call.
218#endif /* __NR_sched_setaffinity */
219#ifndef __NR_sched_getaffinity
220#define __NR_sched_getaffinity 123
221#elif __NR_sched_getaffinity != 123
222#error Wrong code for getaffinity system call.
223#endif /* __NR_sched_getaffinity */
224#elif KMP_ARCH_X86_64
225#ifndef __NR_sched_setaffinity
226#define __NR_sched_setaffinity 203
227#elif __NR_sched_setaffinity != 203
228#error Wrong code for setaffinity system call.
229#endif /* __NR_sched_setaffinity */
230#ifndef __NR_sched_getaffinity
231#define __NR_sched_getaffinity 204
232#elif __NR_sched_getaffinity != 204
233#error Wrong code for getaffinity system call.
234#endif /* __NR_sched_getaffinity */
235#elif KMP_ARCH_PPC64
236#ifndef __NR_sched_setaffinity
237#define __NR_sched_setaffinity 222
238#elif __NR_sched_setaffinity != 222
239#error Wrong code for setaffinity system call.
240#endif /* __NR_sched_setaffinity */
241#ifndef __NR_sched_getaffinity
242#define __NR_sched_getaffinity 223
243#elif __NR_sched_getaffinity != 223
244#error Wrong code for getaffinity system call.
245#endif /* __NR_sched_getaffinity */
246#elif KMP_ARCH_MIPS
247#ifndef __NR_sched_setaffinity
248#define __NR_sched_setaffinity 4239
249#elif __NR_sched_setaffinity != 4239
250#error Wrong code for setaffinity system call.
251#endif /* __NR_sched_setaffinity */
252#ifndef __NR_sched_getaffinity
253#define __NR_sched_getaffinity 4240
254#elif __NR_sched_getaffinity != 4240
255#error Wrong code for getaffinity system call.
256#endif /* __NR_sched_getaffinity */
257#elif KMP_ARCH_MIPS64
258#ifndef __NR_sched_setaffinity
259#define __NR_sched_setaffinity 5195
260#elif __NR_sched_setaffinity != 5195
261#error Wrong code for setaffinity system call.
262#endif /* __NR_sched_setaffinity */
263#ifndef __NR_sched_getaffinity
264#define __NR_sched_getaffinity 5196
265#elif __NR_sched_getaffinity != 5196
266#error Wrong code for getaffinity system call.
267#endif /* __NR_sched_getaffinity */
268#elif KMP_ARCH_LOONGARCH64
269#ifndef __NR_sched_setaffinity
270#define __NR_sched_setaffinity 122
271#elif __NR_sched_setaffinity != 122
272#error Wrong code for setaffinity system call.
273#endif /* __NR_sched_setaffinity */
274#ifndef __NR_sched_getaffinity
275#define __NR_sched_getaffinity 123
276#elif __NR_sched_getaffinity != 123
277#error Wrong code for getaffinity system call.
278#endif /* __NR_sched_getaffinity */
279#elif KMP_ARCH_RISCV64
280#ifndef __NR_sched_setaffinity
281#define __NR_sched_setaffinity 122
282#elif __NR_sched_setaffinity != 122
283#error Wrong code for setaffinity system call.
284#endif /* __NR_sched_setaffinity */
285#ifndef __NR_sched_getaffinity
286#define __NR_sched_getaffinity 123
287#elif __NR_sched_getaffinity != 123
288#error Wrong code for getaffinity system call.
289#endif /* __NR_sched_getaffinity */
290#elif KMP_ARCH_VE
291#ifndef __NR_sched_setaffinity
292#define __NR_sched_setaffinity 203
293#elif __NR_sched_setaffinity != 203
294#error Wrong code for setaffinity system call.
295#endif /* __NR_sched_setaffinity */
296#ifndef __NR_sched_getaffinity
297#define __NR_sched_getaffinity 204
298#elif __NR_sched_getaffinity != 204
299#error Wrong code for getaffinity system call.
300#endif /* __NR_sched_getaffinity */
301#elif KMP_ARCH_S390X
302#ifndef __NR_sched_setaffinity
303#define __NR_sched_setaffinity 239
304#elif __NR_sched_setaffinity != 239
305#error Wrong code for setaffinity system call.
306#endif /* __NR_sched_setaffinity */
307#ifndef __NR_sched_getaffinity
308#define __NR_sched_getaffinity 240
309#elif __NR_sched_getaffinity != 240
310#error Wrong code for getaffinity system call.
311#endif /* __NR_sched_getaffinity */
312#else
313#error Unknown or unsupported architecture
314#endif /* KMP_ARCH_* */
315#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
316#include <pthread.h>
317#include <pthread_np.h>
318#elif KMP_OS_NETBSD
319#include <pthread.h>
320#include <sched.h>
321#elif KMP_OS_AIX
322#include <sys/dr.h>
323#include <sys/rset.h>
324#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
325#define GET_NUMBER_SMT_SETS 0x0004
326extern "C" int syssmt(int flags, int, int, int *);
327#endif
328class KMPNativeAffinity : public KMPAffinity {
329 class Mask : public KMPAffinity::Mask {
330 typedef unsigned long mask_t;
331 typedef decltype(__kmp_affin_mask_size) mask_size_type;
332 static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
333 static const mask_t ONE = 1;
334 mask_size_type get_num_mask_types() const {
335 return __kmp_affin_mask_size / sizeof(mask_t);
336 }
337
338 public:
339 mask_t *mask;
340 Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
341 ~Mask() {
342 if (mask)
343 __kmp_free(mask);
344 }
345 void set(int i) override {
346 mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
347 }
348 bool is_set(int i) const override {
349 return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
350 }
351 void clear(int i) override {
352 mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
353 }
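 // Worked example (added for exposition): with a 64-bit unsigned long
 // (e.g. LP64 Linux), BITS_PER_MASK_T is 64, so set(70) touches
 // mask[70 / 64] == mask[1], bit 70 % 64 == 6. ONE is declared as mask_t so
 // the shift is performed at full mask width rather than as an int.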
354 void zero() override {
355 mask_size_type e = get_num_mask_types();
356 for (mask_size_type i = 0; i < e; ++i)
357 mask[i] = (mask_t)0;
358 }
359 bool empty() const override {
360 mask_size_type e = get_num_mask_types();
361 for (mask_size_type i = 0; i < e; ++i)
362 if (mask[i] != (mask_t)0)
363 return false;
364 return true;
365 }
366 void copy(const KMPAffinity::Mask *src) override {
367 const Mask *convert = static_cast<const Mask *>(src);
368 mask_size_type e = get_num_mask_types();
369 for (mask_size_type i = 0; i < e; ++i)
370 mask[i] = convert->mask[i];
371 }
372 void bitwise_and(const KMPAffinity::Mask *rhs) override {
373 const Mask *convert = static_cast<const Mask *>(rhs);
374 mask_size_type e = get_num_mask_types();
375 for (mask_size_type i = 0; i < e; ++i)
376 mask[i] &= convert->mask[i];
377 }
378 void bitwise_or(const KMPAffinity::Mask *rhs) override {
379 const Mask *convert = static_cast<const Mask *>(rhs);
380 mask_size_type e = get_num_mask_types();
381 for (mask_size_type i = 0; i < e; ++i)
382 mask[i] |= convert->mask[i];
383 }
384 void bitwise_not() override {
385 mask_size_type e = get_num_mask_types();
386 for (mask_size_type i = 0; i < e; ++i)
387 mask[i] = ~(mask[i]);
388 }
389 bool is_equal(const KMPAffinity::Mask *rhs) const override {
390 const Mask *convert = static_cast<const Mask *>(rhs);
391 mask_size_type e = get_num_mask_types();
392 for (mask_size_type i = 0; i < e; ++i)
393 if (mask[i] != convert->mask[i])
394 return false;
395 return true;
396 }
397 int begin() const override {
398 int retval = 0;
399 while (retval < end() && !is_set(retval))
400 ++retval;
401 return retval;
402 }
403 int end() const override {
404 int e;
405 __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
406 return e;
407 }
408 int next(int previous) const override {
409 int retval = previous + 1;
410 while (retval < end() && !is_set(retval))
411 ++retval;
412 return retval;
413 }
414#if KMP_OS_AIX
415 // On AIX, we don't have a way to get CPU(s) a thread is bound to.
416 // This routine is only used to get the full mask.
417 int get_system_affinity(bool abort_on_error) override {
418 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
419 "Illegal get affinity operation when not capable");
420
421 (void)abort_on_error;
422
423 // Set the mask with all CPUs that are available.
424 for (int i = 0; i < __kmp_xproc; ++i)
425 KMP_CPU_SET(i, this);
426 return 0;
427 }
428 int set_system_affinity(bool abort_on_error) const override {
429 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
430
431 "Illegal set affinity operation when not capable");
432
433 int location;
434 int gtid = __kmp_entry_gtid();
435 int tid = thread_self();
436
437 // Unbind the thread if it was bound to any processors before so that
438 // we can bind the thread to CPUs specified by the mask, not others.
439 int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);
440
441 // On AIX, we can only bind to one instead of a set of CPUs with the
442 // bindprocessor() system call.
443 KMP_CPU_SET_ITERATE(location, this) {
444 if (KMP_CPU_ISSET(location, this)) {
445 retval = bindprocessor(BINDTHREAD, tid, location);
446 if (retval == -1 && errno == 1) {
447 rsid_t rsid;
448 rsethandle_t rsh;
449 // Put something in rsh to prevent compiler warning
450 // about uninitialized use
451 rsh = rs_alloc(RS_EMPTY);
452 rsid.at_pid = getpid();
453 if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
454 retval = ra_detachrset(R_PROCESS, rsid, 0);
455 retval = bindprocessor(BINDTHREAD, tid, location);
456 }
457 }
458 if (retval == 0) {
459 KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
460 "T#%d to cpu=%d.\n",
461 gtid, location));
462 continue;
463 }
464 int error = errno;
465 if (abort_on_error) {
466 __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
467 KMP_ERR(error), __kmp_msg_null);
468 KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
469 "T#%d to cpu=%d, errno=%d.\n",
470 gtid, location, error));
471 return error;
472 }
473 }
474 }
475 return 0;
476 }
477#else // !KMP_OS_AIX
478 int get_system_affinity(bool abort_on_error) override {
479 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
480 "Illegal get affinity operation when not capable");
481#if KMP_OS_LINUX
482 long retval =
483 syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
484#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
485 int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
486 reinterpret_cast<cpuset_t *>(mask));
487 int retval = (r == 0 ? 0 : -1);
488#endif
489 if (retval >= 0) {
490 return 0;
491 }
492 int error = errno;
493 if (abort_on_error) {
494 __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
495 KMP_ERR(error), __kmp_msg_null);
496 }
497 return error;
498 }
499 int set_system_affinity(bool abort_on_error) const override {
500 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
501 "Illegal set affinity operation when not capable");
502#if KMP_OS_LINUX
503 long retval =
504 syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
505#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
506 int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
507 reinterpret_cast<cpuset_t *>(mask));
508 int retval = (r == 0 ? 0 : -1);
509#endif
510 if (retval >= 0) {
511 return 0;
512 }
513 int error = errno;
514 if (abort_on_error) {
515 __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
516 KMP_ERR(error), __kmp_msg_null);
517 }
518 return error;
519 }
520#endif // KMP_OS_AIX
521 };
522 void determine_capable(const char *env_var) override {
523 __kmp_affinity_determine_capable(env_var);
524 }
525 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
526 KMPAffinity::Mask *allocate_mask() override {
527 KMPNativeAffinity::Mask *retval = new Mask();
528 return retval;
529 }
530 void deallocate_mask(KMPAffinity::Mask *m) override {
531 KMPNativeAffinity::Mask *native_mask =
532 static_cast<KMPNativeAffinity::Mask *>(m);
533 delete native_mask;
534 }
535 KMPAffinity::Mask *allocate_mask_array(int num) override {
536 return new Mask[num];
537 }
538 void deallocate_mask_array(KMPAffinity::Mask *array) override {
539 Mask *linux_array = static_cast<Mask *>(array);
540 delete[] linux_array;
541 }
542 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
543 int index) override {
544 Mask *linux_array = static_cast<Mask *>(array);
545 return &(linux_array[index]);
546 }
547 api_type get_api_type() const override { return NATIVE_OS; }
548};
549#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
550 || KMP_OS_AIX */
551
552#if KMP_OS_WINDOWS
553class KMPNativeAffinity : public KMPAffinity {
554 class Mask : public KMPAffinity::Mask {
555 typedef ULONG_PTR mask_t;
556 static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
557 mask_t *mask;
558
559 public:
560 Mask() {
561 mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
562 }
563 ~Mask() {
564 if (mask)
565 __kmp_free(mask);
566 }
567 void set(int i) override {
568 mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
569 }
570 bool is_set(int i) const override {
571 return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
572 }
573 void clear(int i) override {
574 mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
575 }
576 void zero() override {
577 for (int i = 0; i < __kmp_num_proc_groups; ++i)
578 mask[i] = 0;
579 }
580 bool empty() const override {
581 for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
582 if (mask[i])
583 return false;
584 return true;
585 }
586 void copy(const KMPAffinity::Mask *src) override {
587 const Mask *convert = static_cast<const Mask *>(src);
588 for (int i = 0; i < __kmp_num_proc_groups; ++i)
589 mask[i] = convert->mask[i];
590 }
591 void bitwise_and(const KMPAffinity::Mask *rhs) override {
592 const Mask *convert = static_cast<const Mask *>(rhs);
593 for (int i = 0; i < __kmp_num_proc_groups; ++i)
594 mask[i] &= convert->mask[i];
595 }
596 void bitwise_or(const KMPAffinity::Mask *rhs) override {
597 const Mask *convert = static_cast<const Mask *>(rhs);
598 for (int i = 0; i < __kmp_num_proc_groups; ++i)
599 mask[i] |= convert->mask[i];
600 }
601 void bitwise_not() override {
602 for (int i = 0; i < __kmp_num_proc_groups; ++i)
603 mask[i] = ~(mask[i]);
604 }
605 bool is_equal(const KMPAffinity::Mask *rhs) const override {
606 const Mask *convert = static_cast<const Mask *>(rhs);
607 for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
608 if (mask[i] != convert->mask[i])
609 return false;
610 return true;
611 }
612 int begin() const override {
613 int retval = 0;
614 while (retval < end() && !is_set(retval))
615 ++retval;
616 return retval;
617 }
618 int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
619 int next(int previous) const override {
620 int retval = previous + 1;
621 while (retval < end() && !is_set(retval))
622 ++retval;
623 return retval;
624 }
625 int set_process_affinity(bool abort_on_error) const override {
626 if (__kmp_num_proc_groups <= 1) {
627 if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
628 DWORD error = GetLastError();
629 if (abort_on_error) {
630 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
631 __kmp_msg_null);
632 }
633 return error;
634 }
635 }
636 return 0;
637 }
638 int set_system_affinity(bool abort_on_error) const override {
639 if (__kmp_num_proc_groups > 1) {
640 // Check for a valid mask.
641 GROUP_AFFINITY ga;
642 int group = get_proc_group();
643 if (group < 0) {
644 if (abort_on_error) {
645 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
646 }
647 return -1;
648 }
649 // Transform the bit vector into a GROUP_AFFINITY struct
650 // and make the system call to set affinity.
651 ga.Group = group;
652 ga.Mask = mask[group];
653 ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
654
655 KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
656 if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
657 DWORD error = GetLastError();
658 if (abort_on_error) {
659 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
660 __kmp_msg_null);
661 }
662 return error;
663 }
664 } else {
665 if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
666 DWORD error = GetLastError();
667 if (abort_on_error) {
668 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
669 __kmp_msg_null);
670 }
671 return error;
672 }
673 }
674 return 0;
675 }
676 int get_system_affinity(bool abort_on_error) override {
677 if (__kmp_num_proc_groups > 1) {
678 this->zero();
679 GROUP_AFFINITY ga;
680 KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
681 if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
682 DWORD error = GetLastError();
683 if (abort_on_error) {
684 __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
685 KMP_ERR(error), __kmp_msg_null);
686 }
687 return error;
688 }
689 if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
690 (ga.Mask == 0)) {
691 return -1;
692 }
693 mask[ga.Group] = ga.Mask;
694 } else {
695 mask_t newMask, sysMask, retval;
696 if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
697 DWORD error = GetLastError();
698 if (abort_on_error) {
699 __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
700 KMP_ERR(error), __kmp_msg_null);
701 }
702 return error;
703 }
704 retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
705 if (!retval) {
706 DWORD error = GetLastError();
707 if (abort_on_error) {
708 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
709 KMP_ERR(error), __kmp_msg_null);
710 }
711 return error;
712 }
713 newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
714 if (!newMask) {
715 DWORD error = GetLastError();
716 if (abort_on_error) {
717 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
718 KMP_ERR(error), __kmp_msg_null);
719 }
720 }
721 *mask = retval;
722 }
723 return 0;
724 }
725 int get_proc_group() const override {
726 int group = -1;
727 if (__kmp_num_proc_groups == 1) {
728 return 1;
729 }
730 for (int i = 0; i < __kmp_num_proc_groups; i++) {
731 if (mask[i] == 0)
732 continue;
733 if (group >= 0)
734 return -1;
735 group = i;
736 }
737 return group;
738 }
739 };
740 void determine_capable(const char *env_var) override {
741 __kmp_affinity_determine_capable(env_var);
742 }
743 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
744 KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
745 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
746 KMPAffinity::Mask *allocate_mask_array(int num) override {
747 return new Mask[num];
748 }
749 void deallocate_mask_array(KMPAffinity::Mask *array) override {
750 Mask *windows_array = static_cast<Mask *>(array);
751 delete[] windows_array;
752 }
753 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
754 int index) override {
755 Mask *windows_array = static_cast<Mask *>(array);
756 return &(windows_array[index]);
757 }
758 api_type get_api_type() const override { return NATIVE_OS; }
759};
760#endif /* KMP_OS_WINDOWS */
761#endif /* KMP_AFFINITY_SUPPORTED */
762
763// Describe an attribute for a level in the machine topology
764struct kmp_hw_attr_t {
765 int core_type : 8;
766 int core_eff : 8;
767 unsigned valid : 1;
768 unsigned reserved : 15;
769
770 static const int UNKNOWN_CORE_EFF = -1;
771
772 kmp_hw_attr_t()
773 : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
774 valid(0), reserved(0) {}
775 void set_core_type(kmp_hw_core_type_t type) {
776 valid = 1;
777 core_type = type;
778 }
779 void set_core_eff(int eff) {
780 valid = 1;
781 core_eff = eff;
782 }
783 kmp_hw_core_type_t get_core_type() const {
784 return (kmp_hw_core_type_t)core_type;
785 }
786 int get_core_eff() const { return core_eff; }
787 bool is_core_type_valid() const {
788 return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
789 }
790 bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
791 operator bool() const { return valid; }
792 void clear() {
793 core_type = KMP_HW_CORE_TYPE_UNKNOWN;
794 core_eff = UNKNOWN_CORE_EFF;
795 valid = 0;
796 }
797 bool contains(const kmp_hw_attr_t &other) const {
798 if (!valid && !other.valid)
799 return true;
800 if (valid && other.valid) {
801 if (other.is_core_type_valid()) {
802 if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
803 return false;
804 }
805 if (other.is_core_eff_valid()) {
806 if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
807 return false;
808 }
809 return true;
810 }
811 return false;
812 }
813#if KMP_AFFINITY_SUPPORTED
814 bool contains(const kmp_affinity_attrs_t &attr) const {
815 if (!valid && !attr.valid)
816 return true;
817 if (valid && attr.valid) {
818 if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
819 return (is_core_type_valid() &&
820 (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
821 if (attr.core_eff != UNKNOWN_CORE_EFF)
822 return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
823 return true;
824 }
825 return false;
826 }
827#endif // KMP_AFFINITY_SUPPORTED
828 bool operator==(const kmp_hw_attr_t &rhs) const {
829 return (rhs.valid == valid && rhs.core_eff == core_eff &&
830 rhs.core_type == core_type);
831 }
832 bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
833};
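// Illustrative sketch of contains() semantics (added for exposition; the
// enumerator names assume the x86 core types defined in kmp.h):
//   kmp_hw_attr_t a, b;
//   a.set_core_type(KMP_HW_CORE_TYPE_ATOM);
//   a.set_core_eff(0);
//   b.set_core_type(KMP_HW_CORE_TYPE_ATOM);
//   // a.contains(b): true  - every field b specifies is matched by a
//   // b.contains(a): false - b does not specify a core efficiency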
834
835#if KMP_AFFINITY_SUPPORTED
836KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
837#endif
838
839class kmp_hw_thread_t {
840public:
841 static const int UNKNOWN_ID = -1;
842 static const int MULTIPLE_ID = -2;
843 static int compare_ids(const void *a, const void *b);
844 static int compare_compact(const void *a, const void *b);
845 int ids[KMP_HW_LAST];
846 int sub_ids[KMP_HW_LAST];
847 bool leader;
848 int os_id;
849 kmp_hw_attr_t attrs;
850
851 void print() const;
852 void clear() {
853 for (int i = 0; i < (int)KMP_HW_LAST; ++i)
854 ids[i] = UNKNOWN_ID;
855 leader = false;
856 attrs.clear();
857 }
858};
859
860class kmp_topology_t {
861
862 struct flags_t {
863 int uniform : 1;
864 int reserved : 31;
865 };
866
867 int depth;
868
869 // The following arrays are all 'depth' long and have been
870 // allocated to hold up to KMP_HW_LAST number of objects if
871 // needed, so layers can be added without reallocating any array
872
873 // Ordered array of the types in the topology
874 kmp_hw_t *types;
875
876 // Keep quick topology ratios, for non-uniform topologies,
877 // this ratio holds the max number of itemAs per itemB
878 // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
879 int *ratio;
880
881 // Storage containing the absolute number of each topology layer
882 int *count;
883
884 // The number of core efficiencies. This is only useful for hybrid
885 // topologies. Core efficiencies will range from 0 to num efficiencies - 1
886 int num_core_efficiencies;
887 int num_core_types;
888 kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
889
890 // The hardware threads array
891 // hw_threads is num_hw_threads long
892 // Each hw_thread's ids and sub_ids are depth deep
893 int num_hw_threads;
894 kmp_hw_thread_t *hw_threads;
895
896 // Equivalence hash where the key is the hardware topology item
897 // and the value is the equivalent hardware topology type in the
898 // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
899 // known equivalence for the topology type
900 kmp_hw_t equivalent[KMP_HW_LAST];
901
902 // Flags describing the topology
903 flags_t flags;
904
905 // Compact value used during sort_compact()
906 int compact;
907
908 // Insert a new topology layer after allocation
909 void _insert_layer(kmp_hw_t type, const int *ids);
910
911#if KMP_GROUP_AFFINITY
912 // Insert topology information about Windows Processor groups
913 void _insert_windows_proc_groups();
914#endif
915
916 // Count each item & get the num x's per y
917 // e.g., get the number of cores and the number of threads per core
918 // for each (x, y) in (KMP_HW_* , KMP_HW_*)
919 void _gather_enumeration_information();
920
921 // Remove layers that don't add information to the topology.
922 // This is done by having the layer take on the id = UNKNOWN_ID (-1)
923 void _remove_radix1_layers();
924
925 // Find out if the topology is uniform
926 void _discover_uniformity();
927
928 // Set all the sub_ids for each hardware thread
929 void _set_sub_ids();
930
931 // Set global affinity variables describing the number of threads per
932 // core, the number of packages, the number of cores per package, and
933 // the number of cores.
934 void _set_globals();
935
936 // Set the last level cache equivalent type
937 void _set_last_level_cache();
938
939 // Return the number of cores with a particular attribute, 'attr'.
940 // If 'find_all' is true, then find all cores on the machine, otherwise find
941 // all cores per the layer 'above'
942 int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
943 bool find_all = false) const;
944
945public:
946 // Force use of allocate()/deallocate()
947 kmp_topology_t() = delete;
948 kmp_topology_t(const kmp_topology_t &t) = delete;
949 kmp_topology_t(kmp_topology_t &&t) = delete;
950 kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
951 kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
952
953 static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
954 static void deallocate(kmp_topology_t *);
955
956 // Functions used in create_map() routines
957 kmp_hw_thread_t &at(int index) {
958 KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
959 return hw_threads[index];
960 }
961 const kmp_hw_thread_t &at(int index) const {
962 KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
963 return hw_threads[index];
964 }
965 int get_num_hw_threads() const { return num_hw_threads; }
966 void sort_ids() {
967 qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
968 kmp_hw_thread_t::compare_ids);
969 }
970 // Check if the hardware ids are unique; if they are,
971 // return true, otherwise return false
972 bool check_ids() const;
973
974 // Function to call after the create_map() routine
975 void canonicalize();
976 void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
977
978// Functions used after canonicalize() called
979
980#if KMP_AFFINITY_SUPPORTED
981 // Set the granularity for affinity settings
982 void set_granularity(kmp_affinity_t &stgs) const;
983 bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
984 bool restrict_to_mask(const kmp_affin_mask_t *mask);
985 bool filter_hw_subset();
986#endif
987 bool is_uniform() const { return flags.uniform; }
988 // Tell whether a type is a valid type in the topology
989 // returns KMP_HW_UNKNOWN when there is no equivalent type
990 kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
991 if (type == KMP_HW_UNKNOWN)
992 return KMP_HW_UNKNOWN;
993 return equivalent[type];
994 }
995 // Set type1 = type2
996 void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
997 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
998 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
999 kmp_hw_t real_type2 = equivalent[type2];
1000 if (real_type2 == KMP_HW_UNKNOWN)
1001 real_type2 = type2;
1002 equivalent[type1] = real_type2;
1003 // This loop is required since any of the types may have been set to
1004 // be equivalent to type1. They all must be checked and reset to type2.
1005 KMP_FOREACH_HW_TYPE(type) {
1006 if (equivalent[type] == type1) {
1007 equivalent[type] = real_type2;
1008 }
1009 }
1010 }
1011 // Calculate number of types corresponding to level1
1012 // per types corresponding to level2 (e.g., number of threads per core)
1013 int calculate_ratio(int level1, int level2) const {
1014 KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
1015 KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
1016 int r = 1;
1017 for (int level = level1; level > level2; --level)
1018 r *= ratio[level];
1019 return r;
1020 }
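 // Worked example (added for exposition): for the topology described above,
 // [ 4 packages | 6 cores / package | 2 threads / core ], ratio is {4, 6, 2},
 // so calculate_ratio(/*level1=*/2, /*level2=*/0) returns 2 * 6 = 12, i.e.
 // the number of hardware threads per package.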
1021 int get_ratio(int level) const {
1022 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1023 return ratio[level];
1024 }
1025 int get_depth() const { return depth; }
1026 kmp_hw_t get_type(int level) const {
1027 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1028 return types[level];
1029 }
1030 int get_level(kmp_hw_t type) const {
1031 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
1032 int eq_type = equivalent[type];
1033 if (eq_type == KMP_HW_UNKNOWN)
1034 return -1;
1035 for (int i = 0; i < depth; ++i)
1036 if (types[i] == eq_type)
1037 return i;
1038 return -1;
1039 }
1040 int get_count(int level) const {
1041 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1042 return count[level];
1043 }
1044 // Return the total number of cores with attribute 'attr'
1045 int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
1046 return _get_ncores_with_attr(attr, -1, true);
1047 }
1048 // Return the number of cores with attribute
1049 // 'attr' per topology level 'above'
1050 int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
1051 return _get_ncores_with_attr(attr, above, false);
1052 }
1053
1054#if KMP_AFFINITY_SUPPORTED
1055 friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
1056 void sort_compact(kmp_affinity_t &affinity) {
1057 compact = affinity.compact;
1058 qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
1059 kmp_hw_thread_t::compare_compact);
1060 }
1061#endif
1062 void print(const char *env_var = "KMP_AFFINITY") const;
1063 void dump() const;
1064};
1065extern kmp_topology_t *__kmp_topology;
1066
1067class kmp_hw_subset_t {
1068 const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
1069
1070public:
1071 // Describe a machine topology item in KMP_HW_SUBSET
1072 struct item_t {
1073 kmp_hw_t type;
1074 int num_attrs;
1075 int num[MAX_ATTRS];
1076 int offset[MAX_ATTRS];
1077 kmp_hw_attr_t attr[MAX_ATTRS];
1078 };
1079 // Put parentheses around max to avoid accidental use of the Windows max macro.
1080 const static int USE_ALL = (std::numeric_limits<int>::max)();
1081
1082private:
1083 int depth;
1084 int capacity;
1085 item_t *items;
1086 kmp_uint64 set;
1087 bool absolute;
1088 // The set must be able to handle up to KMP_HW_LAST number of layers
1089 KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
1090 // Sorting the KMP_HW_SUBSET items to follow topology order
1091 // All unknown topology types will be at the beginning of the subset
1092 static int hw_subset_compare(const void *i1, const void *i2) {
1093 kmp_hw_t type1 = ((const item_t *)i1)->type;
1094 kmp_hw_t type2 = ((const item_t *)i2)->type;
1095 int level1 = __kmp_topology->get_level(type1);
1096 int level2 = __kmp_topology->get_level(type2);
1097 return level1 - level2;
1098 }
1099
1100public:
1101 // Force use of allocate()/deallocate()
1102 kmp_hw_subset_t() = delete;
1103 kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
1104 kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
1105 kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
1106 kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
1107
1108 static kmp_hw_subset_t *allocate() {
1109 int initial_capacity = 5;
1110 kmp_hw_subset_t *retval =
1111 (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
1112 retval->depth = 0;
1113 retval->capacity = initial_capacity;
1114 retval->set = 0ull;
1115 retval->absolute = false;
1116 retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
1117 return retval;
1118 }
1119 static void deallocate(kmp_hw_subset_t *subset) {
1120 __kmp_free(subset->items);
1121 __kmp_free(subset);
1122 }
1123 void set_absolute() { absolute = true; }
1124 bool is_absolute() const { return absolute; }
1125 void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
1126 for (int i = 0; i < depth; ++i) {
1127 // Found an existing item for this layer type
1128 // Add the num, offset, and attr to this item
1129 if (items[i].type == type) {
1130 int idx = items[i].num_attrs++;
1131 if ((size_t)idx >= MAX_ATTRS)
1132 return;
1133 items[i].num[idx] = num;
1134 items[i].offset[idx] = offset;
1135 items[i].attr[idx] = attr;
1136 return;
1137 }
1138 }
1139 if (depth == capacity - 1) {
1140 capacity *= 2;
1141 item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
1142 for (int i = 0; i < depth; ++i)
1143 new_items[i] = items[i];
1144 __kmp_free(items);
1145 items = new_items;
1146 }
1147 items[depth].num_attrs = 1;
1148 items[depth].type = type;
1149 items[depth].num[0] = num;
1150 items[depth].offset[0] = offset;
1151 items[depth].attr[0] = attr;
1152 depth++;
1153 set |= (1ull << type);
1154 }
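 // Illustrative note (added for exposition): if KMP_HW_SUBSET names the same
 // layer twice, e.g. a hypothetical "4c:intel_atom,2c:intel_core", the second
 // push_back() call finds the existing core item above and appends to its
 // num/offset/attr arrays instead of creating a new layer.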
1155 int get_depth() const { return depth; }
1156 const item_t &at(int index) const {
1157 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1158 return items[index];
1159 }
1160 item_t &at(int index) {
1161 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1162 return items[index];
1163 }
1164 void remove(int index) {
1165 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1166 set &= ~(1ull << items[index].type);
1167 for (int j = index + 1; j < depth; ++j) {
1168 items[j - 1] = items[j];
1169 }
1170 depth--;
1171 }
1172 void sort() {
1173 KMP_DEBUG_ASSERT(__kmp_topology);
1174 qsort(items, depth, sizeof(item_t), hw_subset_compare);
1175 }
1176 bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
1177
1178 // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
1179 // This means putting each of {sockets, cores, threads} in the topology if
1180 // they are not specified:
1181 // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
1182 // e.g., 3module => *s,3module,*c,*t
1183 // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
1184 // are expecting the traditional sockets/cores/threads topology. For newer
1185 // hardware, there can be intervening layers like dies/tiles/modules
1186 // (usually corresponding to a cache level). So when a user asks for
1187 // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
1188 // should get 12 hardware threads across 6 cores and effectively ignore the
1189 // module layer.
1190 void canonicalize(const kmp_topology_t *top) {
1191 // Layers to target for KMP_HW_SUBSET canonicalization
1192 kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1193
1194 // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
1195 if (is_absolute())
1196 return;
1197
1198 // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
1199 // topology doesn't have these layers
1200 for (kmp_hw_t type : targeted)
1201 if (top->get_level(type) == KMP_HW_UNKNOWN)
1202 return;
1203
1204 // Put targeted layers in topology if they do not exist
1205 for (kmp_hw_t type : targeted) {
1206 bool found = false;
1207 for (int i = 0; i < get_depth(); ++i) {
1208 if (top->get_equivalent_type(items[i].type) == type) {
1209 found = true;
1210 break;
1211 }
1212 }
1213 if (!found) {
1214 push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
1215 }
1216 }
1217 sort();
1218 // Set as an absolute topology that only targets the targeted layers
1219 set_absolute();
1220 }
1221 void dump() const {
1222 printf("**********************\n");
1223 printf("*** kmp_hw_subset: ***\n");
1224 printf("* depth: %d\n", depth);
1225 printf("* items:\n");
1226 for (int i = 0; i < depth; ++i) {
1227 printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1228 for (int j = 0; j < items[i].num_attrs; ++j) {
1229 printf(" num: %d, offset: %d, attr: ", items[i].num[j],
1230 items[i].offset[j]);
1231 if (!items[i].attr[j]) {
1232 printf(" (none)\n");
1233 } else {
1234 printf(
1235 " core_type = %s, core_eff = %d\n",
1236 __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1237 items[i].attr[j].get_core_eff());
1238 }
1239 }
1240 }
1241 printf("* set: 0x%llx\n", set);
1242 printf("* absolute: %d\n", absolute);
1243 printf("**********************\n");
1244 }
1245};
1246extern kmp_hw_subset_t *__kmp_hw_subset;
1247
1248/* A structure for holding machine-specific hierarchy info to be computed once
1249 at init. This structure represents a mapping of threads to the actual machine
1250 hierarchy, or to our best guess at what the hierarchy might be, for the
1251 purpose of performing an efficient barrier. In the worst case, when there is
1252 no machine hierarchy information, it produces a tree suitable for a barrier,
1253 similar to the tree used in the hyper barrier. */
1254class hierarchy_info {
1255public:
1256 /* Good default values for number of leaves and branching factor, given no
1257 affinity information. Behaves a bit like hyper barrier. */
1258 static const kmp_uint32 maxLeaves = 4;
1259 static const kmp_uint32 minBranch = 4;
1260 /** Number of levels in the hierarchy. Typical levels are threads/core,
1261 cores/package or socket, packages/node, nodes/machine, etc. We don't want
1262 to get specific with nomenclature. When the machine is oversubscribed we
1263 add levels to duplicate the hierarchy, doubling the thread capacity of the
1264 hierarchy each time we add a level. */
1265 kmp_uint32 maxLevels;
1266
1267 /** This is specifically the depth of the machine configuration hierarchy, in
1268 terms of the number of levels along the longest path from root to any
1269 leaf. It corresponds to the number of entries in numPerLevel if we exclude
1270 all but one trailing 1. */
1271 kmp_uint32 depth;
1272 kmp_uint32 base_num_threads;
1273 enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1274 volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1275 // 2=initialization in progress
1276 volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1277
1278 /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
1279 the parent of a node at level i has. For example, if we have a machine
1280 with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
1281 {2, 4, 4, 1, 1}. All empty levels are set to 1. */
1282 kmp_uint32 *numPerLevel;
1283 kmp_uint32 *skipPerLevel;
1284
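 // Worked example (added for exposition): for the 4-package, 4-core, 2-HT
 // machine in the comment above, deriveLevels() yields
 // numPerLevel = {2, 4, 4, 1, ...} and init() then builds
 // skipPerLevel = {1, 2, 8, 32, ...}, where skipPerLevel[i] is the number of
 // leaves (threads) spanned by one subtree rooted at level i; entries past
 // the real depth keep doubling to absorb oversubscription.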
1285 void deriveLevels() {
1286 int hier_depth = __kmp_topology->get_depth();
1287 for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1288 numPerLevel[level] = __kmp_topology->get_ratio(i);
1289 }
1290 }
1291
1292 hierarchy_info()
1293 : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1294
1295 void fini() {
1296 if (!uninitialized && numPerLevel) {
1297 __kmp_free(numPerLevel);
1298 numPerLevel = NULL;
1299 uninitialized = not_initialized;
1300 }
1301 }
1302
1303 void init(int num_addrs) {
1304 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1305 &uninitialized, not_initialized, initializing);
1306 if (bool_result == 0) { // Wait for initialization
1307 while (TCR_1(uninitialized) != initialized)
1308 KMP_CPU_PAUSE();
1309 return;
1310 }
1311 KMP_DEBUG_ASSERT(bool_result == 1);
1312
1313 /* Added explicit initialization of the data fields here to prevent usage of
1314 dirty value observed when static library is re-initialized multiple times
1315 (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
1316 OpenMP). */
1317 depth = 1;
1318 resizing = 0;
1319 maxLevels = 7;
1320 numPerLevel =
1321 (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1322 skipPerLevel = &(numPerLevel[maxLevels]);
1323 for (kmp_uint32 i = 0; i < maxLevels;
1324 ++i) { // init numPerLevel[*] to 1 item per level
1325 numPerLevel[i] = 1;
1326 skipPerLevel[i] = 1;
1327 }
1328
1329 // Sort table by physical ID
1330 if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1331 deriveLevels();
1332 } else {
1333 numPerLevel[0] = maxLeaves;
1334 numPerLevel[1] = num_addrs / maxLeaves;
1335 if (num_addrs % maxLeaves)
1336 numPerLevel[1]++;
1337 }
1338
1339 base_num_threads = num_addrs;
1340 for (int i = maxLevels - 1; i >= 0;
1341 --i) // count non-empty levels to get depth
1342 if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1343 depth++;
1344
1345 kmp_uint32 branch = minBranch;
1346 if (numPerLevel[0] == 1)
1347 branch = num_addrs / maxLeaves;
1348 if (branch < minBranch)
1349 branch = minBranch;
1350 for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1351 while (numPerLevel[d] > branch ||
1352 (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1353 if (numPerLevel[d] & 1)
1354 numPerLevel[d]++;
1355 numPerLevel[d] = numPerLevel[d] >> 1;
1356 if (numPerLevel[d + 1] == 1)
1357 depth++;
1358 numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1359 }
1360 if (numPerLevel[0] == 1) {
1361 branch = branch >> 1;
1362 if (branch < 4)
1363 branch = minBranch;
1364 }
1365 }
1366
1367 for (kmp_uint32 i = 1; i < depth; ++i)
1368 skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1369 // Fill in hierarchy in the case of oversubscription
1370 for (kmp_uint32 i = depth; i < maxLevels; ++i)
1371 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1372
1373 uninitialized = initialized; // One writer
1374 }
1375
1376 // Resize the hierarchy if nproc changes to something larger than before
1377 void resize(kmp_uint32 nproc) {
1378 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1379 while (bool_result == 0) { // someone else is trying to resize
1380 KMP_CPU_PAUSE();
1381 if (nproc <= base_num_threads) // happy with other thread's resize
1382 return;
1383 else // try to resize
1384 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1385 }
1386 KMP_DEBUG_ASSERT(bool_result != 0);
1387 if (nproc <= base_num_threads)
1388 return; // happy with other thread's resize
1389
1390 // Calculate new maxLevels
1391 kmp_uint32 old_sz = skipPerLevel[depth - 1];
1392 kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1393 // First see if old maxLevels is enough to contain new size
1394 for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1395 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1396 numPerLevel[i - 1] *= 2;
1397 old_sz *= 2;
1398 depth++;
1399 }
1400 if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1401 while (nproc > old_sz) {
1402 old_sz *= 2;
1403 incs++;
1404 depth++;
1405 }
1406 maxLevels += incs;
1407
1408 // Resize arrays
1409 kmp_uint32 *old_numPerLevel = numPerLevel;
1410 kmp_uint32 *old_skipPerLevel = skipPerLevel;
1411 numPerLevel = skipPerLevel = NULL;
1412 numPerLevel =
1413 (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1414 skipPerLevel = &(numPerLevel[maxLevels]);
1415
1416 // Copy old elements from old arrays
1417 for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1418 // init numPerLevel[*] to 1 item per level
1419 numPerLevel[i] = old_numPerLevel[i];
1420 skipPerLevel[i] = old_skipPerLevel[i];
1421 }
1422
1423 // Init new elements in arrays to 1
1424 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1425 // init numPerLevel[*] to 1 item per level
1426 numPerLevel[i] = 1;
1427 skipPerLevel[i] = 1;
1428 }
1429
1430 // Free old arrays
1431 __kmp_free(old_numPerLevel);
1432 }
1433
1434 // Fill in oversubscription levels of hierarchy
1435 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1436 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1437
1438 base_num_threads = nproc;
1439 resizing = 0; // One writer
1440 }
1441};
1442#endif // KMP_AFFINITY_H