LLVM OpenMP* Runtime Library
kmp_affinity.h
/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */
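
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the runtime): the hwloc calls wrapped by
// KMPHwlocAffinity above can be exercised standalone. This assumes hwloc is
// installed and the program is linked with -lhwloc; error handling is elided.
#if 0
#include <hwloc.h>
#include <stdio.h>

int main() {
  hwloc_topology_t topo;
  hwloc_topology_init(&topo);
  hwloc_topology_load(topo);

  // Build a single-CPU mask, as KMPHwlocAffinity::bind_thread() does.
  hwloc_cpuset_t set = hwloc_bitmap_alloc();
  hwloc_bitmap_zero(set);
  hwloc_bitmap_set(set, 0); // logical CPU 0

  // Bind the calling thread, then read the binding back.
  hwloc_set_cpubind(topo, set, HWLOC_CPUBIND_THREAD);
  hwloc_get_cpubind(topo, set, HWLOC_CPUBIND_THREAD);
  printf("bound to CPU %d\n", hwloc_bitmap_first(set));

  hwloc_bitmap_free(set);
  hwloc_topology_destroy(topo);
  return 0;
}
#endif
// ---------------------------------------------------------------------------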

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY ||    \
    KMP_OS_AIX
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change; they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_S390X
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 239
#elif __NR_sched_setaffinity != 239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 240
#elif __NR_sched_getaffinity != 240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
#include <pthread.h>
#include <pthread_np.h>
#elif KMP_OS_NETBSD
#include <pthread.h>
#include <sched.h>
#elif KMP_OS_AIX
#include <sys/dr.h>
#include <sys/rset.h>
#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
#define GET_NUMBER_SMT_SETS 0x0004
extern "C" int syssmt(int flags, int, int, int *);
#endif
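
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the runtime): on Linux the class below
// invokes sched_getaffinity/sched_setaffinity directly through syscall(2)
// using the __NR_* numbers validated above. A minimal standalone equivalent,
// assuming glibc headers:
#if 0
#include <sched.h> // cpu_set_t, CPU_* macros
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main() {
  cpu_set_t set;
  CPU_ZERO(&set);
  // pid 0 means "the calling thread", mirroring get_system_affinity() below.
  // On success the raw syscall returns the size of the kernel's mask.
  long rc = syscall(__NR_sched_getaffinity, 0, sizeof(set), &set);
  if (rc < 0) {
    perror("sched_getaffinity");
    return 1;
  }
  printf("CPU 0 in this thread's mask: %d\n", CPU_ISSET(0, &set));
  return 0;
}
#endif
// ---------------------------------------------------------------------------
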
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
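    // Worked example (illustrative only) of the word/bit arithmetic used by
    // set(), is_set() and clear() above: with mask_t = unsigned long on an
    // LP64 system, BITS_PER_MASK_T == 64, so logical CPU 70 lives in word
    // 70 / 64 == 1 at bit 70 % 64 == 6, and set(70) performs
    //   mask[1] |= (1UL << 6);
    // begin()/next() scan bit by bit until is_set() finds a set bit or end()
    // (number of words * BITS_PER_MASK_T) is reached.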
#if KMP_OS_AIX
    // On AIX, we don't have a way to get CPU(s) a thread is bound to.
    // This routine is only used to get the full mask.
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");

      (void)abort_on_error;

      // Set the mask with all CPUs that are available.
      for (int i = 0; i < __kmp_xproc; ++i)
        KMP_CPU_SET(i, this);
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");

      int location;
      int gtid = __kmp_entry_gtid();
      int tid = thread_self();

      // Unbind the thread if it was bound to any processors before so that
      // we can bind the thread to CPUs specified by the mask not others.
      int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);

      // On AIX, we can only bind to one instead of a set of CPUs with the
      // bindprocessor() system call.
      KMP_CPU_SET_ITERATE(location, this) {
        if (KMP_CPU_ISSET(location, this)) {
          retval = bindprocessor(BINDTHREAD, tid, location);
          if (retval == -1 && errno == 1) {
            rsid_t rsid;
            rsethandle_t rsh;
            // Put something in rsh to prevent compiler warning
            // about uninitialized use
            rsh = rs_alloc(RS_EMPTY);
            rsid.at_pid = getpid();
            if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
              retval = ra_detachrset(R_PROCESS, rsid, 0);
              retval = bindprocessor(BINDTHREAD, tid, location);
            }
          }
          if (retval == 0) {
            KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
                          "T#%d to cpu=%d.\n",
                          gtid, location));
            continue;
          }
          int error = errno;
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
                        KMP_ERR(error), __kmp_msg_null);
            KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
                          "T#%d to cpu=%d, errno=%d.\n",
                          gtid, location, error));
            return error;
          }
        }
      }
      return 0;
    }
#else // !KMP_OS_AIX
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif // KMP_OS_AIX
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
          || KMP_OS_AIX */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    bool empty() const override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i])
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        // Valid group indices are 0 .. __kmp_num_proc_groups - 1.
        if ((ga.Group < 0) || (ga.Group >= __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
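
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the runtime): on machines with more than
// 64 logical processors, Windows exposes them as processor groups, and a
// thread is bound through a GROUP_AFFINITY struct much as
// set_system_affinity() does above. A minimal standalone equivalent:
#if 0
#include <stdio.h>
#include <windows.h>

int main() {
  GROUP_AFFINITY ga = {0};
  ga.Group = 0;                // processor group 0
  ga.Mask = (KAFFINITY)1 << 3; // logical CPU 3 within that group
  if (!SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL)) {
    fprintf(stderr, "SetThreadGroupAffinity failed: %lu\n", GetLastError());
    return 1;
  }
  printf("bound to group %u, CPU 3\n", (unsigned)ga.Group);
  return 0;
}
#endif
// ---------------------------------------------------------------------------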
#endif /* KMP_AFFINITY_SUPPORTED */

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
#if KMP_AFFINITY_SUPPORTED
  bool contains(const kmp_affinity_attrs_t &attr) const {
    if (!valid && !attr.valid)
      return true;
    if (valid && attr.valid) {
      if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
        return (is_core_type_valid() &&
                (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
      if (attr.core_eff != UNKNOWN_CORE_EFF)
        return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
      return true;
    }
    return false;
  }
#endif // KMP_AFFINITY_SUPPORTED
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};
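
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the runtime): contains() treats a fully
// specified attribute as containing any attribute that pins down a subset of
// the same fields. Using the x86 hybrid core types as an example (these enum
// values exist only on x86 builds):
#if 0
void kmp_hw_attr_containment_example() {
  kmp_hw_attr_t a, b;
  a.set_core_type(KMP_HW_CORE_TYPE_CORE); // performance core
  a.set_core_eff(1);
  b.set_core_type(KMP_HW_CORE_TYPE_CORE); // core type only, no efficiency
  bool r1 = a.contains(b); // true: a matches everything b specifies
  bool r2 = b.contains(a); // false: b has no core_eff to match a's
  (void)r1;
  (void)r2;
}
#endif
// ---------------------------------------------------------------------------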

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif

class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long and have been
  // allocated to hold up to KMP_HW_LAST number of objects if
  // needed so layers can be added without reallocation of any array.

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios, for non-uniform topologies,
  // this ratio holds the max number of itemAs per itemB
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

  // Insert a new topology layer after allocation
  void _insert_layer(kmp_hw_t type, const int *ids);

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check if the hardware ids are unique, if they are
  // return true, otherwise return false
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

// Functions used after canonicalize() called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
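  // Worked example (illustrative only): for a topology of 4 packages,
  // 6 cores per package and 2 threads per core, ratio == {4, 6, 2} with
  // KMP_HW_SOCKET at level 0, KMP_HW_CORE at level 1 and KMP_HW_THREAD at
  // level 2, so calculate_ratio(2, 0) == ratio[2] * ratio[1] == 12, the
  // number of threads per package.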
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; }
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;

class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parentheses around max to avoid accidental use of the Windows max
  // macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sorting the KMP_HW_SUBSET items to follow topology order
  // All unknown topology types will be at the beginning of the subset
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type
      // Add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
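  // Illustrative sketch (not part of this class): the KMP_HW_SUBSET parser
  // elsewhere in the runtime feeds this structure, so a value such as
  // KMP_HW_SUBSET=2s,4c,2t corresponds roughly to
  //   subset->push_back(2, KMP_HW_SOCKET, 0, kmp_hw_attr_t{});
  //   subset->push_back(4, KMP_HW_CORE, 0, kmp_hw_attr_t{});
  //   subset->push_back(2, KMP_HW_THREAD, 0, kmp_hw_attr_t{});
  // with an offset such as 4c@2 arriving through the 'offset' parameter.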
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }

  // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
  // This means putting each of {sockets, cores, threads} in the topology if
  // they are not specified:
  // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
  // e.g., 3module => *s,3module,*c,*t
  // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
  // are expecting the traditional sockets/cores/threads topology. For newer
  // hardware, there can be intervening layers like dies/tiles/modules
  // (usually corresponding to a cache level). So when a user asks for
  // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
  // should get 12 hardware threads across 6 cores and effectively ignore the
  // module layer.
  void canonicalize(const kmp_topology_t *top) {
    // Layers to target for KMP_HW_SUBSET canonicalization
    kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

    // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
    if (is_absolute())
      return;

    // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
    // topology doesn't have these layers
    for (kmp_hw_t type : targeted)
      if (top->get_level(type) == KMP_HW_UNKNOWN)
        return;

    // Put targeted layers in topology if they do not exist
    for (kmp_hw_t type : targeted) {
      bool found = false;
      for (int i = 0; i < get_depth(); ++i) {
        if (top->get_equivalent_type(items[i].type) == type) {
          found = true;
          break;
        }
      }
      if (!found) {
        push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
      }
    }
    sort();
    // Set as an absolute topology that only targets the targeted layers
    set_absolute();
  }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf("  type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf("    num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual
   machine hierarchy, or to our best guess at what the hierarchy might be, for
   the purpose of performing an efficient barrier. In the worst case, when
   there is no machine hierarchy information, it produces a tree suitable for
   a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  // Upper bound on the number of levels the hierarchy may currently hold
  // (grown by resize()).
  kmp_uint32 maxLevels;

  // Depth of the hierarchy tree and the thread count it was built for.
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
  // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  // numPerLevel[i] is the branching factor at level i (level 0 is the
  // leaves); skipPerLevel[i] is the cumulative product of the levels below,
  // i.e. the number of hardware threads spanned by one node at level i.
  kmp_uint32 *numPerLevel;
  kmp_uint32 *skipPerLevel;

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Added explicit initialization of the data fields here to prevent usage
       of dirty value observed when static library is re-initialized multiple
       times (e.g. when non-OpenMP thread repeatedly launches/joins thread
       that uses OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Sort table by physical ID
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }
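  // Worked example (illustrative only): with no topology information and
  // num_addrs == 16, the fallback path above yields numPerLevel == {4, 4,
  // 1, ...} (16 leaves in groups of maxLeaves == 4); the final products give
  // skipPerLevel == {1, 4, 16, 32, 64, ...}, where entries past the real
  // depth keep doubling so the tree can absorb oversubscription.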

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H