LLVM OpenMP* Runtime Library
kmp_affinity.h
1/*
2 * kmp_affinity.h -- header for affinity management
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef KMP_AFFINITY_H
14#define KMP_AFFINITY_H
15
16#include "kmp.h"
17#include "kmp_os.h"
18#include <limits>
19
20#if KMP_AFFINITY_SUPPORTED
21#if KMP_USE_HWLOC
22class KMPHwlocAffinity : public KMPAffinity {
23public:
24 class Mask : public KMPAffinity::Mask {
25 hwloc_cpuset_t mask;
26
27 public:
28 Mask() {
29 mask = hwloc_bitmap_alloc();
30 this->zero();
31 }
32 ~Mask() { hwloc_bitmap_free(mask); }
33 void set(int i) override { hwloc_bitmap_set(mask, i); }
34 bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
35 void clear(int i) override { hwloc_bitmap_clr(mask, i); }
36 void zero() override { hwloc_bitmap_zero(mask); }
37 void copy(const KMPAffinity::Mask *src) override {
38 const Mask *convert = static_cast<const Mask *>(src);
39 hwloc_bitmap_copy(mask, convert->mask);
40 }
41 void bitwise_and(const KMPAffinity::Mask *rhs) override {
42 const Mask *convert = static_cast<const Mask *>(rhs);
43 hwloc_bitmap_and(mask, mask, convert->mask);
44 }
45 void bitwise_or(const KMPAffinity::Mask *rhs) override {
46 const Mask *convert = static_cast<const Mask *>(rhs);
47 hwloc_bitmap_or(mask, mask, convert->mask);
48 }
49 void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
50 int begin() const override { return hwloc_bitmap_first(mask); }
51 int end() const override { return -1; }
52 int next(int previous) const override {
53 return hwloc_bitmap_next(mask, previous);
54 }
55 int get_system_affinity(bool abort_on_error) override {
56 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
57 "Illegal get affinity operation when not capable");
58 long retval =
59 hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
60 if (retval >= 0) {
61 return 0;
62 }
63 int error = errno;
64 if (abort_on_error) {
65 __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
66 }
67 return error;
68 }
69 int set_system_affinity(bool abort_on_error) const override {
70 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
71 "Illegal set affinity operation when not capable");
72 long retval =
73 hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
74 if (retval >= 0) {
75 return 0;
76 }
77 int error = errno;
78 if (abort_on_error) {
79 __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
80 }
81 return error;
82 }
83#if KMP_OS_WINDOWS
84 int set_process_affinity(bool abort_on_error) const override {
85 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
86 "Illegal set process affinity operation when not capable");
87 int error = 0;
88 const hwloc_topology_support *support =
89 hwloc_topology_get_support(__kmp_hwloc_topology);
90 if (support->cpubind->set_proc_cpubind) {
91 int retval;
92 retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
93 HWLOC_CPUBIND_PROCESS);
94 if (retval >= 0)
95 return 0;
96 error = errno;
97 if (abort_on_error)
98 __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
99 }
100 return error;
101 }
102#endif
103 int get_proc_group() const override {
104 int group = -1;
105#if KMP_OS_WINDOWS
106 if (__kmp_num_proc_groups == 1) {
107 return 1;
108 }
109 for (int i = 0; i < __kmp_num_proc_groups; i++) {
110 // On Windows, the long type is always 32 bits, so each 64-bit group mask
    // spans ith_ulong indices 2*i and 2*i+1
111 unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
112 unsigned long second_32_bits =
113 hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
114 if (first_32_bits == 0 && second_32_bits == 0) {
115 continue;
116 }
117 if (group >= 0) {
118 return -1;
119 }
120 group = i;
121 }
122#endif /* KMP_OS_WINDOWS */
123 return group;
124 }
125 };
126 void determine_capable(const char *var) override {
127 const hwloc_topology_support *topology_support;
128 if (__kmp_hwloc_topology == NULL) {
129 if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
130 __kmp_hwloc_error = TRUE;
131 if (__kmp_affinity_verbose)
132 KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
133 }
134 if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
135 __kmp_hwloc_error = TRUE;
136 if (__kmp_affinity_verbose)
137 KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
138 }
139 }
140 topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
141 // Is the system capable of setting/getting this thread's affinity?
142 // Also, is topology discovery possible? (pu indicates ability to discover
143 // processing units). And finally, were there no errors when calling any
144 // hwloc_* API functions?
145 if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
146 topology_support->cpubind->get_thisthread_cpubind &&
147 topology_support->discovery->pu && !__kmp_hwloc_error) {
148 // enables affinity according to KMP_AFFINITY_CAPABLE() macro
149 KMP_AFFINITY_ENABLE(TRUE);
150 } else {
151 // indicate that hwloc didn't work and disable affinity
152 __kmp_hwloc_error = TRUE;
153 KMP_AFFINITY_DISABLE();
154 }
155 }
156 void bind_thread(int which) override {
157 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
158 "Illegal set affinity operation when not capable");
159 KMPAffinity::Mask *mask;
160 KMP_CPU_ALLOC_ON_STACK(mask);
161 KMP_CPU_ZERO(mask);
162 KMP_CPU_SET(which, mask);
163 __kmp_set_system_affinity(mask, TRUE);
164 KMP_CPU_FREE_FROM_STACK(mask);
165 }
166 KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
167 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
168 KMPAffinity::Mask *allocate_mask_array(int num) override {
169 return new Mask[num];
170 }
171 void deallocate_mask_array(KMPAffinity::Mask *array) override {
172 Mask *hwloc_array = static_cast<Mask *>(array);
173 delete[] hwloc_array;
174 }
175 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
176 int index) override {
177 Mask *hwloc_array = static_cast<Mask *>(array);
178 return &(hwloc_array[index]);
179 }
180 api_type get_api_type() const override { return HWLOC; }
181};
182#endif /* KMP_USE_HWLOC */
183
184#if KMP_OS_LINUX || KMP_OS_FREEBSD
185#if KMP_OS_LINUX
186/* On some of the older OSes that we build on, these constants aren't present
187   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
188   all systems of the same arch where they are defined, and they cannot
189   change; they are effectively set in stone. */
190#include <sys/syscall.h>
191#if KMP_ARCH_X86 || KMP_ARCH_ARM
192#ifndef __NR_sched_setaffinity
193#define __NR_sched_setaffinity 241
194#elif __NR_sched_setaffinity != 241
195#error Wrong code for setaffinity system call.
196#endif /* __NR_sched_setaffinity */
197#ifndef __NR_sched_getaffinity
198#define __NR_sched_getaffinity 242
199#elif __NR_sched_getaffinity != 242
200#error Wrong code for getaffinity system call.
201#endif /* __NR_sched_getaffinity */
202#elif KMP_ARCH_AARCH64
203#ifndef __NR_sched_setaffinity
204#define __NR_sched_setaffinity 122
205#elif __NR_sched_setaffinity != 122
206#error Wrong code for setaffinity system call.
207#endif /* __NR_sched_setaffinity */
208#ifndef __NR_sched_getaffinity
209#define __NR_sched_getaffinity 123
210#elif __NR_sched_getaffinity != 123
211#error Wrong code for getaffinity system call.
212#endif /* __NR_sched_getaffinity */
213#elif KMP_ARCH_RISCV64
214#ifndef __NR_sched_setaffinity
215#define __NR_sched_setaffinity 122
216#elif __NR_sched_setaffinity != 122
217#error Wrong code for setaffinity system call.
218#endif /* __NR_sched_setaffinity */
219#ifndef __NR_sched_getaffinity
220#define __NR_sched_getaffinity 123
221#elif __NR_sched_getaffinity != 123
222#error Wrong code for getaffinity system call.
223#endif /* __NR_sched_getaffinity */
224#elif KMP_ARCH_X86_64
225#ifndef __NR_sched_setaffinity
226#define __NR_sched_setaffinity 203
227#elif __NR_sched_setaffinity != 203
228#error Wrong code for setaffinity system call.
229#endif /* __NR_sched_setaffinity */
230#ifndef __NR_sched_getaffinity
231#define __NR_sched_getaffinity 204
232#elif __NR_sched_getaffinity != 204
233#error Wrong code for getaffinity system call.
234#endif /* __NR_sched_getaffinity */
235#elif KMP_ARCH_PPC64
236#ifndef __NR_sched_setaffinity
237#define __NR_sched_setaffinity 222
238#elif __NR_sched_setaffinity != 222
239#error Wrong code for setaffinity system call.
240#endif /* __NR_sched_setaffinity */
241#ifndef __NR_sched_getaffinity
242#define __NR_sched_getaffinity 223
243#elif __NR_sched_getaffinity != 223
244#error Wrong code for getaffinity system call.
245#endif /* __NR_sched_getaffinity */
246#elif KMP_ARCH_MIPS
247#ifndef __NR_sched_setaffinity
248#define __NR_sched_setaffinity 4239
249#elif __NR_sched_setaffinity != 4239
250#error Wrong code for setaffinity system call.
251#endif /* __NR_sched_setaffinity */
252#ifndef __NR_sched_getaffinity
253#define __NR_sched_getaffinity 4240
254#elif __NR_sched_getaffinity != 4240
255#error Wrong code for getaffinity system call.
256#endif /* __NR_sched_getaffinity */
257#elif KMP_ARCH_MIPS64
258#ifndef __NR_sched_setaffinity
259#define __NR_sched_setaffinity 5195
260#elif __NR_sched_setaffinity != 5195
261#error Wrong code for setaffinity system call.
262#endif /* __NR_sched_setaffinity */
263#ifndef __NR_sched_getaffinity
264#define __NR_sched_getaffinity 5196
265#elif __NR_sched_getaffinity != 5196
266#error Wrong code for getaffinity system call.
267#endif /* __NR_sched_getaffinity */
268#else
269#error Unknown or unsupported architecture
270#endif /* KMP_ARCH_* */
271#elif KMP_OS_FREEBSD
272#include <pthread.h>
273#include <pthread_np.h>
274#endif
275class KMPNativeAffinity : public KMPAffinity {
276 class Mask : public KMPAffinity::Mask {
277 typedef unsigned long mask_t;
278 typedef decltype(__kmp_affin_mask_size) mask_size_type;
279 static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
280 static const mask_t ONE = 1;
281 mask_size_type get_num_mask_types() const {
282 return __kmp_affin_mask_size / sizeof(mask_t);
283 }
284
285 public:
286 mask_t *mask;
287 Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
288 ~Mask() {
289 if (mask)
290 __kmp_free(mask);
291 }
292 void set(int i) override {
293 mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
294 }
295 bool is_set(int i) const override {
296 return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
297 }
298 void clear(int i) override {
299 mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
300 }
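    // A minimal worked example of the index math above (assuming 64-bit
    // mask_t, so BITS_PER_MASK_T == 64): set(70) updates word 70 / 64 == 1
    // and bit 70 % 64 == 6, i.e. it performs
    //   mask[1] |= (ONE << 6);
    // and is_set(70) tests exactly that bit.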
301 void zero() override {
302 mask_size_type e = get_num_mask_types();
303 for (mask_size_type i = 0; i < e; ++i)
304 mask[i] = (mask_t)0;
305 }
306 void copy(const KMPAffinity::Mask *src) override {
307 const Mask *convert = static_cast<const Mask *>(src);
308 mask_size_type e = get_num_mask_types();
309 for (mask_size_type i = 0; i < e; ++i)
310 mask[i] = convert->mask[i];
311 }
312 void bitwise_and(const KMPAffinity::Mask *rhs) override {
313 const Mask *convert = static_cast<const Mask *>(rhs);
314 mask_size_type e = get_num_mask_types();
315 for (mask_size_type i = 0; i < e; ++i)
316 mask[i] &= convert->mask[i];
317 }
318 void bitwise_or(const KMPAffinity::Mask *rhs) override {
319 const Mask *convert = static_cast<const Mask *>(rhs);
320 mask_size_type e = get_num_mask_types();
321 for (mask_size_type i = 0; i < e; ++i)
322 mask[i] |= convert->mask[i];
323 }
324 void bitwise_not() override {
325 mask_size_type e = get_num_mask_types();
326 for (mask_size_type i = 0; i < e; ++i)
327 mask[i] = ~(mask[i]);
328 }
329 int begin() const override {
330 int retval = 0;
331 while (retval < end() && !is_set(retval))
332 ++retval;
333 return retval;
334 }
335 int end() const override {
336 int e;
337 __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
338 return e;
339 }
340 int next(int previous) const override {
341 int retval = previous + 1;
342 while (retval < end() && !is_set(retval))
343 ++retval;
344 return retval;
345 }
346 int get_system_affinity(bool abort_on_error) override {
347 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
348 "Illegal get affinity operation when not capable");
349#if KMP_OS_LINUX
350 long retval =
351 syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
352#elif KMP_OS_FREEBSD
353 int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
354 reinterpret_cast<cpuset_t *>(mask));
355 int retval = (r == 0 ? 0 : -1);
356#endif
357 if (retval >= 0) {
358 return 0;
359 }
360 int error = errno;
361 if (abort_on_error) {
362 __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
363 }
364 return error;
365 }
366 int set_system_affinity(bool abort_on_error) const override {
367 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
368 "Illegal set affinity operation when not capable");
369#if KMP_OS_LINUX
370 long retval =
371 syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
372#elif KMP_OS_FREEBSD
373 int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
374 reinterpret_cast<cpuset_t *>(mask));
375 int retval = (r == 0 ? 0 : -1);
376#endif
377 if (retval >= 0) {
378 return 0;
379 }
380 int error = errno;
381 if (abort_on_error) {
382 __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
383 }
384 return error;
385 }
386 };
387 void determine_capable(const char *env_var) override {
388 __kmp_affinity_determine_capable(env_var);
389 }
390 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
391 KMPAffinity::Mask *allocate_mask() override {
392 KMPNativeAffinity::Mask *retval = new Mask();
393 return retval;
394 }
395 void deallocate_mask(KMPAffinity::Mask *m) override {
396 KMPNativeAffinity::Mask *native_mask =
397 static_cast<KMPNativeAffinity::Mask *>(m);
398 delete native_mask;
399 }
400 KMPAffinity::Mask *allocate_mask_array(int num) override {
401 return new Mask[num];
402 }
403 void deallocate_mask_array(KMPAffinity::Mask *array) override {
404 Mask *linux_array = static_cast<Mask *>(array);
405 delete[] linux_array;
406 }
407 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
408 int index) override {
409 Mask *linux_array = static_cast<Mask *>(array);
410 return &(linux_array[index]);
411 }
412 api_type get_api_type() const override { return NATIVE_OS; }
413};
414#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
415
416#if KMP_OS_WINDOWS
417class KMPNativeAffinity : public KMPAffinity {
418 class Mask : public KMPAffinity::Mask {
419 typedef ULONG_PTR mask_t;
420 static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
421 mask_t *mask;
422
423 public:
424 Mask() {
425 mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
426 }
427 ~Mask() {
428 if (mask)
429 __kmp_free(mask);
430 }
431 void set(int i) override {
432 mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
433 }
434 bool is_set(int i) const override {
435 return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
436 }
437 void clear(int i) override {
438 mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
439 }
440 void zero() override {
441 for (int i = 0; i < __kmp_num_proc_groups; ++i)
442 mask[i] = 0;
443 }
444 void copy(const KMPAffinity::Mask *src) override {
445 const Mask *convert = static_cast<const Mask *>(src);
446 for (int i = 0; i < __kmp_num_proc_groups; ++i)
447 mask[i] = convert->mask[i];
448 }
449 void bitwise_and(const KMPAffinity::Mask *rhs) override {
450 const Mask *convert = static_cast<const Mask *>(rhs);
451 for (int i = 0; i < __kmp_num_proc_groups; ++i)
452 mask[i] &= convert->mask[i];
453 }
454 void bitwise_or(const KMPAffinity::Mask *rhs) override {
455 const Mask *convert = static_cast<const Mask *>(rhs);
456 for (int i = 0; i < __kmp_num_proc_groups; ++i)
457 mask[i] |= convert->mask[i];
458 }
459 void bitwise_not() override {
460 for (int i = 0; i < __kmp_num_proc_groups; ++i)
461 mask[i] = ~(mask[i]);
462 }
463 int begin() const override {
464 int retval = 0;
465 while (retval < end() && !is_set(retval))
466 ++retval;
467 return retval;
468 }
469 int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
470 int next(int previous) const override {
471 int retval = previous + 1;
472 while (retval < end() && !is_set(retval))
473 ++retval;
474 return retval;
475 }
476 int set_process_affinity(bool abort_on_error) const override {
477 if (__kmp_num_proc_groups <= 1) {
478 if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
479 DWORD error = GetLastError();
480 if (abort_on_error) {
481 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
482 __kmp_msg_null);
483 }
484 return error;
485 }
486 }
487 return 0;
488 }
489 int set_system_affinity(bool abort_on_error) const override {
490 if (__kmp_num_proc_groups > 1) {
491 // Check for a valid mask.
492 GROUP_AFFINITY ga;
493 int group = get_proc_group();
494 if (group < 0) {
495 if (abort_on_error) {
496 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
497 }
498 return -1;
499 }
500 // Transform the bit vector into a GROUP_AFFINITY struct
501 // and make the system call to set affinity.
502 ga.Group = group;
503 ga.Mask = mask[group];
504 ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
505
506 KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
507 if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
508 DWORD error = GetLastError();
509 if (abort_on_error) {
510 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
511 __kmp_msg_null);
512 }
513 return error;
514 }
515 } else {
516 if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
517 DWORD error = GetLastError();
518 if (abort_on_error) {
519 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
520 __kmp_msg_null);
521 }
522 return error;
523 }
524 }
525 return 0;
526 }
527 int get_system_affinity(bool abort_on_error) override {
528 if (__kmp_num_proc_groups > 1) {
529 this->zero();
530 GROUP_AFFINITY ga;
531 KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
532 if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
533 DWORD error = GetLastError();
534 if (abort_on_error) {
535 __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
536 KMP_ERR(error), __kmp_msg_null);
537 }
538 return error;
539 }
540 if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
541 (ga.Mask == 0)) {
542 return -1;
543 }
544 mask[ga.Group] = ga.Mask;
545 } else {
546 mask_t newMask, sysMask, retval;
547 if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
548 DWORD error = GetLastError();
549 if (abort_on_error) {
550 __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
551 KMP_ERR(error), __kmp_msg_null);
552 }
553 return error;
554 }
555 retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
556 if (!retval) {
557 DWORD error = GetLastError();
558 if (abort_on_error) {
559 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
560 KMP_ERR(error), __kmp_msg_null);
561 }
562 return error;
563 }
564 newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
565 if (!newMask) {
566 DWORD error = GetLastError();
567 if (abort_on_error) {
568 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
569 KMP_ERR(error), __kmp_msg_null);
570 }
571 }
572 *mask = retval;
573 }
574 return 0;
575 }
576 int get_proc_group() const override {
577 int group = -1;
578 if (__kmp_num_proc_groups == 1) {
579 return 1;
580 }
581 for (int i = 0; i < __kmp_num_proc_groups; i++) {
582 if (mask[i] == 0)
583 continue;
584 if (group >= 0)
585 return -1;
586 group = i;
587 }
588 return group;
589 }
590 };
591 void determine_capable(const char *env_var) override {
592 __kmp_affinity_determine_capable(env_var);
593 }
594 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
595 KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
596 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
597 KMPAffinity::Mask *allocate_mask_array(int num) override {
598 return new Mask[num];
599 }
600 void deallocate_mask_array(KMPAffinity::Mask *array) override {
601 Mask *windows_array = static_cast<Mask *>(array);
602 delete[] windows_array;
603 }
604 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
605 int index) override {
606 Mask *windows_array = static_cast<Mask *>(array);
607 return &(windows_array[index]);
608 }
609 api_type get_api_type() const override { return NATIVE_OS; }
610};
611#endif /* KMP_OS_WINDOWS */
612#endif /* KMP_AFFINITY_SUPPORTED */
613
614// Describe an attribute for a level in the machine topology
615struct kmp_hw_attr_t {
616 int core_type : 8;
617 int core_eff : 8;
618 unsigned valid : 1;
619 unsigned reserved : 15;
620
621 static const int UNKNOWN_CORE_EFF = -1;
622
623 kmp_hw_attr_t()
624 : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
625 valid(0), reserved(0) {}
626 void set_core_type(kmp_hw_core_type_t type) {
627 valid = 1;
628 core_type = type;
629 }
630 void set_core_eff(int eff) {
631 valid = 1;
632 core_eff = eff;
633 }
634 kmp_hw_core_type_t get_core_type() const {
635 return (kmp_hw_core_type_t)core_type;
636 }
637 int get_core_eff() const { return core_eff; }
638 bool is_core_type_valid() const {
639 return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
640 }
641 bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
642 operator bool() const { return valid; }
643 void clear() {
644 core_type = KMP_HW_CORE_TYPE_UNKNOWN;
645 core_eff = UNKNOWN_CORE_EFF;
646 valid = 0;
647 }
648 bool contains(const kmp_hw_attr_t &other) const {
649 if (!valid && !other.valid)
650 return true;
651 if (valid && other.valid) {
652 if (other.is_core_type_valid()) {
653 if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
654 return false;
655 }
656 if (other.is_core_eff_valid()) {
657 if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
658 return false;
659 }
660 return true;
661 }
662 return false;
663 }
664 bool operator==(const kmp_hw_attr_t &rhs) const {
665 return (rhs.valid == valid && rhs.core_eff == core_eff &&
666 rhs.core_type == core_type);
667 }
668 bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
669};
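// An illustrative sketch of the matching rule implemented by contains() above:
// a thread's observed attributes contain a requested attribute set when every
// valid field of the request matches.
//   kmp_hw_attr_t wanted, observed;
//   wanted.set_core_eff(1);    // constrain only the core efficiency
//   observed.set_core_eff(1);
//   observed.contains(wanted); // true: the one valid field of 'wanted' matches
//   wanted.set_core_eff(0);
//   observed.contains(wanted); // false: the efficiencies differ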
670
671class kmp_hw_thread_t {
672public:
673 static const int UNKNOWN_ID = -1;
674 static int compare_ids(const void *a, const void *b);
675 static int compare_compact(const void *a, const void *b);
676 int ids[KMP_HW_LAST];
677 int sub_ids[KMP_HW_LAST];
678 bool leader;
679 int os_id;
680 kmp_hw_attr_t attrs;
681
682 void print() const;
683 void clear() {
684 for (int i = 0; i < (int)KMP_HW_LAST; ++i)
685 ids[i] = UNKNOWN_ID;
686 leader = false;
687 attrs.clear();
688 }
689};
690
691class kmp_topology_t {
692
693 struct flags_t {
694 int uniform : 1;
695 int reserved : 31;
696 };
697
698 int depth;
699
700  // The following arrays are all 'depth' long. They are allocated to hold
701  // up to KMP_HW_LAST objects so that layers can be added later without
702  // reallocating any array.
703
704  // Ordered array of the types in the topology
705 kmp_hw_t *types;
706
707  // Quick-access topology ratios. For non-uniform topologies, each entry
708  // holds the maximum number of item A's per item B,
709  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
710 int *ratio;
711
712 // Storage containing the absolute number of each topology layer
713 int *count;
714
715 // The number of core efficiencies. This is only useful for hybrid
716 // topologies. Core efficiencies will range from 0 to num efficiencies - 1
717 int num_core_efficiencies;
718 int num_core_types;
719 kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
720
721 // The hardware threads array
722 // hw_threads is num_hw_threads long
723 // Each hw_thread's ids and sub_ids are depth deep
724 int num_hw_threads;
725 kmp_hw_thread_t *hw_threads;
726
727 // Equivalence hash where the key is the hardware topology item
728 // and the value is the equivalent hardware topology type in the
729  // types[] array. If the value is KMP_HW_UNKNOWN, then there is no
730  // known equivalence for the topology type.
731 kmp_hw_t equivalent[KMP_HW_LAST];
732
733 // Flags describing the topology
734 flags_t flags;
735
736 // Insert a new topology layer after allocation
737 void _insert_layer(kmp_hw_t type, const int *ids);
738
739#if KMP_GROUP_AFFINITY
740 // Insert topology information about Windows Processor groups
741 void _insert_windows_proc_groups();
742#endif
743
744 // Count each item & get the num x's per y
745 // e.g., get the number of cores and the number of threads per core
746 // for each (x, y) in (KMP_HW_* , KMP_HW_*)
747 void _gather_enumeration_information();
748
749 // Remove layers that don't add information to the topology.
750 // This is done by having the layer take on the id = UNKNOWN_ID (-1)
751 void _remove_radix1_layers();
752
753 // Find out if the topology is uniform
754 void _discover_uniformity();
755
756 // Set all the sub_ids for each hardware thread
757 void _set_sub_ids();
758
759 // Set global affinity variables describing the number of threads per
760 // core, the number of packages, the number of cores per package, and
761 // the number of cores.
762 void _set_globals();
763
764 // Set the last level cache equivalent type
765 void _set_last_level_cache();
766
767 // Return the number of cores with a particular attribute, 'attr'.
768 // If 'find_all' is true, then find all cores on the machine, otherwise find
769 // all cores per the layer 'above'
770 int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
771 bool find_all = false) const;
772
773public:
774 // Force use of allocate()/deallocate()
775 kmp_topology_t() = delete;
776 kmp_topology_t(const kmp_topology_t &t) = delete;
777 kmp_topology_t(kmp_topology_t &&t) = delete;
778 kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
779 kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
780
781 static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
782 static void deallocate(kmp_topology_t *);
783
784 // Functions used in create_map() routines
785 kmp_hw_thread_t &at(int index) {
786 KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
787 return hw_threads[index];
788 }
789 const kmp_hw_thread_t &at(int index) const {
790 KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
791 return hw_threads[index];
792 }
793 int get_num_hw_threads() const { return num_hw_threads; }
794 void sort_ids() {
795 qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
796 kmp_hw_thread_t::compare_ids);
797 }
798  // Check whether the hardware ids are unique; return true if they are,
799  // false otherwise
800 bool check_ids() const;
801
802 // Function to call after the create_map() routine
803 void canonicalize();
804 void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
805
806  // Functions used after canonicalize() is called
807 bool filter_hw_subset();
808 bool is_close(int hwt1, int hwt2, int level) const;
809 bool is_uniform() const { return flags.uniform; }
810  // Return the equivalent type for 'type' in this topology;
811  // returns KMP_HW_UNKNOWN when there is no equivalent type
812 kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
813 // Set type1 = type2
814 void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
815 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
816 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
817 kmp_hw_t real_type2 = equivalent[type2];
818 if (real_type2 == KMP_HW_UNKNOWN)
819 real_type2 = type2;
820 equivalent[type1] = real_type2;
821 // This loop is required since any of the types may have been set to
822 // be equivalent to type1. They all must be checked and reset to type2.
823 KMP_FOREACH_HW_TYPE(type) {
824 if (equivalent[type] == type1) {
825 equivalent[type] = real_type2;
826 }
827 }
828 }
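  // An illustrative sketch of the equivalence map (assuming the KMP_HW_TILE
  // and KMP_HW_CORE enumerators from kmp.h, with KMP_HW_CORE present as a
  // layer in types[]): after
  //   __kmp_topology->set_equivalent_type(KMP_HW_TILE, KMP_HW_CORE);
  // get_equivalent_type(KMP_HW_TILE) returns KMP_HW_CORE and
  // get_level(KMP_HW_TILE) resolves to the core layer.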
829  // Calculate the number of objects at level1 per object at level2
830  // (e.g., the number of threads per core)
831 int calculate_ratio(int level1, int level2) const {
832 KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
833 KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
834 int r = 1;
835 for (int level = level1; level > level2; --level)
836 r *= ratio[level];
837 return r;
838 }
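  // Worked example for calculate_ratio(), using the ratio array sketched
  // above ([ 4 packages | 6 cores / package | 2 threads / core ]) with
  // levels 0 = package, 1 = core, 2 = thread:
  //   calculate_ratio(2, 0) == ratio[2] * ratio[1] == 2 * 6 == 12
  // i.e., 12 hardware threads per package.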
839 int get_ratio(int level) const {
840 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
841 return ratio[level];
842 }
843 int get_depth() const { return depth; };
844 kmp_hw_t get_type(int level) const {
845 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
846 return types[level];
847 }
848 int get_level(kmp_hw_t type) const {
849 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
850 int eq_type = equivalent[type];
851 if (eq_type == KMP_HW_UNKNOWN)
852 return -1;
853 for (int i = 0; i < depth; ++i)
854 if (types[i] == eq_type)
855 return i;
856 return -1;
857 }
858 int get_count(int level) const {
859 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
860 return count[level];
861 }
862 // Return the total number of cores with attribute 'attr'
863 int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
864 return _get_ncores_with_attr(attr, -1, true);
865 }
866 // Return the number of cores with attribute
867 // 'attr' per topology level 'above'
868 int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
869 return _get_ncores_with_attr(attr, above, false);
870 }
871
872#if KMP_AFFINITY_SUPPORTED
873 void sort_compact() {
874 qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
875 kmp_hw_thread_t::compare_compact);
876 }
877#endif
878 void print(const char *env_var = "KMP_AFFINITY") const;
879 void dump() const;
880};
881extern kmp_topology_t *__kmp_topology;
882
883class kmp_hw_subset_t {
884 const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
885
886public:
887 // Describe a machine topology item in KMP_HW_SUBSET
888 struct item_t {
889 kmp_hw_t type;
890 int num_attrs;
891 int num[MAX_ATTRS];
892 int offset[MAX_ATTRS];
893 kmp_hw_attr_t attr[MAX_ATTRS];
894 };
895  // Parentheses around max avoid accidental expansion of the Windows max macro.
896 const static int USE_ALL = (std::numeric_limits<int>::max)();
897
898private:
899 int depth;
900 int capacity;
901 item_t *items;
902 kmp_uint64 set;
903 bool absolute;
904 // The set must be able to handle up to KMP_HW_LAST number of layers
905 KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
906  // Comparator for sorting the KMP_HW_SUBSET items into topology order.
907  // All unknown topology types end up at the beginning of the subset.
908 static int hw_subset_compare(const void *i1, const void *i2) {
909 kmp_hw_t type1 = ((const item_t *)i1)->type;
910 kmp_hw_t type2 = ((const item_t *)i2)->type;
911 int level1 = __kmp_topology->get_level(type1);
912 int level2 = __kmp_topology->get_level(type2);
913 return level1 - level2;
914 }
915
916public:
917 // Force use of allocate()/deallocate()
918 kmp_hw_subset_t() = delete;
919 kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
920 kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
921 kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
922 kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
923
924 static kmp_hw_subset_t *allocate() {
925 int initial_capacity = 5;
926 kmp_hw_subset_t *retval =
927 (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
928 retval->depth = 0;
929 retval->capacity = initial_capacity;
930 retval->set = 0ull;
931 retval->absolute = false;
932 retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
933 return retval;
934 }
935 static void deallocate(kmp_hw_subset_t *subset) {
936 __kmp_free(subset->items);
937 __kmp_free(subset);
938 }
939 void set_absolute() { absolute = true; }
940 bool is_absolute() const { return absolute; }
941 void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
942 for (int i = 0; i < depth; ++i) {
943 // Found an existing item for this layer type
944 // Add the num, offset, and attr to this item
945 if (items[i].type == type) {
946 int idx = items[i].num_attrs++;
947 if ((size_t)idx >= MAX_ATTRS)
948 return;
949 items[i].num[idx] = num;
950 items[i].offset[idx] = offset;
951 items[i].attr[idx] = attr;
952 return;
953 }
954 }
955 if (depth == capacity - 1) {
956 capacity *= 2;
957 item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
958 for (int i = 0; i < depth; ++i)
959 new_items[i] = items[i];
960 __kmp_free(items);
961 items = new_items;
962 }
963 items[depth].num_attrs = 1;
964 items[depth].type = type;
965 items[depth].num[0] = num;
966 items[depth].offset[0] = offset;
967 items[depth].attr[0] = attr;
968 depth++;
969 set |= (1ull << type);
970 }
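  // A hedged usage sketch of this container (the KMP_HW_SUBSET string parser
  // lives elsewhere; the KMP_HW_SOCKET/KMP_HW_CORE/KMP_HW_THREAD enumerators
  // come from kmp.h). A subset equivalent to "1s,4c,2t" could be built as:
  //   kmp_hw_attr_t no_attr; // default-constructed == no attribute constraint
  //   kmp_hw_subset_t *s = kmp_hw_subset_t::allocate();
  //   s->push_back(1, KMP_HW_SOCKET, 0, no_attr);
  //   s->push_back(4, KMP_HW_CORE, 0, no_attr);
  //   s->push_back(2, KMP_HW_THREAD, 0, no_attr);
  //   s->sort(); // requires __kmp_topology to be initialized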
971 int get_depth() const { return depth; }
972 const item_t &at(int index) const {
973 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
974 return items[index];
975 }
976 item_t &at(int index) {
977 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
978 return items[index];
979 }
980 void remove(int index) {
981 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
982 set &= ~(1ull << items[index].type);
983 for (int j = index + 1; j < depth; ++j) {
984 items[j - 1] = items[j];
985 }
986 depth--;
987 }
988 void sort() {
989 KMP_DEBUG_ASSERT(__kmp_topology);
990 qsort(items, depth, sizeof(item_t), hw_subset_compare);
991 }
992 bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
993 void dump() const {
994 printf("**********************\n");
995 printf("*** kmp_hw_subset: ***\n");
996 printf("* depth: %d\n", depth);
997 printf("* items:\n");
998 for (int i = 0; i < depth; ++i) {
999 printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1000 for (int j = 0; j < items[i].num_attrs; ++j) {
1001 printf(" num: %d, offset: %d, attr: ", items[i].num[j],
1002 items[i].offset[j]);
1003 if (!items[i].attr[j]) {
1004 printf(" (none)\n");
1005 } else {
1006 printf(
1007 " core_type = %s, core_eff = %d\n",
1008 __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1009 items[i].attr[j].get_core_eff());
1010 }
1011 }
1012 }
1013 printf("* set: 0x%llx\n", set);
1014 printf("* absolute: %d\n", absolute);
1015 printf("**********************\n");
1016 }
1017};
1018extern kmp_hw_subset_t *__kmp_hw_subset;
1019
1020/* A structure for holding machine-specific hierarchy info to be computed once
1021 at init. This structure represents a mapping of threads to the actual machine
1022 hierarchy, or to our best guess at what the hierarchy might be, for the
1023 purpose of performing an efficient barrier. In the worst case, when there is
1024 no machine hierarchy information, it produces a tree suitable for a barrier,
1025 similar to the tree used in the hyper barrier. */
1026class hierarchy_info {
1027public:
1028 /* Good default values for number of leaves and branching factor, given no
1029 affinity information. Behaves a bit like hyper barrier. */
1030 static const kmp_uint32 maxLeaves = 4;
1031 static const kmp_uint32 minBranch = 4;
1037 kmp_uint32 maxLevels;
1038
1043 kmp_uint32 depth;
1044 kmp_uint32 base_num_threads;
1045 enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1046 volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1047 // 2=initialization in progress
1048 volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1049
1054 kmp_uint32 *numPerLevel;
1055 kmp_uint32 *skipPerLevel;
1056
1057 void deriveLevels() {
1058 int hier_depth = __kmp_topology->get_depth();
1059 for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1060 numPerLevel[level] = __kmp_topology->get_ratio(i);
1061 }
1062 }
1063
1064 hierarchy_info()
1065 : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1066
1067 void fini() {
1068 if (!uninitialized && numPerLevel) {
1069 __kmp_free(numPerLevel);
1070 numPerLevel = NULL;
1071 uninitialized = not_initialized;
1072 }
1073 }
1074
1075 void init(int num_addrs) {
1076 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1077 &uninitialized, not_initialized, initializing);
1078 if (bool_result == 0) { // Wait for initialization
1079 while (TCR_1(uninitialized) != initialized)
1080 KMP_CPU_PAUSE();
1081 return;
1082 }
1083 KMP_DEBUG_ASSERT(bool_result == 1);
1084
1085    /* Explicitly initialize the data fields here to prevent use of dirty values
1086       observed when the static library is re-initialized multiple times (e.g.,
1087       when a non-OpenMP thread repeatedly launches/joins a thread that uses
1088       OpenMP). */
1089 depth = 1;
1090 resizing = 0;
1091 maxLevels = 7;
1092 numPerLevel =
1093 (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1094 skipPerLevel = &(numPerLevel[maxLevels]);
1095 for (kmp_uint32 i = 0; i < maxLevels;
1096 ++i) { // init numPerLevel[*] to 1 item per level
1097 numPerLevel[i] = 1;
1098 skipPerLevel[i] = 1;
1099 }
1100
1101    // Derive per-level counts from the machine topology, if available
1102 if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1103 deriveLevels();
1104 } else {
1105 numPerLevel[0] = maxLeaves;
1106 numPerLevel[1] = num_addrs / maxLeaves;
1107 if (num_addrs % maxLeaves)
1108 numPerLevel[1]++;
1109 }
1110
1111 base_num_threads = num_addrs;
1112 for (int i = maxLevels - 1; i >= 0;
1113 --i) // count non-empty levels to get depth
1114 if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1115 depth++;
1116
1117 kmp_uint32 branch = minBranch;
1118 if (numPerLevel[0] == 1)
1119 branch = num_addrs / maxLeaves;
1120 if (branch < minBranch)
1121 branch = minBranch;
1122 for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1123 while (numPerLevel[d] > branch ||
1124 (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1125 if (numPerLevel[d] & 1)
1126 numPerLevel[d]++;
1127 numPerLevel[d] = numPerLevel[d] >> 1;
1128 if (numPerLevel[d + 1] == 1)
1129 depth++;
1130 numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1131 }
1132 if (numPerLevel[0] == 1) {
1133 branch = branch >> 1;
1134 if (branch < 4)
1135 branch = minBranch;
1136 }
1137 }
1138
1139 for (kmp_uint32 i = 1; i < depth; ++i)
1140 skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1141 // Fill in hierarchy in the case of oversubscription
1142 for (kmp_uint32 i = depth; i < maxLevels; ++i)
1143 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1144
1145 uninitialized = initialized; // One writer
1146 }
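  // Worked example of the skipPerLevel computation at the end of init():
  // if the final counts are numPerLevel = { 2, 4, 4, 1, ... } (2 threads per
  // core, 4 cores per package, 4 packages), then
  //   skipPerLevel = { 1, 1*2, 2*4, 8*4, ... } = { 1, 2, 8, 32, ... }
  // i.e., skipPerLevel[i] is the number of hardware threads covered by one
  // node at level i; levels beyond 'depth' keep doubling it to absorb
  // oversubscription.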
1147
1148 // Resize the hierarchy if nproc changes to something larger than before
1149 void resize(kmp_uint32 nproc) {
1150 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1151 while (bool_result == 0) { // someone else is trying to resize
1152 KMP_CPU_PAUSE();
1153 if (nproc <= base_num_threads) // happy with other thread's resize
1154 return;
1155 else // try to resize
1156 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1157 }
1158 KMP_DEBUG_ASSERT(bool_result != 0);
1159 if (nproc <= base_num_threads)
1160 return; // happy with other thread's resize
1161
1162 // Calculate new maxLevels
1163 kmp_uint32 old_sz = skipPerLevel[depth - 1];
1164 kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1165 // First see if old maxLevels is enough to contain new size
1166 for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1167 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1168 numPerLevel[i - 1] *= 2;
1169 old_sz *= 2;
1170 depth++;
1171 }
1172 if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1173 while (nproc > old_sz) {
1174 old_sz *= 2;
1175 incs++;
1176 depth++;
1177 }
1178 maxLevels += incs;
1179
1180 // Resize arrays
1181 kmp_uint32 *old_numPerLevel = numPerLevel;
1182 kmp_uint32 *old_skipPerLevel = skipPerLevel;
1183 numPerLevel = skipPerLevel = NULL;
1184 numPerLevel =
1185 (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1186 skipPerLevel = &(numPerLevel[maxLevels]);
1187
1188 // Copy old elements from old arrays
1189 for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1190      // copy the old per-level values
1191 numPerLevel[i] = old_numPerLevel[i];
1192 skipPerLevel[i] = old_skipPerLevel[i];
1193 }
1194
1195 // Init new elements in arrays to 1
1196 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1197 // init numPerLevel[*] to 1 item per level
1198 numPerLevel[i] = 1;
1199 skipPerLevel[i] = 1;
1200 }
1201
1202 // Free old arrays
1203 __kmp_free(old_numPerLevel);
1204 }
1205
1206 // Fill in oversubscription levels of hierarchy
1207 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1208 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1209
1210 base_num_threads = nproc;
1211 resizing = 0; // One writer
1212 }
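  // Sketch of the growth rule in resize(): each added level doubles the leaf
  // capacity. If skipPerLevel[depth - 1] was 32 and nproc grows to 100, the
  // loops above add levels until the capacity reaches 128 (32 -> 64 -> 128),
  // updating depth (and, if the old maxLevels is exhausted, reallocating the
  // arrays with a larger maxLevels).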
1213};
1214#endif // KMP_AFFINITY_H