LLVM OpenMP 19.0.0git
LLVM  OpenMP* Runtime Library Interface

Introduction

This document describes the interface provided by the LLVM  OpenMP\other runtime library to the compiler. Routines that are directly called as simple functions by user code are not currently described here, since their definition is in the OpenMP specification available from http://openmp.org

The aim here is to explain the interface from the compiler to the runtime.

The overall design is described, and each function in the interface has its own description. (At least, that's the ambition, we may not be there yet).

Quickly Building the Runtime

For the impatient, we cover building the runtime as the first topic here.

CMake is used to build the OpenMP runtime. For details and a full list of options for the CMake build system, see README.rst in the source code repository. These instructions will provide the most typical build.

In-LLVM-tree build:.

$ cd where-you-want-to-live
Check out openmp into llvm/projects
$ cd where-you-want-to-build
$ mkdir build && cd build
$ cmake path/to/llvm -DCMAKE_C_COMPILER=<C compiler> -DCMAKE_CXX_COMPILER=<C++ compiler>
$ make omp
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain __itt_id ITT_FORMAT p const __itt_domain __itt_id __itt_timestamp __itt_timestamp ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain ITT_FORMAT p const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_id __itt_string_handle __itt_metadata_type size_t void ITT_FORMAT p const __itt_domain __itt_id __itt_string_handle const wchar_t size_t ITT_FORMAT lu const __itt_domain __itt_id __itt_relation __itt_id ITT_FORMAT p const wchar_t int ITT_FORMAT __itt_group_mark d __itt_event ITT_FORMAT __itt_group_mark d void const wchar_t const wchar_t int ITT_FORMAT __itt_group_sync __itt_group_fsync x void const wchar_t int const wchar_t int int ITT_FORMAT __itt_group_sync __itt_group_fsync x void ITT_FORMAT __itt_group_sync __itt_group_fsync p void ITT_FORMAT __itt_group_sync __itt_group_fsync p void size_t ITT_FORMAT lu no args __itt_obj_prop_t __itt_obj_state_t ITT_FORMAT d const char ITT_FORMAT s const char ITT_FORMAT s __itt_frame ITT_FORMAT p __itt_counter ITT_FORMAT p __itt_counter unsigned long long ITT_FORMAT lu __itt_counter unsigned long long ITT_FORMAT lu __itt_counter __itt_clock_domain unsigned long long void ITT_FORMAT p const wchar_t ITT_FORMAT S __itt_mark_type const wchar_t ITT_FORMAT S __itt_mark_type const char ITT_FORMAT s __itt_mark_type ITT_FORMAT d __itt_caller ITT_FORMAT p __itt_caller ITT_FORMAT p no args const __itt_domain __itt_clock_domain unsigned long long __itt_id ITT_FORMAT lu const __itt_domain __itt_clock_domain unsigned long long __itt_id __itt_id void ITT_FORMAT p const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_clock_domain unsigned long long __itt_id __itt_string_handle __itt_scope ITT_FORMAT d const __itt_domain __itt_scope __itt_string_handle const char size_t ITT_FORMAT lu const __itt_domain __itt_clock_domain unsigned long long __itt_relation __itt_id ITT_FORMAT lu __itt_track_group __itt_string_handle __itt_track_group_type ITT_FORMAT d __itt_track ITT_FORMAT p void int const int int const char int ITT_FORMAT d void void const char * path
#define C

Out-of-LLVM-tree build:

$ cd where-you-want-to-live
Check out openmp
$ cd where-you-want-to-live/openmp
$ mkdir build && cd build
$ cmake path/to/openmp -DCMAKE_C_COMPILER=<C compiler> -DCMAKE_CXX_COMPILER=<C++ compiler>
$ make

Supported RTL Build Configurations

The architectures supported are IA-32 architecture, Intel®  64, and Intel®  Many Integrated Core Architecture. The build configurations supported are shown in the table below.

icc/iclgccclang
Linux\other OSYes(1,5)Yes(2,4)Yes(4,6,7)
FreeBSD\otherYes(1,5)Yes(2,4)Yes(4,6,7,8)
OS X\otherYes(1,3,4)NoYes(4,6,7)
Windows\other OSYes(1,4)NoNo

(1) On IA-32 architecture and Intel®  64, icc/icl versions 12.x are supported (12.1 is recommended).
(2) gcc version 4.7 is supported.
(3) For icc on OS X\other, OS X\other version 10.5.8 is supported.
(4) Intel®  Many Integrated Core Architecture not supported.
(5) On Intel®  Many Integrated Core Architecture, icc/icl versions 13.0 or later are required.
(6) Clang\other version 3.3 is supported.
(7) Clang\other currently does not offer a software-implemented 128 bit extended precision type. Thus, all entry points reliant on this type are removed from the library and cannot be called in the user program. The following functions are not available:

__kmpc_atomic_cmplx16_*
__kmpc_atomic_float16_*
__kmpc_atomic_*_fp

(8) Community contribution provided AS IS, not tested by Intel.

Supported Architectures: IBM(R) Power 7 and Power 8

gccclang
Linux\other OSYes(1,2)Yes(3,4)

(1) On Power 7, gcc version 4.8.2 is supported.
(2) On Power 8, gcc version 4.8.2 is supported.
(3) On Power 7, clang version 3.7 is supported.
(4) On Power 8, clang version 3.7 is supported.

Front-end Compilers that work with this RTL

The following compilers are known to do compatible code generation for this RTL: icc/icl, gcc. Code generation is discussed in more detail later in this document.

Outlining

The runtime interface is based on the idea that the compiler "outlines" sections of code that are to run in parallel into separate functions that can then be invoked in multiple threads. For instance, simple code like this

void foo()
{
#pragma omp parallel
{
... do something ...
}
}
void foo()

is converted into something that looks conceptually like this (where the names used are merely illustrative; the real library function names will be used later after we've discussed some more issues...)

static void outlinedFooBody()
{
... do something ...
}
void foo()
{
__OMP_runtime_fork(outlinedFooBody, (void*)0); // Not the real function name!
}

Addressing shared variables

In real uses of the OpenMP\other API there are normally references from the outlined code to shared variables that are in scope in the containing function. Therefore the containing function must be able to address these variables. The runtime supports two alternate ways of doing this.

Current Technique

The technique currently supported by the runtime library is to receive a separate pointer to each shared variable that can be accessed from the outlined function. This is what is shown in the example below.

We hope soon to provide an alternative interface to support the alternate implementation described in the next section. The alternative implementation has performance advantages for small parallel regions that have many shared variables.

Future Technique

The idea is to treat the outlined function as though it were a lexically nested function, and pass it a single argument which is the pointer to the parent's stack frame. Provided that the compiler knows the layout of the parent frame when it is generating the outlined function it can then access the up-level variables at appropriate offsets from the parent frame. This is a classical compiler technique from the 1960s to support languages like Algol (and its descendants) that support lexically nested functions.

The main benefit of this technique is that there is no code required at the fork point to marshal the arguments to the outlined function. Since the runtime knows statically how many arguments must be passed to the outlined function, it can easily copy them to the thread's stack frame. Therefore the performance of the fork code is independent of the number of shared variables that are accessed by the outlined function.

If it is hard to determine the stack layout of the parent while generating the outlined code, it is still possible to use this approach by collecting all of the variables in the parent that are accessed from outlined functions into a single struct which is placed on the stack, and whose address is passed to the outlined functions. In this way the offsets of the shared variables are known (since they are inside the struct) without needing to know the complete layout of the parent stack-frame. From the point of view of the runtime either of these techniques is equivalent, since in either case it only has to pass a single argument to the outlined function to allow it to access shared variables.

A scheme like this is how gcc\other generates outlined functions.

Library Interfaces

The library functions used for specific parts of the OpenMP\other language implementation are documented in different modules.

Examples

Work Sharing Example

This example shows the code generated for a parallel for with reduction and dynamic scheduling.

extern float foo( void );
int main () {
int i;
float r = 0.0;
#pragma omp parallel for schedule(dynamic) reduction(+:r)
for ( i = 0; i < 10; i ++ ) {
r += foo();
}
}
#define i
Definition: kmp_stub.cpp:87
int main()
Definition: test-touch.c:21

The transformed code looks like this.

extern float foo( void );
int main () {
static int zero = 0;
auto int gtid;
auto float r = 0.0;
__kmpc_begin( & loc3, 0 );
// The gtid is not actually required in this example so could be omitted;
// We show its initialization here because it is often required for calls into
// the runtime and should be locally cached like this.
gtid = __kmpc_global thread num( & loc3 );
__kmpc_fork call( & loc7, 1, main_7_parallel_3, & r );
__kmpc_end( & loc0 );
return 0;
}
struct main_10_reduction_t_5 { float r_10_rpr; };
static kmp_critical_name lck = { 0 };
static ident_t loc10; // loc10.flags should contain KMP_IDENT_ATOMIC_REDUCE bit set
// if compiler has generated an atomic reduction.
void main_7_parallel_3( int *gtid, int *btid, float *r_7_shp ) {
auto int i_7_pr;
auto int lower, upper, liter, incr;
auto struct main_10_reduction_t_5 reduce;
reduce.r_10_rpr = 0.F;
liter = 0;
__kmpc_dispatch_init_4( & loc7,*gtid, 35, 0, 9, 1, 1 );
while ( __kmpc_dispatch_next_4( & loc7, *gtid, & liter, & lower, & upper, & incr ) ) {
for( i_7_pr = lower; upper >= i_7_pr; i_7_pr ++ )
reduce.r_10_rpr += foo();
}
switch( __kmpc_reduce_nowait( & loc10, *gtid, 1, 4, & reduce, main_10_reduce_5, & lck ) ) {
case 1:
*r_7_shp += reduce.r_10_rpr;
__kmpc_end_reduce_nowait( & loc10, *gtid, & lck );
break;
case 2:
__kmpc_atomic_float4_add( & loc10, *gtid, r_7_shp, reduce.r_10_rpr );
break;
default:;
}
}
void main_10_reduce_5( struct main_10_reduction_t_5 *reduce_lhs,
struct main_10_reduction_t_5 *reduce_rhs )
{
reduce_lhs->r_10_rpr += reduce_rhs->r_10_rpr;
}
KMP_EXPORT void __kmpc_begin(ident_t *, kmp_int32 flags)
KMP_EXPORT void __kmpc_end(ident_t *)
KMP_EXPORT void __kmpc_end_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck)
KMP_EXPORT kmp_int32 __kmpc_reduce_nowait(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void(*reduce_func)(void *lhs_data, void *rhs_data), kmp_critical_name *lck)
void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, enum sched_type schedule, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk)
int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last, kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st)
void __kmpc_atomic_float4_add(ident_t *id_ref, int gtid, kmp_real32 *lhs, kmp_real32 rhs)
omp_lock_t lck
Definition: omp_lock.c:7