LLVM OpenMP 19.0.0git
kmp_task_reduction_nest.cpp
Go to the documentation of this file.
1// RUN: %libomp-cxx-compile-and-run
2// RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run
3// GCC-5 is needed for OpenMP 4.0 support (taskgroup)
4// XFAIL: gcc-4
5#include <cstdio>
6#include <cmath>
7#include <cassert>
8#include <omp.h>
9
// Total number of loop iterations. Must be a multiple of the number of
// iterations handled per task, which is 2 in the parallel code of this test
// (the task-creating loops advance with `l += 2`).
#define N 10000

// Flag to request lazy (1) or eager (0) allocation of reduction objects.
// Can be overridden on the compiler command line (the second RUN line builds
// with -DFLG=1).
#ifndef FLG
#define FLG 0
#endif
17
18/*
19 // initial user's code that corresponds to pseudo code of the test
20 #pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)
21 {
22 for( int l = 0; l < N; ++l ) {
23 #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)
24 {
25 i += l;
26 if( l%2 )
27 x *= 1.0 / (l + 1);
28 else
29 x *= (l + 1);
30 }
31 }
32
33 #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)
34 {
35 for( int l = 0; l < N; ++l ) {
36 #pragma omp task firstprivate(l) in_reduction(+:j,y) \
37 in_reduction(*:x) in_reduction(-:k)
38 {
39 j += l;
40 k -= l;
41 y += (double)l;
42 if( l%2 )
43 x *= 1.0 / (l + 1);
44 else
45 x *= (l + 1);
46 }
47 #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)
48 {
49 i -= l;
50 k -= l;
51 y += (double)l;
52 }
53 #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)
54 {
55 j += l;
56 if( l%2 )
57 x *= 1.0 / (l + 1);
58 else
59 x *= (l + 1);
60 }
61 }
62 } // inner reduction
63
64 for( int l = 0; l < N; ++l ) {
65 #pragma omp task firstprivate(l) in_reduction(+:j)
66 j += l;
67 }
68 } // outer reduction
69*/
70
71//------------------------------------------------
72// OpenMP runtime library routines
// Hand-written declarations of the runtime entry points used below; the test
// calls them directly in place of compiler-generated code.
#ifdef __cplusplus
extern "C" {
#endif
// Looks up thread-specific reduction data for the thread identified by gtid.
// As used in this test: tg is the handle returned by
// __kmpc_task_reduction_init, or NULL to let the runtime start the lookup from
// the current taskgroup; item is the address of the shared variable or of a
// thread-specific copy of it.
extern void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *item);
// Registers num reduction items for the current taskgroup; data points to an
// array of num _task_red_item_t descriptors. The returned handle is what the
// test later passes as tg to __kmpc_task_reduction_get_th_data.
extern void *__kmpc_task_reduction_init(int gtid, int num, void *data);
// Returns the runtime's global thread number of the calling thread; the test
// always passes NULL as the argument.
extern int __kmpc_global_thread_num(void *);
#ifdef __cplusplus
}
#endif
82
83//------------------------------------------------
84// Compiler-generated code
85
// Descriptor of a single reduction item. The test fills an array of these
// (one element per reduction variable) and hands it to
// __kmpc_task_reduction_init.
typedef struct _task_red_item {
  void *shar;     // shared reduction item
  size_t size;    // size of data item
  void *f_init;   // data initialization routine (NULL: runtime zeroes the copy)
  void *f_fini;   // data finalization routine (NULL: no destructor needed)
  void *f_comb;   // data combiner routine
  unsigned flags; // the test stores FLG here: lazy (1) or eager (0) allocation
} _task_red_item_t;
94
// int:+ combiner. No init/fini callbacks are needed for this reduction, and
// the same combiner is valid for subtraction.
// Adds the int pointed to by rhs into the int pointed to by lhs.
void __red_int_add_comb(void *lhs, void *rhs)
{
  *(int *)lhs += *(int *)rhs;
}
98
// long long:+ combiner. No init/fini callbacks are needed for this reduction,
// and the same combiner is valid for subtraction.
// Adds the long long pointed to by rhs into the long long pointed to by lhs.
void __red_llong_add_comb(void *lhs, void *rhs)
{
  *(long long *)lhs += *(long long *)rhs;
}
102
// double:* initializer (this reduction needs no fini callback).
// Sets the double pointed to by data to 1.0, the identity of multiplication.
void __red_dbl_mul_init(void *data)
{
  *(double *)data = 1.0;
}
// double:* combiner.
// Multiplies the double pointed to by lhs by the double pointed to by rhs.
void __red_dbl_mul_comb(void *lhs, void *rhs)
{
  *(double *)lhs *= *(double *)rhs;
}
108
// double:+ combiner. No init/fini callbacks are needed for this reduction.
// Adds the double pointed to by rhs into the double pointed to by lhs.
void __red_dbl_add_comb(void *lhs, void *rhs)
{
  *(double *)lhs += *(double *)rhs;
}
112
113// ==============================
114
115void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
116{
117 for( int l = 0; l < N; ++l ) {
118 *pi += l;
119 if( l%2 )
120 *px *= 1.0 / (l + 1);
121 else
122 *px *= (l + 1);
123 }
124 for( int l = 0; l < N; ++l ) {
125 *pj += l;
126 *pk -= l;
127 *py += (double)l;
128 if( l%2 )
129 *px *= 1.0 / (l + 1);
130 else
131 *px *= (l + 1);
132
133 *pi -= l;
134 *pk -= l;
135 *py += (double)l;
136
137 *pj += l;
138 if( l%2 )
139 *px *= 1.0 / (l + 1);
140 else
141 *px *= (l + 1);
142 }
143 for( int l = 0; l < N; ++l ) {
144 *pj += l;
145 }
146}
147
148//------------------------------------------------
149// Test case
150int main()
151{
152 int nthreads = omp_get_max_threads();
153 int err = 0;
154 void** ptrs = (void**)malloc(nthreads*sizeof(void*));
155
156 // user's code ======================================
157 // variables for serial calculations:
158 int is = 3;
159 long long js = -9999999;
160 double xs = 99999.0;
161 long long ks = 99999999;
162 double ys = -99999999.0;
163 // variables for parallel calculations:
164 int ip = 3;
165 long long jp = -9999999;
166 double xp = 99999.0;
167 long long kp = 99999999;
168 double yp = -99999999.0;
169
170 calc_serial(&is, &js, &xs, &ks, &ys);
171 // ==================================================
172 for (int i = 0; i < nthreads; ++i)
173 ptrs[i] = NULL;
174 #pragma omp parallel
175 {
176 #pragma omp single nowait
177 {
178 // outer taskgroup reduces (i,j,x)
179 #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
180 {
181 _task_red_item_t red_data[3];
182 red_data[0].shar = &ip;
183 red_data[0].size = sizeof(ip);
184 red_data[0].f_init = NULL; // RTL will zero thread-specific objects
185 red_data[0].f_fini = NULL; // no destructors needed
186 red_data[0].f_comb = (void*)&__red_int_add_comb;
187 red_data[0].flags = FLG;
188 red_data[1].shar = &jp;
189 red_data[1].size = sizeof(jp);
190 red_data[1].f_init = NULL; // RTL will zero thread-specific objects
191 red_data[1].f_fini = NULL; // no destructors needed
192 red_data[1].f_comb = (void*)&__red_llong_add_comb;
193 red_data[1].flags = FLG;
194 red_data[2].shar = &xp;
195 red_data[2].size = sizeof(xp);
196 red_data[2].f_init = (void*)&__red_dbl_mul_init;
197 red_data[2].f_fini = NULL; // no destructors needed
198 red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
199 red_data[2].flags = FLG;
200 int gtid = __kmpc_global_thread_num(NULL);
201 void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);
202
203 for( int l = 0; l < N; l += 2 ) {
204 // 2 iterations per task to get correct x value; actually any even
205 // number of iters per task will work, otherwise x looses precision
206 #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
207 {
208 int gtid = __kmpc_global_thread_num(NULL);
209 int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip);
210 double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
211 gtid, tg1, &xp);
212 if (!ptrs[gtid]) ptrs[gtid] = p_xp;
213
214 // user's pseudo-code ==============================
215 *p_ip += l;
216 *p_xp *= (l + 1);
217
218 *p_ip += l + 1;
219 *p_xp *= 1.0 / (l + 2);
220 // ==================================================
221 }
222 }
223 // inner taskgroup reduces (i,k,y), i is same object as in outer one
224 #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
225 {
226 _task_red_item_t red_data[3];
227 red_data[0].shar = &ip;
228 red_data[0].size = sizeof(ip);
229 red_data[0].f_init = NULL; // RTL will zero thread-specific objects
230 red_data[0].f_fini = NULL; // no destructors needed
231 red_data[0].f_comb = (void*)&__red_int_add_comb;
232 red_data[0].flags = FLG;
233 red_data[1].shar = &kp;
234 red_data[1].size = sizeof(kp);
235 red_data[1].f_init = NULL; // RTL will zero thread-specific objects
236 red_data[1].f_fini = NULL; // no destructors needed
237 red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
238 red_data[1].flags = FLG;
239 red_data[2].shar = &yp;
240 red_data[2].size = sizeof(yp);
241 red_data[2].f_init = NULL; // RTL will zero thread-specific objects
242 red_data[2].f_fini = NULL; // no destructors needed
243 red_data[2].f_comb = (void*)&__red_dbl_add_comb;
244 red_data[2].flags = FLG;
245 int gtid = __kmpc_global_thread_num(NULL);
246 void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);
247
248 for( int l = 0; l < N; l += 2 ) {
249 #pragma omp task firstprivate(l)
250 // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
251 {
252 int gtid = __kmpc_global_thread_num(NULL);
253 long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
254 gtid, tg1, &jp);
255 long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
256 gtid, tg2, &kp);
257 double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
258 gtid, tg1, &xp);
259 double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
260 gtid, tg2, &yp);
261 // user's pseudo-code ==============================
262 *p_jp += l;
263 *p_kp -= l;
264 *p_yp += (double)l;
265 *p_xp *= (l + 1);
266
267 *p_jp += l + 1;
268 *p_kp -= l + 1;
269 *p_yp += (double)(l + 1);
270 *p_xp *= 1.0 / (l + 2);
271 // =================================================
272{
273 // the following code is here just to check __kmpc_task_reduction_get_th_data:
274 int tid = omp_get_thread_num();
275 void *addr1;
276 void *addr2;
277 addr1 = __kmpc_task_reduction_get_th_data(gtid, tg1, &xp); // from shared
278 addr2 = __kmpc_task_reduction_get_th_data(gtid, tg1, addr1); // from private
279 if (addr1 != addr2) {
280 #pragma omp atomic
281 ++err;
282 printf("Wrong thread-specific addresses %d s:%p p:%p\n", tid, addr1, addr2);
283 }
284 // from neighbour w/o taskgroup (should start lookup from current tg2)
285 if (tid > 0) {
286 if (ptrs[tid-1]) {
287 addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[tid-1]);
288 if (addr1 != addr2) {
289 #pragma omp atomic
290 ++err;
291 printf("Wrong thread-specific addresses %d s:%p n:%p\n",
292 tid, addr1, addr2);
293 }
294 }
295 } else {
296 if (ptrs[nthreads-1]) {
297 addr2 = __kmpc_task_reduction_get_th_data(gtid, NULL, ptrs[nthreads-1]);
298 if (addr1 != addr2) {
299 #pragma omp atomic
300 ++err;
301 printf("Wrong thread-specific addresses %d s:%p n:%p\n",
302 tid, addr1, addr2);
303 }
304 }
305 }
306 // ----------------------------------------------
307}
308 }
309 #pragma omp task firstprivate(l)
310 // in_reduction(+:y) in_reduction(-:i,k)
311 {
312 int gtid = __kmpc_global_thread_num(NULL);
313 int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
314 gtid, tg2, &ip);
315 long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
316 gtid, tg2, &kp);
317 double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
318 gtid, tg2, &yp);
319
320 // user's pseudo-code ==============================
321 *p_ip -= l;
322 *p_kp -= l;
323 *p_yp += (double)l;
324
325 *p_ip -= l + 1;
326 *p_kp -= l + 1;
327 *p_yp += (double)(l + 1);
328 // =================================================
329 }
330 #pragma omp task firstprivate(l)
331 // in_reduction(+:j) in_reduction(*:x)
332 {
333 int gtid = __kmpc_global_thread_num(NULL);
334 long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
335 gtid, tg1, &jp);
336 double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
337 gtid, tg1, &xp);
338 // user's pseudo-code ==============================
339 *p_jp += l;
340 *p_xp *= (l + 1);
341
342 *p_jp += l + 1;
343 *p_xp *= 1.0 / (l + 2);
344 // =================================================
345 }
346 }
347 } // inner reduction
348
349 for( int l = 0; l < N; l += 2 ) {
350 #pragma omp task firstprivate(l) // in_reduction(+:j)
351 {
352 int gtid = __kmpc_global_thread_num(NULL);
353 long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
354 gtid, tg1, &jp);
355 // user's pseudo-code ==============================
356 *p_jp += l;
357 *p_jp += l + 1;
358 // =================================================
359 }
360 }
361 } // outer reduction
362 } // end single
363 } // end parallel
364 // check results
365#if _DEBUG
366 printf("reduction flags = %u\n", FLG);
367#endif
368 if (ip == is && jp == js && ks == kp &&
369 fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01)
370 printf("passed\n");
371 else
372 printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
373 is, js, xs, ks, ys,
374 ip, jp, xp, kp, yp);
375 return 0;
376}
void * __kmpc_task_reduction_get_th_data(int gtid, void *tg, void *item)
void * __kmpc_task_reduction_init(int gtid, int num, void *data)
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain __itt_id ITT_FORMAT p const __itt_domain __itt_id __itt_timestamp __itt_timestamp ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain ITT_FORMAT p const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_id __itt_string_handle __itt_metadata_type size_t void * data
KMP_ARCH_X86 KMP_ARCH_X86 long double
#define i
Definition: kmp_stub.cpp:87
#define N
#define FLG
void __red_dbl_mul_init(void *data)
int __kmpc_global_thread_num(void *)
void __red_int_add_comb(void *lhs, void *rhs)
void __red_llong_add_comb(void *lhs, void *rhs)
void __red_dbl_mul_comb(void *lhs, void *rhs)
struct _task_red_item _task_red_item_t
void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
void __red_dbl_add_comb(void *lhs, void *rhs)
static int err
Definition: teams-no-par.c:16
int omp_get_max_threads()