doxygen/kmp__task__reduction__nest_8cpp_source.html

// RUN: %libomp-cxx-compile-and-run

// RUN: %libomp-cxx-compile -DFLG=1 && %libomp-run

// GCC-5 is needed for OpenMP 4.0 support (taskgroup)

// XFAIL: gcc-4

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>

#include <omp.h>


// Total number of loop iterations; must be even, because every task in this
// test handles 2 consecutive iterations (l and l + 1)

#define N 10000


// Flag to request lazy (1) or eager (0) allocation of reduction objects

#ifndef FLG

#define FLG 0

#endif


/*

  // initial user's code that corresponds to pseudo code of the test

  #pragma omp taskgroup task_reduction(+:i,j) task_reduction(*:x)

  {

    for( int l = 0; l < N; ++l ) {

      #pragma omp task firstprivate(l) in_reduction(+:i) in_reduction(*:x)

      {

        i += l;

        if( l%2 )

          x *= 1.0 / (l + 1);

        else

          x *= (l + 1);

      }

    }


    #pragma omp taskgroup task_reduction(-:i,k) task_reduction(+:y)

    {

      for( int l = 0; l < N; ++l ) {

        #pragma omp task firstprivate(l) in_reduction(+:j,y) \

            in_reduction(*:x) in_reduction(-:k)

        {

          j += l;

          k -= l;

          y += (double)l;

          if( l%2 )

            x *= 1.0 / (l + 1);

          else

            x *= (l + 1);

        }

        #pragma omp task firstprivate(l) in_reduction(+:y) in_reduction(-:i,k)

        {

          i -= l;

          k -= l;

          y += (double)l;

        }

        #pragma omp task firstprivate(l) in_reduction(+:j) in_reduction(*:x)

        {

          j += l;

          if( l%2 )

            x *= 1.0 / (l + 1);

          else

            x *= (l + 1);

        }

      }

    } // inner reduction


    for( int l = 0; l < N; ++l ) {

      #pragma omp task firstprivate(l) in_reduction(+:j)

        j += l;

    }

  } // outer reduction

*/


//------------------------------------------------

// OpenMP runtime library routines

// Hand-written prototypes of the runtime entry points this test calls
// directly; they get C linkage when the file is compiled as C++.
#ifdef __cplusplus

extern "C" {

#endif

// Called below with a global thread id, a taskgroup handle (or NULL) and the
// address of a reduction item; the returned pointer is cast to the item's type
// and used as the calling thread's accumulator. The test passes both the
// shared address and a previously returned private address as `item`.
// NOTE(review): with tg == NULL the lookup presumably starts from the current
// taskgroup, as the comment in main() suggests -- confirm against the runtime.
extern void* __kmpc_task_reduction_get_th_data(int gtid, void* tg, void* item);

// Called below with a global thread id, the number of reduction items and an
// array of _task_red_item_t descriptors; the returned pointer is used as the
// taskgroup handle for __kmpc_task_reduction_get_th_data.
extern void* __kmpc_task_reduction_init(int gtid, int num, void* data);

// Called below with NULL; the result is used as the gtid argument of the two
// routines above.
extern int __kmpc_global_thread_num(void*);

#ifdef __cplusplus

}

#endif


//------------------------------------------------

// Compiler-generated code


// Descriptor of one reduction item; an array of these is handed to
// __kmpc_task_reduction_init. Field order and types must stay as they are.
struct _task_red_item {
    void *shar;     // address of the shared (original) reduction variable
    size_t size;    // size of the reduction variable in bytes
    void *f_init;   // initializer callback, or NULL
    void *f_fini;   // finalizer callback, or NULL
    void *f_comb;   // combiner callback
    unsigned flags; // set to FLG in this test (lazy (1) / eager (0) allocation)
};

typedef struct _task_red_item _task_red_item_t;


// int:+   no need in init/fini callbacks, valid for subtraction


// Combiner for int sums: adds the int at rhs into the int at lhs.
// rhs is only read.
void __red_int_add_comb(void *lhs, void *rhs) // combiner
{
    int *acc = (int*)lhs;
    const int *part = (const int*)rhs;
    *acc += *part;
}


// long long:+   no need in init/fini callbacks, valid for subtraction


// Combiner for long long sums: adds the long long at rhs into the one at lhs.
// rhs is only read.
void __red_llong_add_comb(void *lhs, void *rhs) // combiner
{
    long long *acc = (long long*)lhs;
    const long long *part = (const long long*)rhs;
    *acc += *part;
}


// double:*   no need in fini callback


// Initializer for double products: stores 1.0 into the double at data.
void __red_dbl_mul_init(void *data) // initializer
{
    double *slot = (double*)data;
    *slot = 1.0;
}


// Combiner for double products: multiplies the double at lhs by the one at
// rhs. rhs is only read.
void __red_dbl_mul_comb(void *lhs, void *rhs) // combiner
{
    double *acc = (double*)lhs;
    const double *part = (const double*)rhs;
    *acc *= *part;
}


// double:+   no need in init/fini callbacks


// Combiner for double sums: adds the double at rhs into the double at lhs.
// rhs is only read.
void __red_dbl_add_comb(void *lhs, void *rhs) // combiner
{
    double *acc = (double*)lhs;
    const double *part = (const double*)rhs;
    *acc += *part;
}


// ==============================


// One x update for iteration `step`: multiply by (step + 1) on even steps and
// by 1.0 / (step + 1) on odd steps.
static void calc_serial_scale_x(double *px, int step)
{
    const int factor = step + 1;
    if (step % 2 != 0)
        *px *= 1.0 / factor;
    else
        *px *= factor;
}

// Serial reference for the test: applies, in program order, the same updates
// to i, j, x, k and y that the task-based code in main() performs.
// Each argument points to the variable to update in place.
void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
{
    int step;

    // first loop: i and x
    for (step = 0; step < N; step++) {
        *pi += step;
        calc_serial_scale_x(px, step);
    }

    // second loop: the three "tasks" of every iteration, one after another
    for (step = 0; step < N; step++) {
        // j, k, y and x
        *pj += step;
        *pk -= step;
        *py += (double)step;
        calc_serial_scale_x(px, step);

        // i, k and y
        *pi -= step;
        *pk -= step;
        *py += (double)step;

        // j and x
        *pj += step;
        calc_serial_scale_x(px, step);
    }

    // third loop: j only
    for (step = 0; step < N; step++)
        *pj += step;
}


//------------------------------------------------

// Test case


// Test driver. Computes reference values with calc_serial(), then performs the
// same updates with explicit tasks that obtain their thread-specific reduction
// objects through __kmpc_task_reduction_init / _get_th_data (two nested
// taskgroups), and compares the results. It also cross-checks the addresses
// returned by __kmpc_task_reduction_get_th_data.
// Returns 0 when everything matches, 1 on a result mismatch, on a failed
// address check, or when the helper array cannot be allocated.
int main()
{
  int nthreads = omp_get_max_threads();
  int err = 0; // number of failed thread-specific address checks
  // ptrs[t] holds the first thread-specific address of x that team thread t
  // obtained in the outer taskgroup; it is used for the "lookup from a
  // neighbour's private address" check below.
  void** ptrs = (void**)malloc(nthreads*sizeof(void*));
  if (ptrs == NULL) {
    printf("failed, cannot allocate %d pointers\n", nthreads);
    return 1;
  }

  // user's code ======================================
  // variables for serial calculations:
  int is = 3;
  long long js = -9999999;
  double xs = 99999.0;
  long long ks = 99999999;
  double ys = -99999999.0;
  // variables for parallel calculations:
  int ip = 3;
  long long jp = -9999999;
  double xp = 99999.0;
  long long kp = 99999999;
  double yp = -99999999.0;

  calc_serial(&is, &js, &xs, &ks, &ys);
  // ==================================================
  for (int i = 0; i < nthreads; ++i)
    ptrs[i] = NULL;
  #pragma omp parallel
  {
    #pragma omp single nowait
    {
      // outer taskgroup reduces (i,j,x)
      #pragma omp taskgroup // task_reduction(+:i,j) task_reduction(*:x)
      {
        _task_red_item_t red_data[3];
        red_data[0].shar = &ip;
        red_data[0].size = sizeof(ip);
        red_data[0].f_init = NULL; // RTL will zero thread-specific objects
        red_data[0].f_fini = NULL; // no destructors needed
        red_data[0].f_comb = (void*)&__red_int_add_comb;
        red_data[0].flags = FLG;
        red_data[1].shar = &jp;
        red_data[1].size = sizeof(jp);
        red_data[1].f_init = NULL; // RTL will zero thread-specific objects
        red_data[1].f_fini = NULL; // no destructors needed
        red_data[1].f_comb = (void*)&__red_llong_add_comb;
        red_data[1].flags = FLG;
        red_data[2].shar = &xp;
        red_data[2].size = sizeof(xp);
        red_data[2].f_init = (void*)&__red_dbl_mul_init;
        red_data[2].f_fini = NULL; // no destructors needed
        red_data[2].f_comb = (void*)&__red_dbl_mul_comb;
        red_data[2].flags = FLG;
        int gtid = __kmpc_global_thread_num(NULL);
        void* tg1 = __kmpc_task_reduction_init(gtid, 3, red_data);

        for( int l = 0; l < N; l += 2 ) {
          // 2 iterations per task to get correct x value; actually any even
          // number of iters per task will work, otherwise x loses precision
          #pragma omp task firstprivate(l) //in_reduction(+:i) in_reduction(*:x)
          {
            int gtid = __kmpc_global_thread_num(NULL);
            int *p_ip = (int*)__kmpc_task_reduction_get_th_data(gtid, tg1, &ip);
            double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                                        gtid, tg1, &xp);
            // Index by the team thread number, not by gtid: ptrs has
            // omp_get_max_threads() slots and is read by team thread number
            // below, while a global thread id need not be below nthreads.
            int tid = omp_get_thread_num();
            if (!ptrs[tid]) ptrs[tid] = p_xp;

            // user's pseudo-code ==============================
            *p_ip += l;
            *p_xp *= (l + 1);

            *p_ip += l + 1;
            *p_xp *= 1.0 / (l + 2);
            // ==================================================
          }
        }
        // inner taskgroup reduces (i,k,y), i is same object as in outer one
        #pragma omp taskgroup // task_reduction(-:i,k) task_reduction(+:y)
        {
          _task_red_item_t red_data[3];
          red_data[0].shar = &ip;
          red_data[0].size = sizeof(ip);
          red_data[0].f_init = NULL; // RTL will zero thread-specific objects
          red_data[0].f_fini = NULL; // no destructors needed
          red_data[0].f_comb = (void*)&__red_int_add_comb;
          red_data[0].flags = FLG;
          red_data[1].shar = &kp;
          red_data[1].size = sizeof(kp);
          red_data[1].f_init = NULL; // RTL will zero thread-specific objects
          red_data[1].f_fini = NULL; // no destructors needed
          red_data[1].f_comb = (void*)&__red_llong_add_comb; // same for + and -
          red_data[1].flags = FLG;
          red_data[2].shar = &yp;
          red_data[2].size = sizeof(yp);
          red_data[2].f_init = NULL; // RTL will zero thread-specific objects
          red_data[2].f_fini = NULL; // no destructors needed
          red_data[2].f_comb = (void*)&__red_dbl_add_comb;
          red_data[2].flags = FLG;
          int gtid = __kmpc_global_thread_num(NULL);
          void* tg2 = __kmpc_task_reduction_init(gtid, 3, red_data);

          for( int l = 0; l < N; l += 2 ) {
            #pragma omp task firstprivate(l)
            // in_reduction(+:j,y) in_reduction(*:x) in_reduction(-:k)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                                                gtid, tg1, &jp);
              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
                                                gtid, tg2, &kp);
              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                                          gtid, tg1, &xp);
              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
                                          gtid, tg2, &yp);
              // user's pseudo-code ==============================
              *p_jp += l;
              *p_kp -= l;
              *p_yp += (double)l;
              *p_xp *= (l + 1);

              *p_jp += l + 1;
              *p_kp -= l + 1;
              *p_yp += (double)(l + 1);
              *p_xp *= 1.0 / (l + 2);
              // =================================================
              {
                // the following code is here just to check
                // __kmpc_task_reduction_get_th_data:
                int tid = omp_get_thread_num();
                void *addr1;
                void *addr2;
                addr1 = __kmpc_task_reduction_get_th_data(
                            gtid, tg1, &xp); // from shared
                addr2 = __kmpc_task_reduction_get_th_data(
                            gtid, tg1, addr1); // from private
                if (addr1 != addr2) {
                  #pragma omp atomic
                    ++err;
                  printf("Wrong thread-specific addresses %d s:%p p:%p\n", tid, addr1, addr2);
                }
                // from neighbour w/o taskgroup (should start lookup from
                // current tg2)
                if (tid > 0) {
                  if (ptrs[tid-1]) {
                    addr2 = __kmpc_task_reduction_get_th_data(
                                gtid, NULL, ptrs[tid-1]);
                    if (addr1 != addr2) {
                      #pragma omp atomic
                        ++err;
                      printf("Wrong thread-specific addresses %d s:%p n:%p\n",
                             tid, addr1, addr2);
                    }
                  }
                } else {
                  if (ptrs[nthreads-1]) {
                    addr2 = __kmpc_task_reduction_get_th_data(
                                gtid, NULL, ptrs[nthreads-1]);
                    if (addr1 != addr2) {
                      #pragma omp atomic
                        ++err;
                      printf("Wrong thread-specific addresses %d s:%p n:%p\n",
                             tid, addr1, addr2);
                    }
                  }
                }
                // ----------------------------------------------
              }
            }
            #pragma omp task firstprivate(l)
            // in_reduction(+:y) in_reduction(-:i,k)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              int *p_ip = (int*)__kmpc_task_reduction_get_th_data(
                                    gtid, tg2, &ip);
              long long *p_kp = (long long*)__kmpc_task_reduction_get_th_data(
                                                gtid, tg2, &kp);
              double *p_yp = (double*)__kmpc_task_reduction_get_th_data(
                                          gtid, tg2, &yp);

              // user's pseudo-code ==============================
              *p_ip -= l;
              *p_kp -= l;
              *p_yp += (double)l;

              *p_ip -= l + 1;
              *p_kp -= l + 1;
              *p_yp += (double)(l + 1);
              // =================================================
            }
            #pragma omp task firstprivate(l)
            // in_reduction(+:j) in_reduction(*:x)
            {
              int gtid = __kmpc_global_thread_num(NULL);
              long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                                                gtid, tg1, &jp);
              double *p_xp = (double*)__kmpc_task_reduction_get_th_data(
                                          gtid, tg1, &xp);
              // user's pseudo-code ==============================
              *p_jp += l;
              *p_xp *= (l + 1);

              *p_jp += l + 1;
              *p_xp *= 1.0 / (l + 2);
              // =================================================
            }
          }
        } // inner reduction

        for( int l = 0; l < N; l += 2 ) {
          #pragma omp task firstprivate(l) // in_reduction(+:j)
          {
            int gtid = __kmpc_global_thread_num(NULL);
            long long *p_jp = (long long*)__kmpc_task_reduction_get_th_data(
                                              gtid, tg1, &jp);
            // user's pseudo-code ==============================
            *p_jp += l;
            *p_jp += l + 1;
            // =================================================
          }
        }
      } // outer reduction
    } // end single
  } // end parallel
  // check results
#if _DEBUG
  printf("reduction flags = %u\n", (unsigned)FLG);
#endif
  // The run passes only if the reduced values match the serial reference and
  // none of the address checks above failed.
  int ok = err == 0 && ip == is && jp == js && ks == kp &&
           fabs(xp - xs) < 0.01 && fabs(yp - ys) < 0.01;
  if (ok) {
    printf("passed\n");
  } else {
    printf("failed,\n ser:(%d %lld %f %lld %f)\n par:(%d %lld %f %lld %f)\n",
      is, js, xs, ks, ys,
      ip, jp, xp, kp, yp);
    if (err)
      printf("thread-specific address check failures: %d\n", err);
  }
  free(ptrs);
  // a non-zero exit status lets the RUN lines detect the failure
  return ok ? 0 : 1;
}


N
#define N
Definition bug54082.c:13

__kmpc_task_reduction_get_th_data
void * __kmpc_task_reduction_get_th_data(int gtid, void *tg, void *item)
Definition kmp_tasking.cpp:2436

__kmpc_task_reduction_init
void * __kmpc_task_reduction_init(int gtid, int num, void *data)
Definition kmp_tasking.cpp:2366

data
void const char const char int ITT_FORMAT __itt_group_sync x void const char ITT_FORMAT __itt_group_sync s void ITT_FORMAT __itt_group_sync p void ITT_FORMAT p void ITT_FORMAT p no args __itt_suppress_mode_t unsigned int void size_t ITT_FORMAT d void ITT_FORMAT p void ITT_FORMAT p __itt_model_site __itt_model_site_instance ITT_FORMAT p __itt_model_task __itt_model_task_instance ITT_FORMAT p void ITT_FORMAT p void ITT_FORMAT p void size_t ITT_FORMAT d void ITT_FORMAT p const wchar_t ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s const char ITT_FORMAT s no args void ITT_FORMAT p size_t ITT_FORMAT d no args const wchar_t const wchar_t ITT_FORMAT s __itt_heap_function void size_t int ITT_FORMAT d __itt_heap_function void ITT_FORMAT p __itt_heap_function void void size_t int ITT_FORMAT d no args no args unsigned int ITT_FORMAT u const __itt_domain __itt_id ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain __itt_id ITT_FORMAT p const __itt_domain __itt_id __itt_timestamp __itt_timestamp ITT_FORMAT lu const __itt_domain __itt_id __itt_id __itt_string_handle ITT_FORMAT p const __itt_domain ITT_FORMAT p const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_string_handle unsigned long long ITT_FORMAT lu const __itt_domain __itt_id __itt_string_handle __itt_metadata_type size_t void * data
Definition ittnotify_static.h:415

double
KMP_ARCH_X86 KMP_ARCH_X86 long double
Definition kmp_atomic.cpp:2069

i
#define i
Definition kmp_stub.cpp:87

FLG
#define FLG
Definition kmp_task_reduction_nest.cpp:15

__red_dbl_mul_init
void __red_dbl_mul_init(void *data)
Definition kmp_task_reduction_nest.cpp:104

__kmpc_global_thread_num
int __kmpc_global_thread_num(void *)

__red_int_add_comb
void __red_int_add_comb(void *lhs, void *rhs)
Definition kmp_task_reduction_nest.cpp:96

__red_llong_add_comb
void __red_llong_add_comb(void *lhs, void *rhs)
Definition kmp_task_reduction_nest.cpp:100

__red_dbl_mul_comb
void __red_dbl_mul_comb(void *lhs, void *rhs)
Definition kmp_task_reduction_nest.cpp:106

_task_red_item_t
struct _task_red_item _task_red_item_t

calc_serial
void calc_serial(int *pi, long long *pj, double *px, long long *pk, double *py)
Definition kmp_task_reduction_nest.cpp:115

main
int main()
Definition kmp_task_reduction_nest.cpp:150

__red_dbl_add_comb
void __red_dbl_add_comb(void *lhs, void *rhs)
Definition kmp_task_reduction_nest.cpp:110

_task_red_item
Definition kmp_task_reduction_nest.cpp:86

_task_red_item::f_init
void * f_init
Definition kmp_task_reduction_nest.cpp:89

_task_red_item::f_fini
void * f_fini
Definition kmp_task_reduction_nest.cpp:90

_task_red_item::shar
void * shar
Definition kmp_task_reduction_nest.cpp:87

_task_red_item::f_comb
void * f_comb
Definition kmp_task_reduction_nest.cpp:91

_task_red_item::flags
unsigned flags
Definition kmp_task_reduction_nest.cpp:92

_task_red_item::size
size_t size
Definition kmp_task_reduction_nest.cpp:88

err
static int err
Definition teams-no-par.c:16

omp_get_max_threads
int omp_get_max_threads()