Offloading to GPU
Objectives
Understand and be able to offload code to device
Understand different constructs to create parallelism on device
Host-device model
Since version 4.0 , OpenMP supports heterogeneous systems. OpenMP uses
TARGET
construct to offload execution from the host to the target
device(s), and hence the directive name. In addition, the associated
data needs to be transferred to the device(s) as well. Once
transferred, the target device owns the data and accesses by the host
during the execution of the target region is forbidden.
A host/device model is generally used by OpenMP for offloading:
normally there is only one single host: e.g. CPU
one or multiple target devices of the same kind: e.g. coprocessor, GPU, FPGA, …
unless with unified shared memory, the host and device have separate memory address space
Note
Under the following condition, there will be NO data transfer to the device
data already exists on the device from a previous execution
Device execution model
The execution on the device is host-centric
1.the host creates the data environments on the device(s)
2.the host maps data to the device data environment
3.the host offloads OpenMP target regions to the target device to be executed
4.the host transfers data from the device to the host
5.the host destroys the data environment on the device
TARGET construct
The TARGET
construct consists of a target directive and an
execution region. It is used to transfer both the control flow from
the host to the device and the data between the host and device.
Syntax
#pragma omp target [clauses]
structured-block
clause:
if([ target:] scalar-expression)
device(integer-expression)
private(list)
firstprivate(list)
map([map-type:] list)
is_device_ptr(list)
defaultmap(tofrom:scalar)
nowait
depend(dependence-type : list)
!$omp target [clauses]
structured-block
!$omp end target
clause:
if([ target:] scalar-expression)
device(integer-expression)
private(list)
firstprivate(list)
map([map-type:] list)
is_device_ptr(list)
defaultmap(tofrom:scalar)
nowait
depend(dependence-type : list)
Exercise00: Hello world with OpenMP offloading
1/* Copyright (c) 2019 CSC Training */
2/* Copyright (c) 2021 ENCCS */
3#include <stdio.h>
4
5#ifdef _OPENMP
6#include <omp.h>
7#endif
8
9int main()
10{
11 int num_devices = omp_get_num_devices();
12 printf("Number of available devices %d\n", num_devices);
13
14 #pragma omp target
15 {
16 if (omp_is_initial_device()) {
17 printf("Running on host\n");
18 } else {
19 int nteams= omp_get_num_teams();
20 int nthreads= omp_get_num_threads();
21 printf("Running on device with %d teams in total and %d threads in each team\n",nteams,nthreads);
22 }
23 }
24
25}
26
1! Copyright (c) 2019 CSC Training
2! Copyright (c) 2021 ENCCS
3program hello
4
5#ifdef _OPENMP
6 use omp_lib
7#endif
8 implicit none
9
10 integer :: num_devices,nteams,nthreads
11 logical :: initial_device
12
13 num_devices = omp_get_num_devices()
14 print *, "Number of available devices", num_devices
15
16 !$omp target map(nteams,nthreads)
17 initial_device = omp_is_initial_device()
18 nteams= omp_get_num_teams()
19 nthreads= omp_get_num_threads()
20 !$omp end target
21 if (initial_device) then
22 write(*,*) "Running on host"
23 else
24 write(*,'(A,I4,A,I4,A)') "Running on device with ",nteams, " teams in total and ", nthreads, " threads in each team"
25 end if
26
27end program
Exercise01: Adding TARGET
construct
1/* Copyright (c) 2019 CSC Training */
2/* Copyright (c) 2021 ENCCS */
3#include <stdio.h>
4#include <math.h>
5#define NX 102400
6
7int main(void)
8{
9 double vecA[NX],vecB[NX],vecC[NX];
10 double r=0.2;
11
12/* Initialization of vectors */
13 for (int i = 0; i < NX; i++) {
14 vecA[i] = pow(r, i);
15 vecB[i] = 1.0;
16 }
17
18/* Dot product of two vectors */
19 for (int i = 0; i < NX; i++) {
20 vecC[i] = vecA[i] * vecB[i];
21 }
22
23 double sum = 0.0;
24 /* Calculate the sum */
25 for (int i = 0; i < NX; i++) {
26 sum += vecC[i];
27 }
28 printf("The sum is: %8.6f \n", sum);
29 return 0;
30}
1! Copyright (c) 2019 CSC Training
2! Copyright (c) 2021 ENCCS
3program dotproduct
4 implicit none
5
6 integer, parameter :: nx = 102400
7 real, parameter :: r=0.2
8
9 real, dimension(nx) :: vecA,vecB,vecC
10 real :: sum
11 integer :: i
12
13 ! Initialization of vectors
14 do i = 1, nx
15 vecA(i) = r**(i-1)
16 vecB(i) = 1.0
17 end do
18
19 ! Dot product of two vectors
20 do i = 1, nx
21 vecC(i) = vecA(i) * vecB(i)
22 end do
23
24 sum = 0.0
25 ! Calculate the sum
26 do i = 1, nx
27 sum = vecC(i) + sum
28 end do
29
30 write(*,*) 'The sum is: ', sum
31
32end program dotproduct
Solution
1/* Copyright (c) 2019 CSC Training */
2/* Copyright (c) 2021 ENCCS */
3#include <stdio.h>
4#include <math.h>
5#define NX 102400
6
7int main(void)
8{
9 double vecA[NX],vecB[NX],vecC[NX];
10 double r=0.2;
11
12/* Initialization of vectors */
13 for (int i = 0; i < NX; i++) {
14 vecA[i] = pow(r, i);
15 vecB[i] = 1.0;
16 }
17
18/* dot product of two vectors */
19 #pragma omp target
20 for (int i = 0; i < NX; i++) {
21 vecC[i] = vecA[i] * vecB[i];
22 }
23
24 double sum = 0.0;
25 /* calculate the sum */
26 for (int i = 0; i < NX; i++) {
27 sum += vecC[i];
28 }
29 printf("The sum is: %8.6f \n", sum);
30 return 0;
31}
1! Copyright (c) 2019 CSC Training
2! Copyright (c) 2021 ENCCS
3program dotproduct
4 implicit none
5
6 integer, parameter :: nx = 102400
7 real, parameter :: r=0.2
8
9 real, dimension(nx) :: vecA,vecB,vecC
10 real :: sum
11 integer :: i
12
13 ! Initialization of vectors
14 do i = 1, nx
15 vecA(i) = r**(i-1)
16 vecB(i) = 1.0
17 end do
18
19 ! Dot product of two vectors
20 !$omp target
21 do i = 1, nx
22 vecC(i) = vecA(i) * vecB(i)
23 end do
24 !$omp end target
25
26 sum = 0.0
27 ! Calculate the sum
28 do i = 1, nx
29 sum = vecC(i) + sum
30 end do
31
32 write(*,*) 'The sum is: ', sum
33
34end program dotproduct
Creating parallelism on the target device
The TARGET
construct transfers the control flow to the device is
sequential and synchronous, and it is because OpenMP separates offload
and parallelism. One needs to explicitly create parallel regions on
the target device to make efficient use of the device(s).
TEAMS construct
Syntax
#pragma omp teams [clauses]
structured-block
clause:
num_teams(integer-expression)
thread_limit(integer-expression)
default(shared | none)
private(list)
firstprivate(list)
shared(list)
reduction(reduction-identifier : list)
!$omp teams [clauses]
structured-block
!$omp end teams
clause:
num_teams(integer-expression)
thread_limit(integer-expression)
default(shared | none)
private(list)
firstprivate(list)
shared(list)
reduction(reduction-identifier : list)
The TEAMS
construct creates a league of one-thread teams where
the thread of each team executes concurrently and is in its own contention group.
The number of teams created is implementation defined, but is no more than
num_teams if specified in the clause. The maximum number of threads participating in
the contention group that each team initiates is implementation defined as well,
unless thread_limit is specified in the clause.
Threads in a team can synchronize but no synchronization among teams.
The TEAMS
construct must be contained in a TARGET
construct,
without any other directives, statements or declarations in between.
Note
A contention group is the set of all threads that are descendants of an initial thread. An initial thread is never a descendant of another initial thread.
DISTRIBUTE construct
Syntax
#pragma omp distribute [clauses]
for-loops
clause:
private(list)
firstprivate(list)
lastprivate(list)
collapse(n)
dist_schedule(kind[, chunk_size])
!$omp distribute [clauses]
do-loops
[!$omp end distribute]
clause:
private(list)
firstprivate(list)
lastprivate(list)
collapse(n)
dist_schedule(kind[, chunk_size])
The DISTRIBUTE
construct is a coarsely worksharing construct
which distributes the loop iterations across the master threads in the teams,
but no worksharing within the threads in one team. No implicit barrier
at the end of the construct and no guarantee about the order the teams
will execute.
To further create threads within each team and distritute loop iterations across threads,
we will use the PARALLEL FOR/DO
constructs.
PARALLEL construct
Syntax
#pragma omp parallel [clauses]
structured-block
clause:
num_threads(integer-expression)
default(shared | none)
private(list)
firstprivate(list)
shared(list)
reduction(reduction-identifier : list)
!$omp parallel [clauses]
structured-block
!$omp end parallel
clause:
num_threads(integer-expression)
default(private | firstprivate | shared | none)
private(list)
firstprivate(list)
shared(list)
copyin(list)
reduction(reduction-identifier : list)
FOR/DO construct
Syntax
#pragma omp for [clauses]
structured-block
clause:
private(list)
firstprivate(list)
lastprivate(list)
reduction(reduction-identifier : list)
schedule(kind[, chunk_size])
collapse(n)
!$omp do [clauses]
structured-block
[!$omp end do]
clause:
private(list)
firstprivate(list)
lastprivate(list)
reduction(reduction-identifier : list)
schedule(kind[, chunk_size])
collapse(n)
Keypoints
- TEAMS DISTRIBUTE construct
Coarse-grained parallelism
Spawns multiple single-thread teams
No synchronization of threads in different teams
- PARALLEL FOR/DO construct
Fine-grained parallelism
Spawns many threads in one team
Threads can synchronize in a team
Exercise02: Adding constructs for parallelism
1/* Copyright (c) 2019 CSC Training */
2// Copyright (c) 2021 ENCCS
3#include <stdio.h>
4#include <math.h>
5#define NX 102400
6
7int main(void)
8{
9 double vecA[NX],vecB[NX],vecC[NX];
10 double r=0.2;
11
12/* Initialization of vectors */
13 for (int i = 0; i < NX; i++) {
14 vecA[i] = pow(r, i);
15 vecB[i] = 1.0;
16 }
17
18/* dot product of two vectors */
19 #pragma omp target
20 for (int i = 0; i < NX; i++) {
21 vecC[i] = vecA[i] * vecB[i];
22 }
23
24 double sum = 0.0;
25 /* calculate the sum */
26 for (int i = 0; i < NX; i++) {
27 sum += vecC[i];
28 }
29 printf("The sum is: %8.6f \n", sum);
30 return 0;
31}
1! Copyright (c) 2019 CSC Training
2! Copyright (c) 2021 ENCCS
3program dotproduct
4 implicit none
5
6 integer, parameter :: nx = 102400
7 real, parameter :: r=0.2
8
9 real, dimension(nx) :: vecA,vecB,vecC
10 real :: sum
11 integer :: i
12
13 ! Initialization of vectors
14 do i = 1, nx
15 vecA(i) = r**(i-1)
16 vecB(i) = 1.0
17 end do
18
19 ! Dot product of two vectors
20 !$omp target
21 do i = 1, nx
22 vecC(i) = vecA(i) * vecB(i)
23 end do
24 !$omp end target
25
26 sum = 0.0
27 ! Calculate the sum
28 do i = 1, nx
29 sum = vecC(i) + sum
30 end do
31
32 write(*,*) 'The sum is: ', sum
33
34end program dotproduct
Solution
1/* Copyright (c) 2019 CSC Training */
2/* Copyright (c) 2021 ENCCS */
3#include <stdio.h>
4#include <math.h>
5#define NX 102400
6
7int main(void)
8{
9 double vecA[NX],vecB[NX],vecC[NX];
10 double r=0.2;
11
12/* Initialization of vectors */
13 for (int i = 0; i < NX; i++) {
14 vecA[i] = pow(r, i);
15 vecB[i] = 1.0;
16 }
17
18/* dot product of two vectors */
19 #pragma omp target teams distribute parallel for
20 for (int i = 0; i < NX; i++) {
21 vecC[i] = vecA[i] * vecB[i];
22 }
23
24 double sum = 0.0;
25 /* calculate the sum */
26 for (int i = 0; i < NX; i++) {
27 sum += vecC[i];
28 }
29 printf("The sum is: %8.6f \n", sum);
30 return 0;
31}
1! Copyright (c) 2019 CSC Training
2! Copyright (c) 2021 ENCCS
3program dotproduct
4 implicit none
5
6 integer, parameter :: nx = 102400
7 real, parameter :: r=0.2
8
9 real, dimension(nx) :: vecA,vecB,vecC
10 real :: sum
11 integer :: i
12
13 ! Initialization of vectors
14 do i = 1, nx
15 vecA(i) = r**(i-1)
16 vecB(i) = 1.0
17 end do
18
19 ! Dot product of two vectors
20 !$omp target teams distribute parallel do
21 do i = 1, nx
22 vecC(i) = vecA(i) * vecB(i)
23 end do
24 !$omp end target teams distribute parallel do
25
26 sum = 0.0
27 ! Calculate the sum
28 do i = 1, nx
29 sum = vecC(i) + sum
30 end do
31
32 write(*,*) 'The sum is: ', sum
33
34end program dotproduct
Exercise03: TEAMS
vs PARALLEL
constructs
We start from the “hello world” example, and by adding TEAMS
and PARALLEL
constructs
to compare the differences. Furthermore, using num_teams
and thread_limit
to limit
the number of teams and threads to be generated.
1/* Copyright (c) 2019 CSC Training */
2/* Copyright (c) 2021 ENCCS */
3#include <stdio.h>
4
5#ifdef _OPENMP
6#include <omp.h>
7#endif
8
9int main()
10{
11 int num_devices = omp_get_num_devices();
12 printf("Number of available devices %d\n", num_devices);
13
14 #pragma omp target
15 {
16 if (omp_is_initial_device()) {
17 printf("Running on host\n");
18 } else {
19 int nteams= omp_get_num_teams();
20 int nthreads= omp_get_num_threads();
21 printf("Running on device with %d teams in total and %d threads in each team\n",nteams,nthreads);
22 }
23 }
24
25}
26
1! Copyright (c) 2019 CSC Training
2! Copyright (c) 2021 ENCCS
3program hello
4
5#ifdef _OPENMP
6 use omp_lib
7#endif
8 implicit none
9
10 integer :: num_devices,nteams,nthreads
11 logical :: initial_device
12
13 num_devices = omp_get_num_devices()
14 print *, "Number of available devices", num_devices
15
16 !$omp target map(nteams,nthreads)
17 initial_device = omp_is_initial_device()
18 nteams= omp_get_num_teams()
19 nthreads= omp_get_num_threads()
20 !$omp end target
21 if (initial_device) then
22 write(*,*) "Running on host"
23 else
24 write(*,'(A,I4,A,I4,A)') "Running on device with ",nteams, " teams in total and ", nthreads, " threads in each team"
25 end if
26
27end program
Solution
1/* Copyright (c) 2019 CSC Training */
2/* Copyright (c) 2021 ENCCS */
3#include <stdio.h>
4
5#ifdef _OPENMP
6#include <omp.h>
7#endif
8
9int main()
10{
11 int num_devices = omp_get_num_devices();
12 printf("Number of available devices %d\n", num_devices);
13
14 #pragma omp target
15 #pragma omp teams num_teams(2) thread_limit(3)
16 #pragma omp parallel
17 {
18 if (omp_is_initial_device()) {
19 printf("Running on host\n");
20 } else {
21 int nteams= omp_get_num_teams();
22 int nthreads= omp_get_num_threads();
23 printf("Running on device with %d teams in total and %d threads in each team\n",nteams,nthreads);
24 }
25 }
26
27}
28
1! Copyright (c) 2019 CSC Training
2! Copyright (c) 2021 ENCCS
3program hello
4
5#ifdef _OPENMP
6 use omp_lib
7#endif
8 implicit none
9
10 integer :: num_devices,nteams,nthreads
11 logical :: initial_device
12
13 num_devices = omp_get_num_devices()
14 print *, "Number of available devices", num_devices
15
16 !$omp target map(nteams,nthreads)
17 !$omp teams num_teams(2) thread_limit(3)
18 !$omp parallel
19 initial_device = omp_is_initial_device()
20 nteams= omp_get_num_teams()
21 nthreads= omp_get_num_threads()
22 !$omp end parallel
23 !$omp end teams
24 !$omp end target
25 if (initial_device) then
26 write(*,*) "Running on host"
27 else
28 write(*,'(A,I4,A,I4,A)') "Running on device with ",nteams, " teams in total and ", nthreads, " threads in each team"
29 end if
30
31end program
Composite directive
It is convenient to use the composite construct
the code is more portable
let the compiler figures out the loop tiling since each compiler supports different levels of parallelism
possible to reach good performance without composite directives
Syntax
#pragma omp target teams distribute parallel for [clauses]
for-loops
!$omp target teams distribute parallel do [clauses]
do-loops
[!$omp end target teams distribute parallel do ]
Exercise: Offloading
We will start from the serial version of the heat diffusion and step by step add the directives for offloading and parallelism on the target device. Compare the performance to understand the effects of different directives. We will focus on the core operation only for now, i.e. subroutine evolve in the file core.cpp or core.F90.
For C/C++, you need to add a data mapping clause
map(currdata[0:(nx+2)*(ny+2)],prevdata[0:(nx+2)*(ny+2)])
step 1: adding the TARGET
construct
step 2: adding the TARGET TEAMS
construct
step 3: adding the TARGET TEAMS DISTRIBUTE
construct
step 4: adding the TARGET TEAMS DISTRIBUTE PARALLEL FOR/DO
construct
Use a small number of iterations, e.g. ./heat_serial 800 800 10, otherwise it may take a long time to finish.
The exercise is under /content/exercise/offloading
Solution
1// Copyright (c) 2019 CSC Training
2// Copyright (c) 2021 ENCCS
3// Main solver routines for heat equation solver
4
5#include "heat.h"
6
7// Update the temperature values using five-point stencil
8// Arguments:
9// curr: current temperature values
10// prev: temperature values from previous time step
11// a: diffusivity
12// dt: time step
13void evolve(field *curr, field *prev, double a, double dt)
14{
15 // Help the compiler avoid being confused by the structs
16 double *currdata = curr->data.data();
17 double *prevdata = prev->data.data();
18 int nx = curr->nx;
19 int ny = curr->ny;
20
21 // Determine the temperature field at next time step
22 // As we have fixed boundary conditions, the outermost gridpoints
23 // are not updated.
24 double dx2 = prev->dx * prev->dx;
25 double dy2 = prev->dy * prev->dy;
26 #pragma omp target teams distribute parallel for \
27 map(currdata[0:(nx+2)*(ny+2)],prevdata[0:(nx+2)*(ny+2)])
28 for (int i = 1; i < nx + 1; i++) {
29 for (int j = 1; j < ny + 1; j++) {
30 int ind = i * (ny + 2) + j;
31 int ip = (i + 1) * (ny + 2) + j;
32 int im = (i - 1) * (ny + 2) + j;
33 int jp = i * (ny + 2) + j + 1;
34 int jm = i * (ny + 2) + j - 1;
35 currdata[ind] = prevdata[ind] + a*dt*
36 ((prevdata[ip] - 2.0*prevdata[ind] + prevdata[im]) / dx2 +
37 (prevdata[jp] - 2.0*prevdata[ind] + prevdata[jm]) / dy2);
38 }
39 }
40}
1! Copyright (c) 2019 CSC Training
2! Copyright (c) 2021 ENCCS
3! Main solver routines for heat equation solver
4module core
5 use heat
6
7contains
8
9 ! Update the temperature values using five-point stencil
10 ! Arguments:
11 ! curr (type(field)): current temperature values
12 ! prev (type(field)): temperature values from previous time step
13 ! a (real(dp)): diffusivity
14 ! dt (real(dp)): time step
15 subroutine evolve(curr, prev, a, dt)
16
17 implicit none
18
19 type(field),target, intent(inout) :: curr, prev
20 real(dp) :: a, dt
21 integer :: i, j, nx, ny
22 real(dp) :: dx, dy
23 real(dp), pointer, contiguous, dimension(:,:) :: currdata, prevdata
24
25 ! Help the compiler avoid being confused
26 nx = curr%nx
27 ny = curr%ny
28 dx = curr%dx
29 dy = curr%dy
30 currdata => curr%data
31 prevdata => prev%data
32
33 ! Determine the temperature field at next time step As we have
34 ! fixed boundary conditions, the outermost gridpoints are not
35 ! updated.
36
37 !$omp target teams distribute parallel do
38 do j = 1, ny
39 do i = 1, nx
40 currdata(i, j) = prevdata(i, j) + a * dt * &
41 & ((prevdata(i-1, j) - 2.0 * prevdata(i, j) + &
42 & prevdata(i+1, j)) / dx**2 + &
43 & (prevdata(i, j-1) - 2.0 * prevdata(i, j) + &
44 & prevdata(i, j+1)) / dy**2)
45 end do
46 end do
47 !$omp end target teams distribute parallel do
48 end subroutine evolve
49
50end module core