OpenACC - The Portland Group


Compilers & Tools for HPC
January 2014
www.pgroup.com
Who is PGI?
The Portland Group (PGI)
Founded in 1989 – 20+ years in the HPC business
NVIDIA's acquisition of PGI closed on July 29, 2013
Compilers & SW Development Tools
PGI-developed Compilers, Debugger, Performance Profiler
3rd party Integrated Development Environments (IDEs)
HPC & Technical Computing Market
Nat’l Labs, Research Labs, Research Universities, Oil & Gas,
Aerospace, Pharmaceutical, …
2
PGI Installations
PGI has over 25,000 users at over 5,000 sites worldwide
3
NVIDIA and PGI
World-class HPC companies need world-class compiler teams
NVIDIA + PGI = Integrated HPC system compilers
As part of NVIDIA, PGI will:
Continue to create world-class HPC Fortran/C/C++ compilers for CPU+Accelerator systems
Accelerate development and propagation of OpenACC and CUDA Fortran
Increase velocity on Accelerator-enablement of HPC applications
4
C99, C++, Fortran 2003 Compilers
Optimizing, Vectorizing, Parallelizing
Graphical Parallel Tools
PGDBG® debugger
PGPROF® profiler
AMD, Intel, NVIDIA Processors
PGI Unified Binary® technology
Performance portability
Linux, OS X, Windows
Visual Studio & Eclipse integration
PGI Accelerator Features
OpenACC C/C++/Fortran compilers
CUDA Fortran compiler
CUDA-x86
www.pgroup.com
5
6
The New HPC Node Architecture
7
PGI OpenACC Compilers
8
OpenACC
Open Programming Standard for Parallel Computing
“PGI OpenACC will enable programmers to easily develop portable applications that maximize the performance and power efficiency benefits of the hybrid CPU/GPU architecture of Titan.”
-- Buddy Bland, Titan Project Director, Oak Ridge National Lab

“OpenACC is a technically impressive initiative brought together by members of the OpenMP Working Group on Accelerators, as well as many others. We look forward to releasing a version of this proposal in the next release of OpenMP.”
-- Michael Wong, CEO OpenMP Directives Board
OpenACC Members
9
OpenACC Directives

[Diagram: OpenACC compiler directives offload the marked region from the CPU to the GPU]

Program myscience
   ... serial code ...
!$acc kernels
   do k = 1,n1
      do i = 1,n2
         ... parallel code ...
      enddo
   enddo
!$acc end kernels
   ...
End Program myscience

Portable compiler hints
Compiler parallelizes code
Designed for multicore CPUs & many-core GPUs / Accelerators
10
How Do OpenACC Compilers Work?

#pragma acc kernels loop
for( i = 0; i < nrows; ++i ){
    float val = 0.0f;
    for( d = 0; d < nzeros; ++d ){
        j = i + offset[d];
        if( j >= 0 && j < nrows )
            val += m[i+nrows*d] * v[j];
    }
    x[i] = val;
}

compile: the host code becomes x86 asm with calls into the PGI accelerator runtime:

matvec:
    subq    $328, %rsp
    ...
    call    __pgi_cu_alloc
    call    __pgi_cu_uploadx
    call    __pgi_cu_launch2
    call    __pgi_cu_downloadx
    call    __pgi_cu_free
    ...

plus GPU asm code for the loop body:

.entry matvec_14_gpu( ...
    .reg .u32 %r<70> ...
    cvt.s32.u32   %r1, %tid.x;
    mov.s32       %r2, 0;
    setp.ne.s32   $p1, %r1, %r2
    cvt.s32.u32   %r3, %ctaid.x;
    cvt.s32.u32   %r4, %ntid.x;
    mul.lo.s32    %r5, %r3, %r4;
    @%p1 bra      $Lt_0_258;
    st.shared.s32 [__i2s], %r5
$Lt_0_258:
    bar.sync      0;
    ...

Both are linked into a Unified Object, then executed
… no change to existing makefiles, scripts, IDEs, programming environment, etc.
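For readers who want to try this, the matvec excerpt above can be fleshed out into a complete program. The following is only a minimal sketch, not PGI's original source: the sizes, the tridiagonal offset pattern, the initialization, and the data clauses are illustrative additions, and the pgcc command in the comment is just one common way to build OpenACC code.

/* Self-contained sketch of the matvec excerpt above (diagonal sparse storage).
   Sizes, offsets, and data clauses are illustrative, not from the slide.
   Example build command: pgcc -acc -Minfo=accel matvec.c                      */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    int nrows = 10000, nzeros = 3;                 /* tridiagonal example */
    int offset[3] = { -1, 0, 1 };
    float *m = malloc((size_t)nrows * nzeros * sizeof(float));
    float *v = malloc(nrows * sizeof(float));
    float *x = malloc(nrows * sizeof(float));

    for (int i = 0; i < nrows * nzeros; ++i) m[i] = 1.0f;
    for (int i = 0; i < nrows; ++i) v[i] = 1.0f;

    #pragma acc kernels loop copyin(m[0:nrows*nzeros], v[0:nrows], offset[0:nzeros]) copyout(x[0:nrows])
    for (int i = 0; i < nrows; ++i) {
        float val = 0.0f;
        for (int d = 0; d < nzeros; ++d) {
            int j = i + offset[d];
            if (j >= 0 && j < nrows)
                val += m[i + nrows*d] * v[j];
        }
        x[i] = val;
    }

    printf("x[0] = %f  x[1] = %f\n", x[0], x[1]);  /* expect 2.0 and 3.0 */
    free(m); free(v); free(x);
    return 0;
}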
11
OpenACC Coding Example
#pragma acc data \
copy(b[0:n][0:m]) \
create(a[0:n][0:m])
{
for (iter = 1; iter <= p; ++iter){
#pragma acc kernels
{
for (i = 1; i < n-1; ++i){
for (j = 1; j < m-1; ++j){
a[i][j]=w0*b[i][j]+
w1*(b[i-1][j]+b[i+1][j]+
b[i][j-1]+b[i][j+1])+
w2*(b[i-1][j-1]+b[i-1][j+1]+
b[i+1][j-1]+b[i+1][j+1]);
} }
for( i = 1; i < n-1; ++i )
for( j = 1; j < m-1; ++j )
b[i][j] = a[i][j];
}
}
}
[Diagram: arrays A and B in Host Memory and GPU Memory]
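A complete, compilable version of the coding example above may help when experimenting. The sketch below is an illustrative reconstruction, not the original benchmark: the array sizes, weights, iteration count, and initialization are made-up values, and the build command is just one typical invocation.

/* Self-contained sketch of the smoothing example above.
   N, M, the weights, p, and the initialization are illustrative.
   Example build command: pgcc -acc -Minfo=accel smooth.c          */
#include <stdio.h>

#define N 512
#define M 512

static float a[N][M], b[N][M];

int main(void)
{
    const float w0 = 0.5f, w1 = 0.3f, w2 = 0.2f;
    const int p = 100;                     /* number of smoothing sweeps */

    for (int i = 0; i < N; ++i)            /* arbitrary initial data */
        for (int j = 0; j < M; ++j)
            b[i][j] = (float)(i + j);

    /* b is copied to and from the GPU once; a exists only on the GPU */
    #pragma acc data copy(b) create(a)
    {
        for (int iter = 1; iter <= p; ++iter) {
            #pragma acc kernels
            {
                for (int i = 1; i < N-1; ++i)
                    for (int j = 1; j < M-1; ++j)
                        a[i][j] = w0*b[i][j]
                                + w1*(b[i-1][j] + b[i+1][j] + b[i][j-1] + b[i][j+1])
                                + w2*(b[i-1][j-1] + b[i-1][j+1] + b[i+1][j-1] + b[i+1][j+1]);
                for (int i = 1; i < N-1; ++i)
                    for (int j = 1; j < M-1; ++j)
                        b[i][j] = a[i][j];
            }
        }
    }
    printf("b[1][1] = %f\n", b[1][1]);
    return 0;
}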
12
Matrix Multiply Source Code Size Comparison:

OpenACC version:

void
computeMM0_saxpy(float C[][WB], float A[][WA], float B[][WB],
                 int hA, int wA, int wB)
{
    #pragma acc region
    {
        #pragma acc for parallel vector(16) unroll(4)
        for (int i = 0; i < hA; ++i) {
            for (int j = 0; j < wB; ++j) {
                C[i][j] = 0.0;
            }
            for (int k = 0; k < wA; ++k) {
                for (int j = 0; j < wB; ++j) {
                    C[i][j] = C[i][j] + A[i][k]*B[k][j];
                }
            }
        }
    }
}

CUDA C version:

__global__ void matrixMul( float* C, float* A, float* B, int wA, int wB)
{
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int aBegin = wA * BLOCK_SIZE * by;
    int aEnd   = aBegin + wA - 1;
    int aStep  = BLOCK_SIZE;
    int bBegin = BLOCK_SIZE * bx;
    int bStep  = BLOCK_SIZE * wB;
    float Csub = 0;
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        AS(ty, tx) = A[a + wA * ty + tx];
        BS(ty, tx) = B[b + wB * ty + tx];
        __syncthreads();
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += AS(ty, k) * BS(k, tx);
        __syncthreads();
    }
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}

void
domatmul( float* C, float* A, float* B, unsigned int hA, unsigned int wA, unsigned wB )
{
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    unsigned int size_C = WC * HC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float *d_A, *d_B, *d_C;
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(WC / threads.x, HC / threads.y);
    cudaMalloc((void**) &d_A, mem_size_A);
    cudaMalloc((void**) &d_B, mem_size_B);
    cudaMalloc((void**) &d_C, mem_size_C);
    cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
    matrixMul<<< grid, threads >>>(d_C, d_A, d_B, WA, WB);
    cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}

OpenCL version:

void matrixMulGPU(cl_uint ciDeviceCount, cl_mem h_A, float* h_B_data,
                  unsigned int mem_size_B, float* h_C )
{
    cl_mem d_A[MAX_GPU_COUNT];
    cl_mem d_C[MAX_GPU_COUNT];
    cl_mem d_B[MAX_GPU_COUNT];
    cl_event GPUDone[MAX_GPU_COUNT];
    cl_event GPUExecution[MAX_GPU_COUNT];

    // Create buffers for each GPU
    // Each GPU will compute sizePerGPU rows of the result
    int sizePerGPU = HA / ciDeviceCount;
    int workOffset[MAX_GPU_COUNT];
    int workSize[MAX_GPU_COUNT];

    workOffset[0] = 0;
    for(unsigned int i=0; i < ciDeviceCount; ++i)
    {
        // Input buffer
        workSize[i] = (i != (ciDeviceCount - 1)) ? sizePerGPU : (HA - workOffset[i]);

        d_A[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float) * WA, NULL, NULL);

        // Copy only assigned rows from host to device
        clEnqueueCopyBuffer(commandQueue[i], h_A, d_A[i], workOffset[i] * sizeof(float) * WA,
                            0, workSize[i] * sizeof(float) * WA, 0, NULL, NULL);

        // Create OpenCL buffer on device that will be initialized from the host memory on first use on device
        d_B[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                mem_size_B, h_B_data, NULL);

        // Output buffer
        d_C[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY,
                                workSize[i] * WC * sizeof(float), NULL, NULL);

        // Set the args values
        clSetKernelArg(multiplicationKernel[i], 0, sizeof(cl_mem), (void *) &d_C[i]);
        clSetKernelArg(multiplicationKernel[i], 1, sizeof(cl_mem), (void *) &d_A[i]);
        clSetKernelArg(multiplicationKernel[i], 2, sizeof(cl_mem), (void *) &d_B[i]);
        clSetKernelArg(multiplicationKernel[i], 3, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, 0 );
        clSetKernelArg(multiplicationKernel[i], 4, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, 0 );

        if(i+1 < ciDeviceCount)
            workOffset[i + 1] = workOffset[i] + workSize[i];
    }
    // Execute Multiplication on all GPUs in parallel
    size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
    size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE, WC), shrRoundUp(BLOCK_SIZE, workSize[0])};
    // Launch kernels on devices
    for(unsigned int i = 0; i < ciDeviceCount; i++)
    {
        // Multiplication - non-blocking execution
        globalWorkSize[1] = shrRoundUp(BLOCK_SIZE, workSize[i]);
        clEnqueueNDRangeKernel(commandQueue[i], multiplicationKernel[i], 2, 0, globalWorkSize, localWorkSize,
                               0, NULL, &GPUExecution[i]);
    }
    for(unsigned int i = 0; i < ciDeviceCount; i++)
    {
        clFinish(commandQueue[i]);
    }
    for(unsigned int i = 0; i < ciDeviceCount; i++)
    {
        // Non-blocking copy of result from device to host
        clEnqueueReadBuffer(commandQueue[i], d_C[i], CL_FALSE, 0, WC * sizeof(float) * workSize[i],
                            h_C + workOffset[i] * WC, 0, NULL, &GPUDone[i]);
    }
    // CPU sync with GPU
    clWaitForEvents(ciDeviceCount, GPUDone);

    // Release mem and event objects
    for(unsigned int i = 0; i < ciDeviceCount; i++)
    {
        clReleaseMemObject(d_A[i]);
        clReleaseMemObject(d_C[i]);
        clReleaseMemObject(d_B[i]);
        clReleaseEvent(GPUExecution[i]);
        clReleaseEvent(GPUDone[i]);
    }
}

__kernel void
matrixMul( __global float* C, __global float* A, __global float* B,
           __local float* As, __local float* Bs)
{
    int bx = get_group_id(0), tx = get_local_id(0);
    int by = get_group_id(1), ty = get_local_id(1);
    int aEnd = WA * BLOCK_SIZE * by + WA - 1;
    float Csub = 0.0f;

    for (int a = WA*BLOCK_SIZE*by, b = BLOCK_SIZE * bx;
         a <= aEnd; a += BLOCK_SIZE, b += BLOCK_SIZE*WB) {
        As[tx + ty * BLOCK_SIZE] = A[a + WA * ty + tx];
        Bs[tx + ty * BLOCK_SIZE] = B[b + WB * ty + tx];
        barrier(CLK_LOCAL_MEM_FENCE);
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += As[k + ty * BLOCK_SIZE] * Bs[tx + k * BLOCK_SIZE];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub;
}
13
Accelerating SEISMIC_CPML from the University of Pau
Read this article online at www.pgroup.com/pginsider
14
SEISMIC_CPML Timings

Version            MPI Processes   OpenMP Threads   GPUs   Time (sec)   Approx. Programming Time (min)
Original MPI/OMP   2               4                0      951          --
ACC Steps 1/2      2               0                2      3100         10
ACC Step 3         2               0                2      550          60
ACC Step 4         2               0                2      124          120
ACC Step 5         2               0                2      120          120

System Info:
4-core Intel Core i7 920 running at 2.67 GHz
Includes 2 Tesla C2070 GPUs
Problem Size: 101x641x128

5x in 5 hours!
15
Typical Porting Experience with OpenACC Directives
16
Cloverleaf mini-App Performance
[Chart: run-time (log scale, lower is better) on benchmarks bm_short, bm, bm16_short, and bm16 for K20X CUDA, K20X OpenACC, dual-socket CPU C, and dual-socket CPU Fortran versions]
NVIDIA benchmarks: dual-socket Intel Xeon E5-2667
Cloverleaf is a US National Labs Trinity/Coral mini-app benchmark: https://github.com/Warwick-PCAV/CloverLeaf/wiki/Performance-Table
17
OpenACC: Performance with Less Effort
[Chart: words of code (0 to 20,000) for the OpenACC, CUDA, and OpenCL versions of Cloverleaf]
Cloverleaf: http://www.computer.org/csdl/proceedings/sccompanion/2012/4956/00/4956a465-abs.html
18
Why Use OpenACC Directives?
Productivity
Higher-level programming model
Similar to OpenMP, designed for Accelerated computing
Portability
Ignore directives, code is portable to the host
Portable across different types of Accelerators
Performance portability
Performance Feedback Information
Unique to compilers
Enables incremental porting and tuning
19
PGI CUDA Fortran
20
CUDA Fortran in a Nutshell
Host Code:

real, device, allocatable, dimension(:,:) :: Adev, Bdev, Cdev
. . .
allocate (Adev(N,M), Bdev(M,L), Cdev(N,L))
Adev = A(1:N,1:M)
Bdev = B(1:M,1:L)
call mm_kernel <<<dim3(N/16,L/16),dim3(16,16)>>> ( Adev, Bdev, Cdev, N, M, L )
C(1:N,1:L) = Cdev
deallocate ( Adev, Bdev, Cdev )
. . .

GPU Code:

attributes(global) subroutine mm_kernel( A, B, C, N, M, L )
   real :: A(N,M), B(M,L), C(N,L), Cij
   integer, value :: N, M, L
   integer :: i, j, kb, k, tx, ty
   real, shared :: Asub(16,16), Bsub(16,16)
   tx = threadidx%x
   ty = threadidx%y
   i = (blockidx%x-1) * 16 + tx
   j = (blockidx%y-1) * 16 + ty
   Cij = 0.0
   do kb = 1, M, 16
      Asub(tx,ty) = A(i,kb+ty-1)
      Bsub(tx,ty) = B(kb+tx-1,j)
      call syncthreads()
      do k = 1,16
         Cij = Cij + Asub(tx,k) * Bsub(k,ty)
      enddo
      call syncthreads()
   enddo
   C(i,j) = Cij
end subroutine mm_kernel
21
CUDA Fortran / OpenACC Interoperability
module mymod
real, dimension(:), allocatable, device :: xDev
end module
...
use mymod
...
allocate( xDev(n) ) ! allocates xDev in GPU memory
call init_kernel <<<dim3(n/128),dim3(128)>>> (xDev, n)
...
!$acc data copy( y(:) ) ! no need to copy xDev
...
!$acc kernels loop
do i = 1, n
y(i) = y(i) + a*xDev(i)
enddo
...
!$acc end data
22
CUDA Fortran Supports
Generic Interfaces and Overloading
use cublas
real(4), device, allocatable :: xd(:)
real(4) x(N)
call random_number(x)
! Alloc xd in device memory, copy x
! to xd, invoke overloaded isamax
allocate(xd(N))
xd = x
j = isamax(N,xd,1)
! On the host, same name
k = isamax(N,x,1)
module cublas
! isamax
interface isamax
integer function isamax &
(n, x, incx)
integer :: n, incx
real(4) :: x(*)
end function
integer function isamaxcu &
(n, x, incx) bind(c, &
name='cublasIsamax')
integer, value :: n, incx
real(4), device :: x(*)
end function
end interface
. . .
23
CUDA Fortran Supports Encapsulation
Isolate device data and accelerator kernel declarations in
Fortran modules
module mm
real, device, allocatable :: a(:)
real, device :: x, y(10)
real, constant :: c1, c2(10)
integer, device :: n
contains
attributes(global) subroutine s( b )
...
Partition source into sections written and maintained by
accelerator experts vs those evolved by science and engineering
domain experts
24
!$CUF Kernel Directives
Simplifies Kernel Creation
module madd_device_module
use cudafor
contains
subroutine madd_dev(a,b,c,sum,n1,n2)
real,dimension(:,:),device :: a,b,c
real :: sum
integer :: n1,n2
type(dim3) :: grid, block
!$cuf kernel do (2) <<<(*,*),(32,4)>>>
do j = 1,n2
do i = 1,n1
a(i,j) = b(i,j) + c(i,j)
sum = sum + a(i,j)
enddo
enddo
end subroutine
end module

Equivalent hand-written CUDA kernels:
module madd_device_module
use cudafor
implicit none
contains
attributes(global) subroutine madd_kernel(a,b,c,blocksum,n1,n2)
real, dimension(:,:) :: a,b,c
real, dimension(:) :: blocksum
integer, value :: n1,n2
integer :: i,j,tindex,tneighbor,bindex
real :: mysum
real, shared :: bsum(256)
! Do this thread's work
mysum = 0.0
do j = threadidx%y + (blockidx%y-1)*blockdim%y, n2, blockdim%y*griddim%y
do i = threadidx%x + (blockidx%x-1)*blockdim%x, n1, blockdim%x*griddim%x
a(i,j) = b(i,j) + c(i,j)
mysum = mysum + a(i,j) ! accumulates partial sum per thread
enddo
enddo
! Now add up all partial sums for the whole thread block
! Compute this thread's linear index in the thread block
! We assume 256 threads in the thread block
tindex = threadidx%x + (threadidx%y-1)*blockdim%x
! Store this thread's partial sum in the shared memory block
bsum(tindex) = mysum
call syncthreads()
! Accumulate all the partial sums for this thread block to a single value
tneighbor = 128
do while( tneighbor >= 1 )
if( tindex <= tneighbor ) &
bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor)
tneighbor = tneighbor / 2
call syncthreads()
enddo
! Store the partial sum for the thread block
bindex = blockidx%x + (blockidx%y-1)*griddim%x
if( tindex == 1 ) blocksum(bindex) = bsum(1)
end subroutine
! Add up partial sums for all thread blocks to a single cumulative sum
attributes(global) subroutine madd_sum_kernel(blocksum,dsum,nb)
real, dimension(:) :: blocksum
real :: dsum
integer, value :: nb
real, shared :: bsum(256)
integer :: tindex,tneighbor,i
! Again, we assume 256 threads in the thread block
! accumulate a partial sum for each thread
tindex = threadidx%x
bsum(tindex) = 0.0
do i = tindex, nb, blockdim%x
bsum(tindex) = bsum(tindex) + blocksum(i)
enddo
call syncthreads()
! This code is copied from the previous kernel
! Accumulate all the partial sums for this thread block to a single value
! Since there is only one thread block, this single value is the final result
tneighbor = 128
do while( tneighbor >= 1 )
if( tindex <= tneighbor ) &
bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor)
tneighbor = tneighbor / 2
call syncthreads()
enddo
if( tindex == 1 ) dsum = bsum(1)
end subroutine
subroutine madd_dev(a,b,c,dsum,n1,n2)
real, dimension(:,:), device :: a,b,c
real, device :: dsum
real, dimension(:), allocatable, device :: blocksum
integer :: n1,n2,nb
type(dim3) :: grid, block
integer :: r
! Compute grid/block size; block size must be 256 threads
grid = dim3((n1+31)/32, (n2+7)/8, 1)
block = dim3(32,8,1)
nb = grid%x * grid%y
allocate(blocksum(1:nb))
call madd_kernel<<< grid, block >>>(a,b,c,blocksum,n1,n2)
call madd_sum_kernel<<< 1, 256 >>>(blocksum,dsum,nb)
r = cudaThreadSynchronize() ! don't deallocate too early
deallocate(blocksum)
end subroutine
end module
25
26
PGI 2014 Multi-core x64 Highlights
Performance Tuning: OpenMP 75% faster than GCC
Comprehensive MPI support
Free PGI for your MacBook
27
Industry-leading Multi-core x86 Performance
Systems: AMD Piledriver 2p/32 cores; Intel Sandy Bridge 2p/16 cores (hyperthreading enabled)
Compilers: GCC 4.8.1, Intel 14.0.1, PGI 14.1
[Chart: SPECompG_base2012 relative performance, 0% to 200%]
SPECompG_base2012 relative performance as measured by The Portland Group during the week of July 29, 2013. The number of OpenMP threads was
set to match the number of cores on each system. SPEComp® is a registered trademark of the Standard Performance Evaluation Corporation (SPEC).
28
Comprehensive MPI Support
Debug, run, and profile with MVAPICH2, MPICH3, Open MPI, and SGI MPI
29
What is in Free PGI for OS X?
OpenMP 3.1 & auto-parallelizing Fortran 2003 and C99 compilers
Optimized for the latest multi-core x86-64 CPUs
Supported on Mountain Lion and Mavericks with Xcode 5
Includes a cmd-level parallel Fortran debugger
30
PGI Accelerator 2014 Highlights
Support for NVIDIA Tesla K40 and AMD Radeon GPUs
OpenACC 2.0 features and optimizations
CUDA Fortran and OpenACC debugging
31
OpenACC Performance Portability
Average Speed-up Across 16 Benchmarks on Tesla and Radeon
[Chart: speed-up (0 to 16x) on three platforms: CPU, GPU A, GPU B]
CPU results are one core of an Intel Core i7-3930 CPU @ 3.20GHz (Sandy Bridge).
32
PGI 14.1 OpenACC New Features
Accelerator-side procedure calls using the routine directive
Unstructured data lifetimes using enter data and exit data
Comprehensive host_data support in Fortran
declare create, declare device_resident
Fortran deviceptr data clause
Multidimensional dynamically allocated C/C++ arrays (see the sketch after this list)
OpenACC 2.0 API function calls
Calling of CUDA Fortran atomic functions in OpenACC regions
Tuning, tuning, tuning … driven by SPEC ACCEL bmks, COSMO, NIM,
Cloverleaf, WRF, Gaussian, proprietary customer codes, …
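As a concrete illustration of the multidimensional dynamically allocated C/C++ array support listed above, the following sketch puts a float** array directly in a data clause and indexes it as a[i][j] inside the compute region. The sizes, values, and build command are illustrative assumptions, not taken from the slides.

/* Sketch: dynamically allocated 2D C array in an OpenACC data clause.
   Example build command: pgcc -acc -Minfo=accel dyn2d.c              */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    int n = 1000, m = 1000;
    float **a = (float**)malloc(n * sizeof(float*));
    for (int i = 0; i < n; ++i)
        a[i] = (float*)malloc(m * sizeof(float));

    /* the compiler manages both the row pointers and the row data */
    #pragma acc parallel loop copyout(a[0:n][0:m])
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < m; ++j)
            a[i][j] = (float)(i + j);

    printf("a[2][3] = %f\n", a[2][3]);   /* expect 5.0 */
    for (int i = 0; i < n; ++i) free(a[i]);
    free(a);
    return 0;
}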
33
OpenACC Procedure Calls
Inlining required, single file:

void matvec( ... ){
    float s = 0.0;
    #pragma acc loop vector reduction(+:s)
    for(int j=0; j<n; ++j)
        s += a[i*n+j]*v[j];
    x[i] = s;
}

void test( ... ){
    #pragma acc parallel loop gang ...
    for(int i=0; i<n; ++i)
        matvec(a, v, x, i, n);
}

% pgcc -acc -Minline test.c

Separate compilation + link step:

#pragma acc routine vector
void matvec(...){
    float s = 0.0;
    #pragma acc loop vector reduction(+:s)
    for(int j=0; j<n; ++j)
        s += a[i*n+j]*v[j];
    x[i] = s;
}

void test(...){
    #pragma acc parallel loop gang ...
    for(int i=0; i<n; ++i)
        matvec(a, v, x, i, n);
}

% pgcc -acc -c matvec.c
% pgcc -acc -c test.c
% pgcc -acc test.o matvec.o

Simplifies porting, allows separate compilation and libraries of device-side procedures
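To make the separate-compilation case concrete, here is a fuller sketch with the argument lists and a main() filled in. The parameter names, sizes, and data clauses are illustrative additions (the slide elides them with "..."); the file split and build commands mirror the ones shown above.

/* ---- matvec.c ---- */
#pragma acc routine vector
void matvec(const float *a, const float *v, float *x, int i, int n)
{
    float s = 0.0f;
    #pragma acc loop vector reduction(+:s)
    for (int j = 0; j < n; ++j)
        s += a[i*n + j] * v[j];
    x[i] = s;
}

/* ---- test.c ---- */
#include <stdio.h>
#include <stdlib.h>

#pragma acc routine vector
extern void matvec(const float *a, const float *v, float *x, int i, int n);

int main(void)
{
    int n = 1024;
    float *a = malloc((size_t)n * n * sizeof(float));
    float *v = malloc(n * sizeof(float));
    float *x = malloc(n * sizeof(float));
    for (int i = 0; i < n*n; ++i) a[i] = 1.0f;
    for (int i = 0; i < n; ++i)   v[i] = 1.0f;

    /* each gang calls the vector routine for one row */
    #pragma acc parallel loop gang copyin(a[0:n*n], v[0:n]) copyout(x[0:n])
    for (int i = 0; i < n; ++i)
        matvec(a, v, x, i, n);

    printf("x[0] = %f\n", x[0]);   /* expect 1024.0 */
    return 0;
}

/* Build: pgcc -acc -c matvec.c ; pgcc -acc -c test.c ; pgcc -acc test.o matvec.o */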
34
OpenACC Unstructured Data Lifetimes
Initialization on host
alloc(){
x = (float*)malloc(...);
...
}
do_init(){
alloc();
for( i=0; i<n; ++i) x[i] = ...
}
int main(){
do_init();
#pragma acc data copy(x[0:n])
for( time=0; time<n; ++time)
process( time );
genoutput();
}
Initialization on device
alloc(){
x = (float*)malloc(...);
...
#pragma acc enter data create(x[0:n])
}
do_init(){
alloc();
#pragma acc parallel present(x[0:n])
for( i=0; i<n; ++i) x[i] = ...
}
int main(){
do_init();
#pragma acc data present(x[0:n])
for( time=0; time<n; ++time)
process( time );
#pragma acc exit data copyout(x[0:n])
genoutput();
}
Enables fine-tuning of data movement through dynamic data management
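For reference, the device-initialization pattern above can be assembled into a small complete program. This is a sketch under assumed sizes and a placeholder process() body, not code from the slide.

/* Self-contained sketch of unstructured data lifetimes with enter/exit data.
   Example build command: pgcc -acc -Minfo=accel lifetimes.c                 */
#include <stdio.h>
#include <stdlib.h>

static float *x;
static int n = 1 << 20;

void alloc(void)
{
    x = (float*)malloc(n * sizeof(float));
    #pragma acc enter data create(x[0:n])      /* device copy outlives this call */
}

void do_init(void)
{
    alloc();
    #pragma acc parallel loop present(x[0:n])
    for (int i = 0; i < n; ++i) x[i] = (float)i;
}

void process(int time)
{
    (void)time;
    #pragma acc parallel loop present(x[0:n])  /* data already resident, no copies */
    for (int i = 0; i < n; ++i) x[i] += 1.0f;
}

int main(void)
{
    do_init();
    for (int time = 0; time < 10; ++time)
        process(time);
    #pragma acc exit data copyout(x[0:n])      /* bring final values back to the host */
    printf("x[0] = %f\n", x[0]);               /* expect 10.0 */
    return 0;
}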
35
OpenACC Host_data Directive
cudaMalloc needed
void cudaproc(...){
cudakernel<<<n/128,128>>>( a, n );
}
void test(...){
#ifdef CUDA
cudaMalloc( &a, sizeof(float)*n );
#else
a = malloc( sizeof(float)*n );
#endif
...
#pragma acc parallel loop deviceptr(a)
for( i=0; i<n; ++i ) a[i] = ...
#ifdef CUDA
cudaproc( a, n );
#else
hostproc( a, n );
#endif
}
% nvcc -c cproc.cu
% pgcc -DCUDA -acc test.c cproc.o -Mcuda
host_data allows runtime test
void cudaproc(...){
cudakernel<<<n/128,128>>>( a, n );
}
void test(...){
a = malloc( sizeof(float)*n );
...
#pragma acc data copy(a[0:n])
{
#pragma acc parallel loop
for( i=0; i<n; ++i ) a[i] = ...
if( usingcuda){
#pragma acc host_data use_device(a)
cudaproc( a, n );
}else{
hostproc( a, n );
}
}
% nvcc -c cproc.cu
% pgcc -acc test.c cproc.o -Mcuda
Enhances interoperability of CUDA and OpenACC, enables use of GPUDirect
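The host_data idea can also be shown in a single self-contained file. In this sketch, cudaMemset stands in for the user's CUDA kernel from the slide (chosen because it needs no separate .cu file); the -Mcuda build flag and the exact include path are assumptions about a typical PGI setup.

/* Minimal illustration of host_data use_device: inside the directive,
   the name "a" refers to the device copy, so it can be handed directly
   to a CUDA runtime call. Assumed build: pgcc -acc -Mcuda host_data.c  */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

int main(void)
{
    int n = 1024;
    float *a = (float*)malloc(n * sizeof(float));
    for (int i = 0; i < n; ++i) a[i] = 1.0f;

    #pragma acc data copy(a[0:n])
    {
        #pragma acc host_data use_device(a)
        {
            cudaMemset(a, 0, n * sizeof(float));   /* zero the device copy */
        }
    }   /* data region ends: the zeroed device copy is copied back */

    printf("a[0] = %f\n", a[0]);   /* expect 0.0 */
    free(a);
    return 0;
}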
36
OpenACC Atomic Operations
miniMD contains a race condition
#pragma acc data copyout(f[0:3*nall])
copyin(x[0:3*nall], numneigh[0:nlocal],
neighbors[0:nlocal*maxneighs])
{
// clear force on own and ghost atoms
#pragma acc kernels loop
for(int i = 0; i < nall; i++) {
……
}
#pragma acc kernels loop independent
{
for(int i = 0; i < nlocal; i++) {
……
for(int k = 0; k < numneighs; k++) {
j = neighs[k];
……
if(GHOST_NEWTON || j < nlocal) {
f[3 * j + 0] -= delx * force;
f[3 * j + 1] -= dely * force;
f[3 * j + 2] -= delz * force;
}
Fails – requires a total re-write without support for the OpenACC 2.0 atomic directive, currently scheduled for PGI 14.4; in the meantime …
With OpenACC atomic
#pragma acc data copyout(f[0:3*nall])
copyin(x[0:3*nall], numneigh[0:nlocal],
neighbors[0:nlocal*maxneighs])
{
// clear force on own and ghost atoms
#pragma acc kernels loop
for(int i = 0; i < nall; i++) {
……
}
#pragma acc kernels loop independent
{
for(int i = 0; i < nlocal; i++) {
……
for(int k = 0; k < numneighs; k++) {
j = neighs[k];
……
if(GHOST_NEWTON || j < nlocal) {
#pragma acc atomic update
f[3 * j + 0] -= delx * force;
#pragma acc atomic update
f[3 * j + 1] -= dely * force;
#pragma acc atomic update
f[3 * j + 2] -= delz * force;
}
37
OpenACC Atomic Operations
miniMD contains a race condition
#pragma acc data copyout(f[0:3*nall])
copyin(x[0:3*nall], numneigh[0:nlocal],
neighbors[0:nlocal*maxneighs])
{
// clear force on own and ghost atoms
#pragma acc kernels loop
for(int i = 0; i < nall; i++) {
……
}
#pragma acc kernels loop independent
{
for(int i = 0; i < nlocal; i++) {
……
for(int k = 0; k < numneighs; k++) {
j = neighs[k];
……
if(GHOST_NEWTON || j < nlocal) {
f[3 * j + 0] -= delx * force;
f[3 * j + 1] -= dely * force;
f[3 * j + 2] -= delz * force;
}
Calling CUDA atomics in OpenACC regions
#pragma acc data copyout(f[0:3*nall])
copyin(x[0:3*nall], numneigh[0:nlocal],
neighbors[0:nlocal*maxneighs])
{
// clear force on own and ghost atoms
#pragma acc kernels loop
for(int i = 0; i < nall; i++) {
……
}
#pragma acc kernels loop independent
{
for(int i = 0; i < nlocal; i++) {
……
for(int k = 0; k < numneighs; k++) {
j = neighs[k];
……
if(GHOST_NEWTON || j < nlocal) {
//#pragma acc atomic update
atomicSubf(f[3 * j + 0], delx * force);
//#pragma acc atomic update
atomicSubf(f[3 * j + 1], dely * force);
//#pragma acc atomic update
atomicSubf(f[3 * j + 2], delz * force);
}
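For reference, here is a minimal self-contained use of the OpenACC 2.0 atomic update directive discussed on these two slides. The histogram, sizes, and build command are illustrative, and per the slide the directive itself was scheduled for PGI 14.4.

/* Many iterations update the same array element, so each update is atomic.
   Example build command: pgcc -acc -Minfo=accel atomic.c                   */
#include <stdio.h>

int main(void)
{
    int hist[16] = {0};
    int n = 1 << 20;

    #pragma acc parallel loop copy(hist)
    for (int i = 0; i < n; ++i) {
        int bin = i % 16;            /* many threads hit the same bin */
        #pragma acc atomic update
        hist[bin] += 1;
    }

    printf("hist[0] = %d\n", hist[0]);   /* expect 65536 */
    return 0;
}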
38
Debugging CUDA Fortran with Allinea DDT
39
Key Value-add of the PGI Compilers
PGI compilers enable performance-portable programming across CPU+Accelerator systems and minimize re-coding for each new HW advancement
40
41