Jeff Larkin, April 20, 2016
Performance Portability Through Descriptive Parallelism
2
My definition of performance portability
The same source code will run productively on a variety of different architectures.
3
Two Types of Portability
FUNCTIONAL PORTABILITY
The ability for a single code to run anywhere.
PERFORMANCE PORTABILITY
The ability for a single code to run well anywhere.
4
Performance Portability is Not
Best == Best
Optimal Everywhere
Optimal Anywhere?
5
I Assert
Descriptive Parallelism enables performance portable code.
6
Descriptive or Prescriptive Parallelism
Prescriptive
Programmer explicitly parallelizes the code, compiler obeys
Requires little/no analysis by the compiler
Substantially different architectures require different directives
Fairly consistent behavior between implementations
Descriptive
Compiler parallelizes the code with guidance from the programmer
Compiler must make decisions from available information
Compiler uses information from the programmer and heuristics about the architecture to make decisions
Quality of implementation greatly affects results
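For reference (an addition, not on the original slides): the directive variants on the following slides all start from the same serial Jacobi iteration, sketched below. The arrays A and Anew, the sizes n and m, and tol/iter_max are assumed to be set up by the surrounding program, and math.h/stdio.h are assumed to be included.
/* Minimal sketch of the serial Jacobi baseline (setup assumed as noted above). */
while ( error > tol && iter < iter_max )
{
  error = 0.0;
  for( int j = 1; j < n-1; j++ )
    for( int i = 1; i < m-1; i++ )
    {
      /* five-point stencil update */
      Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                          + A[j-1][i] + A[j+1][i] );
      error = fmax( error, fabs(Anew[j][i] - A[j][i]) );
    }
  for( int j = 1; j < n-1; j++ )
    for( int i = 1; i < m-1; i++ )
      A[j][i] = Anew[j][i];          /* copy back for the next sweep */
  if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}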
7
Prescribing vs. Describing
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
#pragma omp simd
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma acc parallel loop reduction(max:error)
for( int j = 1; j < n-1; j++) {
#pragma acc loop reduction(max:error)
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma acc parallel loop
for( int j = 1; j < n-1; j++) {
#pragma acc loop
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
8
Prescribing vs. Describing
(OpenMP CPU and OpenACC versions repeated from the previous slide; omitted here)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target
{
#pragma omp parallel for reduction(max:error)
for( int j = 1; j < n-1; j++) {
for( int i = 1; i < m-1; i++ ) {
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp parallel for
for( int j = 1; j < n-1; j++) {
for( int i = 1; i < m-1; i++ ) {
A[j][i] = Anew[j][i];
}
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
9
Prescribing vs. Describing
(code versions from the previous slides repeated; omitted here)
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for reduction(max:error)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
10
Prescribing vs. Describing
(code versions from the previous slides repeated; omitted here)
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for reduction(max:error) schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
11
Prescribing vs. Describing
(code versions from the previous slides repeated; omitted here)
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for reduction(max:error) collapse(2)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for collapse(2)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
12
Prescribing vs. Describing
(all panels on this slide repeat code from the previous slides; omitted here)
13
Prescribing vs. Describing
(code versions from the previous slides repeated; omitted here)
#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
error = 0.0;
#pragma omp target teams distribute parallel for reduction(max:error) collapse(2) schedule(static,1)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for collapse(2) schedule(static,1)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}
14
Execution Time (Smaller is Better)
[Bar chart; y-axis: execution time (seconds), smaller is better; speedup vs. the original CPU version shown per bar]
Original CPU: 1.00X
CPU Threaded: 5.12X
GPU Threaded (same directives, unoptimized): 0.12X (893 s)
GPU Teams: 1.01X
GPU Split: 2.47X
GPU Collapse: 0.96X
GPU Split Sched: 4.00X
GPU Collapse Sched: 17.42X
OpenACC (CPU): 4.96X
OpenACC (GPU): 23.03X
NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @3.00GHz – CLANG pre-3.8, PGI 16.3
15
Importance of Loop Scheduling
[Bar chart; y-axis: time (s); speedup vs. serial shown per bar]
Serial: 1.00X
OpenMP (CPU): 5.11X
OpenMP (CPU) interleaved: 1.07X
OpenMP (GPU): 0.96X
OpenMP (GPU) interleaved: 17.42X
NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @3.00GHz – CLANG pre-3.8, PGI 16.3
16
Importance of Loop Scheduling
[Bar chart; y-axis: time (s); speedup vs. serial shown per bar; compiler legend: ICC 15.0, CLANG]
Serial: 1.00X
OpenMP (CPU): 16.55X
OpenMP (CPU) interleaved: 2.36X
OpenMP (GPU): 1.64X
OpenMP (GPU) interleaved: 29.84X
NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @3.00GHz – Intel C Compiler 15.0, CLANG pre-3.8, PGI 16.3
17
Why Can’t OpenMP Do This? (1)
#pragma omp target teams distribute parallel for reduction(max:error) collapse(2) schedule(static,1)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for collapse(2) schedule(static,1)
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
The compiler can freely specify the default schedule, but it cannot add the collapse.
Without the collapse, the wrong loop is parallelized for coalescing on the GPU.
The default schedule is implementation defined.
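As an aside (not from the original slides): a rough sketch of why collapse(2) with schedule(static,1) gives coalesced accesses. The variables tid and nthreads are placeholders for the OpenMP thread id and team size; this illustrates only the iteration-to-thread mapping, not the actual runtime implementation.
/* Illustration only: how iterations are handed out under collapse(2) + schedule(static,1). */
long inner = m - 2;                           /* trip count of the i loop              */
long total = (long)(n - 2) * inner;           /* size of the collapsed iteration space */
for (long k = tid; k < total; k += nthreads)  /* static,1: chunks of one, round robin  */
{
  int j = 1 + (int)(k / inner);               /* recover the outer index               */
  int i = 1 + (int)(k % inner);               /* adjacent threads get adjacent i ...   */
  A[j][i] = Anew[j][i];                       /* ... and touch adjacent memory, so     */
}                                             /* loads/stores coalesce on the GPU      */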
18
Why Can’t OpenMP Do This? (2)
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for reduction(max:error) schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++)
{
#pragma omp parallel for schedule(static,1)
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
This parallelizes the right loop for coalescing on the GPU.
It makes no sense to distribute to teams on the CPU.
The default schedule is implementation defined.
19
Why Can’t OpenMP Do This? (3)
#pragma omp target teams distribute parallel for reduction(max:error) collapse(2) simd
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
+ A[j-1][i] + A[j+1][i]);
error = fmax( error, fabs(Anew[j][i] - A[j][i]));
}
}
#pragma omp target teams distribute parallel for collapse(2) simd
for( int j = 1; j < n-1; j++)
{
for( int i = 1; i < m-1; i++ )
{
A[j][i] = Anew[j][i];
}
}
The compiler can always choose 1 team, making this behave as a parallel for.
Maybe the compiler can treat SIMD as a coalescing hint?
Is this the new best practice?
The default schedule is implementation defined.
20
Performance Portability Results
[Bar chart; y-axis: speedup vs. a single CPU core; benchmarks: miniGhost (Mantevo), NEMO (Climate & Ocean), CLOVERLEAF (Physics); series: CPU: MPI + OpenMP*; PGI 15.10 compiler]
miniGhost: CPU: Intel Xeon E5-2698 v3, 2 sockets, 32 cores total; GPU: Tesla K80 (single GPU)
NEMO: each socket: Intel Xeon E5-2698 v3, 16 cores; GPU: NVIDIA K80, both GPUs
CLOVERLEAF: CPU: dual-socket Intel Xeon E5-2690 v2, 20 cores total; GPU: Tesla K80, both GPUs
* NEMO run used all-MPI
21
Performance Portability Results
[Same chart with a second series added: CPU: MPI + OpenACC; system, compiler, and NEMO all-MPI note as on the previous slide]
22
Performance Portability Results
[Same chart with a third series added: CPU + GPU: MPI + OpenACC, with a 30X annotation on the GPU results; system, compiler, and NEMO all-MPI note as on slide 20]
23
Challenge to the Community
OpenMP should additionally adopt a descriptive programming style.
It is possible to be performance portable by being descriptive first and prescriptive as necessary.
The community must adopt new best practices (parallel for isn't enough any more).
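To illustrate "descriptive first, prescriptive as necessary" (an addition, not from the original slides): an even more descriptive OpenACC form of the same Jacobi loops uses the kernels construct, letting the compiler choose the loop mapping and detect the reduction; gang/vector or scheduling clauses would only be layered on where profiling shows they are needed. A minimal sketch, assuming the same variables and includes as the earlier code:
#pragma acc data copy(A) create(Anew)
while ( error > tol && iter < iter_max )
{
  error = 0.0;
  #pragma acc kernels   /* compiler analyzes the loops and picks the mapping */
  {
    for( int j = 1; j < n-1; j++ )
      for( int i = 1; i < m-1; i++ )
      {
        Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                            + A[j-1][i] + A[j+1][i] );
        error = fmax( error, fabs(Anew[j][i] - A[j][i]) );
      }
    for( int j = 1; j < n-1; j++ )
      for( int i = 1; i < m-1; i++ )
        A[j][i] = Anew[j][i];
  }
  if(iter++ % 100 == 0) printf("%5d, %0.6f\n", iter, error);
}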

More Related Content

What's hot

Post-processing SAR images on Xeon Phi - a porting exercise
Post-processing SAR images on Xeon Phi - a porting exercisePost-processing SAR images on Xeon Phi - a porting exercise
Post-processing SAR images on Xeon Phi - a porting exerciseIntel IT Center
 
Intrinsics: Low-level engine development with Burst - Unite Copenhagen 2019
Intrinsics: Low-level engine development with Burst - Unite Copenhagen 2019 Intrinsics: Low-level engine development with Burst - Unite Copenhagen 2019
Intrinsics: Low-level engine development with Burst - Unite Copenhagen 2019 Unity Technologies
 
An introduction to ROP
An introduction to ROPAn introduction to ROP
An introduction to ROPSaumil Shah
 
Machine-Learning-based Performance Heuristics for Runtime CPU/GPU Selection
Machine-Learning-based Performance Heuristics for Runtime CPU/GPU SelectionMachine-Learning-based Performance Heuristics for Runtime CPU/GPU Selection
Machine-Learning-based Performance Heuristics for Runtime CPU/GPU SelectionAkihiro Hayashi
 
SPU Optimizations - Part 2
SPU Optimizations - Part 2SPU Optimizations - Part 2
SPU Optimizations - Part 2Naughty Dog
 
Machine-learning based performance heuristics for Runtime CPU/GPU Selection i...
Machine-learning based performance heuristics for Runtime CPU/GPU Selection i...Machine-learning based performance heuristics for Runtime CPU/GPU Selection i...
Machine-learning based performance heuristics for Runtime CPU/GPU Selection i...Akihiro Hayashi
 
Introduction to Chainer
Introduction to ChainerIntroduction to Chainer
Introduction to ChainerSeiya Tokui
 
Lab 3 nust control
Lab 3 nust controlLab 3 nust control
Lab 3 nust controltalhawaqar
 
Algorithm Selection for Preferred Extensions Enumeration
Algorithm Selection for Preferred Extensions EnumerationAlgorithm Selection for Preferred Extensions Enumeration
Algorithm Selection for Preferred Extensions EnumerationFederico Cerutti
 
Optimize your game with the Profile Analyzer - Unite Copenhagen 2019
Optimize your game with the Profile Analyzer - Unite Copenhagen 2019Optimize your game with the Profile Analyzer - Unite Copenhagen 2019
Optimize your game with the Profile Analyzer - Unite Copenhagen 2019Unity Technologies
 
Argumentation Extensions Enumeration as a Constraint Satisfaction Problem: a ...
Argumentation Extensions Enumeration as a Constraint Satisfaction Problem: a ...Argumentation Extensions Enumeration as a Constraint Satisfaction Problem: a ...
Argumentation Extensions Enumeration as a Constraint Satisfaction Problem: a ...Federico Cerutti
 
TMPA-2017: The Quest for Average Response Time
TMPA-2017: The Quest for Average Response TimeTMPA-2017: The Quest for Average Response Time
TMPA-2017: The Quest for Average Response TimeIosif Itkin
 
Yampa AFRP Introduction
Yampa AFRP IntroductionYampa AFRP Introduction
Yampa AFRP IntroductionChengHui Weng
 
A Simple Communication System Design Lab #4 with MATLAB Simulink
A Simple Communication System Design Lab #4 with MATLAB SimulinkA Simple Communication System Design Lab #4 with MATLAB Simulink
A Simple Communication System Design Lab #4 with MATLAB SimulinkJaewook. Kang
 
A SCC Recursive Meta-Algorithm for Computing Preferred Labellings in Abstract...
A SCC Recursive Meta-Algorithm for Computing Preferred Labellings in Abstract...A SCC Recursive Meta-Algorithm for Computing Preferred Labellings in Abstract...
A SCC Recursive Meta-Algorithm for Computing Preferred Labellings in Abstract...Federico Cerutti
 
Practical Spherical Harmonics Based PRT Methods
Practical Spherical Harmonics Based PRT MethodsPractical Spherical Harmonics Based PRT Methods
Practical Spherical Harmonics Based PRT MethodsNaughty Dog
 

What's hot (18)

Post-processing SAR images on Xeon Phi - a porting exercise
Post-processing SAR images on Xeon Phi - a porting exercisePost-processing SAR images on Xeon Phi - a porting exercise
Post-processing SAR images on Xeon Phi - a porting exercise
 
Intrinsics: Low-level engine development with Burst - Unite Copenhagen 2019
Intrinsics: Low-level engine development with Burst - Unite Copenhagen 2019 Intrinsics: Low-level engine development with Burst - Unite Copenhagen 2019
Intrinsics: Low-level engine development with Burst - Unite Copenhagen 2019
 
An introduction to ROP
An introduction to ROPAn introduction to ROP
An introduction to ROP
 
Machine-Learning-based Performance Heuristics for Runtime CPU/GPU Selection
Machine-Learning-based Performance Heuristics for Runtime CPU/GPU SelectionMachine-Learning-based Performance Heuristics for Runtime CPU/GPU Selection
Machine-Learning-based Performance Heuristics for Runtime CPU/GPU Selection
 
SPU Optimizations - Part 2
SPU Optimizations - Part 2SPU Optimizations - Part 2
SPU Optimizations - Part 2
 
Machine-learning based performance heuristics for Runtime CPU/GPU Selection i...
Machine-learning based performance heuristics for Runtime CPU/GPU Selection i...Machine-learning based performance heuristics for Runtime CPU/GPU Selection i...
Machine-learning based performance heuristics for Runtime CPU/GPU Selection i...
 
Introduction to Chainer
Introduction to ChainerIntroduction to Chainer
Introduction to Chainer
 
Lab 3 nust control
Lab 3 nust controlLab 3 nust control
Lab 3 nust control
 
Algorithm Selection for Preferred Extensions Enumeration
Algorithm Selection for Preferred Extensions EnumerationAlgorithm Selection for Preferred Extensions Enumeration
Algorithm Selection for Preferred Extensions Enumeration
 
Optimize your game with the Profile Analyzer - Unite Copenhagen 2019
Optimize your game with the Profile Analyzer - Unite Copenhagen 2019Optimize your game with the Profile Analyzer - Unite Copenhagen 2019
Optimize your game with the Profile Analyzer - Unite Copenhagen 2019
 
Argumentation Extensions Enumeration as a Constraint Satisfaction Problem: a ...
Argumentation Extensions Enumeration as a Constraint Satisfaction Problem: a ...Argumentation Extensions Enumeration as a Constraint Satisfaction Problem: a ...
Argumentation Extensions Enumeration as a Constraint Satisfaction Problem: a ...
 
Cerutti -- TAFA2013
Cerutti -- TAFA2013Cerutti -- TAFA2013
Cerutti -- TAFA2013
 
TMPA-2017: The Quest for Average Response Time
TMPA-2017: The Quest for Average Response TimeTMPA-2017: The Quest for Average Response Time
TMPA-2017: The Quest for Average Response Time
 
Yampa AFRP Introduction
Yampa AFRP IntroductionYampa AFRP Introduction
Yampa AFRP Introduction
 
A Simple Communication System Design Lab #4 with MATLAB Simulink
A Simple Communication System Design Lab #4 with MATLAB SimulinkA Simple Communication System Design Lab #4 with MATLAB Simulink
A Simple Communication System Design Lab #4 with MATLAB Simulink
 
A SCC Recursive Meta-Algorithm for Computing Preferred Labellings in Abstract...
A SCC Recursive Meta-Algorithm for Computing Preferred Labellings in Abstract...A SCC Recursive Meta-Algorithm for Computing Preferred Labellings in Abstract...
A SCC Recursive Meta-Algorithm for Computing Preferred Labellings in Abstract...
 
6. Implementation
6. Implementation6. Implementation
6. Implementation
 
Practical Spherical Harmonics Based PRT Methods
Practical Spherical Harmonics Based PRT MethodsPractical Spherical Harmonics Based PRT Methods
Practical Spherical Harmonics Based PRT Methods
 

Viewers also liked

Ebusiness Marketing Software
Ebusiness Marketing SoftwareEbusiness Marketing Software
Ebusiness Marketing SoftwareNextBee Media
 
Policy Development Process Infographic Portuguese
Policy Development Process Infographic PortuguesePolicy Development Process Infographic Portuguese
Policy Development Process Infographic PortugueseICANN
 
Realidad aumentada gc94
Realidad aumentada gc94Realidad aumentada gc94
Realidad aumentada gc94Gabriel Romero
 
Pazarlama Konferansları Mayıs Haziran-Temmuz 2016
Pazarlama Konferansları Mayıs Haziran-Temmuz 2016Pazarlama Konferansları Mayıs Haziran-Temmuz 2016
Pazarlama Konferansları Mayıs Haziran-Temmuz 2016Roy+Teddy
 
OpenMP Tutorial for Beginners
OpenMP Tutorial for BeginnersOpenMP Tutorial for Beginners
OpenMP Tutorial for BeginnersDhanashree Prasad
 
Motricidad gruesa
Motricidad gruesaMotricidad gruesa
Motricidad gruesalissethorbe
 
Actividad1 unidadiii mtsm
Actividad1 unidadiii mtsmActividad1 unidadiii mtsm
Actividad1 unidadiii mtsmTere Segura
 

Viewers also liked (14)

Ebusiness Marketing Software
Ebusiness Marketing SoftwareEbusiness Marketing Software
Ebusiness Marketing Software
 
Policy Development Process Infographic Portuguese
Policy Development Process Infographic PortuguesePolicy Development Process Infographic Portuguese
Policy Development Process Infographic Portuguese
 
Pump3.bas
Pump3.basPump3.bas
Pump3.bas
 
Presentation1
Presentation1Presentation1
Presentation1
 
Archivio62
Archivio62Archivio62
Archivio62
 
07.08.2006, LAW, Anti-Corruption Law
07.08.2006, LAW, Anti-Corruption Law07.08.2006, LAW, Anti-Corruption Law
07.08.2006, LAW, Anti-Corruption Law
 
Realidad aumentada gc94
Realidad aumentada gc94Realidad aumentada gc94
Realidad aumentada gc94
 
sas
sassas
sas
 
Pazarlama Konferansları Mayıs Haziran-Temmuz 2016
Pazarlama Konferansları Mayıs Haziran-Temmuz 2016Pazarlama Konferansları Mayıs Haziran-Temmuz 2016
Pazarlama Konferansları Mayıs Haziran-Temmuz 2016
 
Epoch powerpoint
Epoch powerpointEpoch powerpoint
Epoch powerpoint
 
OpenMP Tutorial for Beginners
OpenMP Tutorial for BeginnersOpenMP Tutorial for Beginners
OpenMP Tutorial for Beginners
 
Motricidad gruesa
Motricidad gruesaMotricidad gruesa
Motricidad gruesa
 
caffiene and theine
caffiene and theinecaffiene and theine
caffiene and theine
 
Actividad1 unidadiii mtsm
Actividad1 unidadiii mtsmActividad1 unidadiii mtsm
Actividad1 unidadiii mtsm
 

Similar to Performance Portability Through Descriptive Parallelism

Network lap pgms 7th semester
Network lap pgms 7th semesterNetwork lap pgms 7th semester
Network lap pgms 7th semesterDOSONKA Group
 
Bti1022 lab sheet 8
Bti1022 lab sheet 8Bti1022 lab sheet 8
Bti1022 lab sheet 8alish sha
 
Bti1022 lab sheet 8
Bti1022 lab sheet 8Bti1022 lab sheet 8
Bti1022 lab sheet 8alish sha
 
programs on arrays.pdf
programs on arrays.pdfprograms on arrays.pdf
programs on arrays.pdfsowmya koneru
 
check the modifed code now you will get all operations done.termin.pdf
check the modifed code now you will get all operations done.termin.pdfcheck the modifed code now you will get all operations done.termin.pdf
check the modifed code now you will get all operations done.termin.pdfangelfragranc
 
Scaladays 2011 - The Ease of Scalaz
Scaladays 2011 - The Ease of ScalazScaladays 2011 - The Ease of Scalaz
Scaladays 2011 - The Ease of ScalazHeiko Seeberger
 
MH prediction modeling and validation in r (2) classification 190709
MH prediction modeling and validation in r (2) classification 190709MH prediction modeling and validation in r (2) classification 190709
MH prediction modeling and validation in r (2) classification 190709Min-hyung Kim
 
From Javascript To Haskell
From Javascript To HaskellFrom Javascript To Haskell
From Javascript To Haskellujihisa
 
En nybegynners introduksjon til scalaz
En nybegynners introduksjon til scalazEn nybegynners introduksjon til scalaz
En nybegynners introduksjon til scalazTrond Marius Øvstetun
 
convert the following C code to Mips assembly with steps and comment.pdf
convert the following C code to Mips assembly with steps and comment.pdfconvert the following C code to Mips assembly with steps and comment.pdf
convert the following C code to Mips assembly with steps and comment.pdfmurtuzadahadwala3
 
Csci101 lect04 advanced_selection
Csci101 lect04 advanced_selectionCsci101 lect04 advanced_selection
Csci101 lect04 advanced_selectionElsayed Hemayed
 
Adding Statistical Functionality to the DATA Step with PROC FCMP
Adding Statistical Functionality to the DATA Step with PROC FCMPAdding Statistical Functionality to the DATA Step with PROC FCMP
Adding Statistical Functionality to the DATA Step with PROC FCMPJacques Rioux
 

Similar to Performance Portability Through Descriptive Parallelism (20)

Sorting Technique
Sorting TechniqueSorting Technique
Sorting Technique
 
Network lap pgms 7th semester
Network lap pgms 7th semesterNetwork lap pgms 7th semester
Network lap pgms 7th semester
 
Bti1022 lab sheet 8
Bti1022 lab sheet 8Bti1022 lab sheet 8
Bti1022 lab sheet 8
 
Bti1022 lab sheet 8
Bti1022 lab sheet 8Bti1022 lab sheet 8
Bti1022 lab sheet 8
 
Python 1 liners
Python 1 linersPython 1 liners
Python 1 liners
 
programs on arrays.pdf
programs on arrays.pdfprograms on arrays.pdf
programs on arrays.pdf
 
Martha
MarthaMartha
Martha
 
C programs
C programsC programs
C programs
 
PRACTICAL COMPUTING
PRACTICAL COMPUTINGPRACTICAL COMPUTING
PRACTICAL COMPUTING
 
VTU Data Structures Lab Manual
VTU Data Structures Lab ManualVTU Data Structures Lab Manual
VTU Data Structures Lab Manual
 
check the modifed code now you will get all operations done.termin.pdf
check the modifed code now you will get all operations done.termin.pdfcheck the modifed code now you will get all operations done.termin.pdf
check the modifed code now you will get all operations done.termin.pdf
 
Scaladays 2011 - The Ease of Scalaz
Scaladays 2011 - The Ease of ScalazScaladays 2011 - The Ease of Scalaz
Scaladays 2011 - The Ease of Scalaz
 
R Code for EM Algorithm
R Code for EM AlgorithmR Code for EM Algorithm
R Code for EM Algorithm
 
MH prediction modeling and validation in r (2) classification 190709
MH prediction modeling and validation in r (2) classification 190709MH prediction modeling and validation in r (2) classification 190709
MH prediction modeling and validation in r (2) classification 190709
 
From Javascript To Haskell
From Javascript To HaskellFrom Javascript To Haskell
From Javascript To Haskell
 
En nybegynners introduksjon til scalaz
En nybegynners introduksjon til scalazEn nybegynners introduksjon til scalaz
En nybegynners introduksjon til scalaz
 
convert the following C code to Mips assembly with steps and comment.pdf
convert the following C code to Mips assembly with steps and comment.pdfconvert the following C code to Mips assembly with steps and comment.pdf
convert the following C code to Mips assembly with steps and comment.pdf
 
Csci101 lect04 advanced_selection
Csci101 lect04 advanced_selectionCsci101 lect04 advanced_selection
Csci101 lect04 advanced_selection
 
Adding Statistical Functionality to the DATA Step with PROC FCMP
Adding Statistical Functionality to the DATA Step with PROC FCMPAdding Statistical Functionality to the DATA Step with PROC FCMP
Adding Statistical Functionality to the DATA Step with PROC FCMP
 
Cpds lab
Cpds labCpds lab
Cpds lab
 

More from Jeff Larkin

FortranCon2020: Highly Parallel Fortran and OpenACC Directives
FortranCon2020: Highly Parallel Fortran and OpenACC DirectivesFortranCon2020: Highly Parallel Fortran and OpenACC Directives
FortranCon2020: Highly Parallel Fortran and OpenACC DirectivesJeff Larkin
 
SC13: OpenMP and NVIDIA
SC13: OpenMP and NVIDIASC13: OpenMP and NVIDIA
SC13: OpenMP and NVIDIAJeff Larkin
 
Refactoring Applications for the XK7 and Future Hybrid Architectures
Refactoring Applications for the XK7 and Future Hybrid ArchitecturesRefactoring Applications for the XK7 and Future Hybrid Architectures
Refactoring Applications for the XK7 and Future Hybrid ArchitecturesJeff Larkin
 
Optimizing GPU to GPU Communication on Cray XK7
Optimizing GPU to GPU Communication on Cray XK7Optimizing GPU to GPU Communication on Cray XK7
Optimizing GPU to GPU Communication on Cray XK7Jeff Larkin
 
Progress Toward Accelerating CAM-SE
Progress Toward Accelerating CAM-SEProgress Toward Accelerating CAM-SE
Progress Toward Accelerating CAM-SEJeff Larkin
 
HPCMPUG2011 cray tutorial
HPCMPUG2011 cray tutorialHPCMPUG2011 cray tutorial
HPCMPUG2011 cray tutorialJeff Larkin
 
CUG2011 Introduction to GPU Computing
CUG2011 Introduction to GPU ComputingCUG2011 Introduction to GPU Computing
CUG2011 Introduction to GPU ComputingJeff Larkin
 
Maximizing Application Performance on Cray XT6 and XE6 Supercomputers DOD-MOD...
Maximizing Application Performance on Cray XT6 and XE6 Supercomputers DOD-MOD...Maximizing Application Performance on Cray XT6 and XE6 Supercomputers DOD-MOD...
Maximizing Application Performance on Cray XT6 and XE6 Supercomputers DOD-MOD...Jeff Larkin
 
May2010 hex-core-opt
May2010 hex-core-optMay2010 hex-core-opt
May2010 hex-core-optJeff Larkin
 
A Comparison of Accelerator Programming Models
A Comparison of Accelerator Programming ModelsA Comparison of Accelerator Programming Models
A Comparison of Accelerator Programming ModelsJeff Larkin
 
Cray XT Porting, Scaling, and Optimization Best Practices
Cray XT Porting, Scaling, and Optimization Best PracticesCray XT Porting, Scaling, and Optimization Best Practices
Cray XT Porting, Scaling, and Optimization Best PracticesJeff Larkin
 
XT Best Practices
XT Best PracticesXT Best Practices
XT Best PracticesJeff Larkin
 
Practical Examples for Efficient I/O on Cray XT Systems (CUG 2009)
Practical Examples for Efficient I/O on Cray XT Systems (CUG 2009)Practical Examples for Efficient I/O on Cray XT Systems (CUG 2009)
Practical Examples for Efficient I/O on Cray XT Systems (CUG 2009)Jeff Larkin
 

More from Jeff Larkin (13)

FortranCon2020: Highly Parallel Fortran and OpenACC Directives
FortranCon2020: Highly Parallel Fortran and OpenACC DirectivesFortranCon2020: Highly Parallel Fortran and OpenACC Directives
FortranCon2020: Highly Parallel Fortran and OpenACC Directives
 
SC13: OpenMP and NVIDIA
SC13: OpenMP and NVIDIASC13: OpenMP and NVIDIA
SC13: OpenMP and NVIDIA
 
Refactoring Applications for the XK7 and Future Hybrid Architectures
Refactoring Applications for the XK7 and Future Hybrid ArchitecturesRefactoring Applications for the XK7 and Future Hybrid Architectures
Refactoring Applications for the XK7 and Future Hybrid Architectures
 
Optimizing GPU to GPU Communication on Cray XK7
Optimizing GPU to GPU Communication on Cray XK7Optimizing GPU to GPU Communication on Cray XK7
Optimizing GPU to GPU Communication on Cray XK7
 
Progress Toward Accelerating CAM-SE
Progress Toward Accelerating CAM-SEProgress Toward Accelerating CAM-SE
Progress Toward Accelerating CAM-SE
 
HPCMPUG2011 cray tutorial
HPCMPUG2011 cray tutorialHPCMPUG2011 cray tutorial
HPCMPUG2011 cray tutorial
 
CUG2011 Introduction to GPU Computing
CUG2011 Introduction to GPU ComputingCUG2011 Introduction to GPU Computing
CUG2011 Introduction to GPU Computing
 
Maximizing Application Performance on Cray XT6 and XE6 Supercomputers DOD-MOD...
Maximizing Application Performance on Cray XT6 and XE6 Supercomputers DOD-MOD...Maximizing Application Performance on Cray XT6 and XE6 Supercomputers DOD-MOD...
Maximizing Application Performance on Cray XT6 and XE6 Supercomputers DOD-MOD...
 
May2010 hex-core-opt
May2010 hex-core-optMay2010 hex-core-opt
May2010 hex-core-opt
 
A Comparison of Accelerator Programming Models
A Comparison of Accelerator Programming ModelsA Comparison of Accelerator Programming Models
A Comparison of Accelerator Programming Models
 
Cray XT Porting, Scaling, and Optimization Best Practices
Cray XT Porting, Scaling, and Optimization Best PracticesCray XT Porting, Scaling, and Optimization Best Practices
Cray XT Porting, Scaling, and Optimization Best Practices
 
XT Best Practices
XT Best PracticesXT Best Practices
XT Best Practices
 
Practical Examples for Efficient I/O on Cray XT Systems (CUG 2009)
Practical Examples for Efficient I/O on Cray XT Systems (CUG 2009)Practical Examples for Efficient I/O on Cray XT Systems (CUG 2009)
Practical Examples for Efficient I/O on Cray XT Systems (CUG 2009)
 

Recently uploaded

Artificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsArtificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsJoaquim Jorge
 
Boost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityBoost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityPrincipled Technologies
 
Top 5 Benefits OF Using Muvi Live Paywall For Live Streams
Top 5 Benefits OF Using Muvi Live Paywall For Live StreamsTop 5 Benefits OF Using Muvi Live Paywall For Live Streams
Top 5 Benefits OF Using Muvi Live Paywall For Live StreamsRoshan Dwivedi
 
2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...Martijn de Jong
 
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers:  A Deep Dive into Serverless Spatial Data and FMECloud Frontiers:  A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FMESafe Software
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerThousandEyes
 
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot TakeoffStrategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoffsammart93
 
Top 10 Most Downloaded Games on Play Store in 2024
Top 10 Most Downloaded Games on Play Store in 2024Top 10 Most Downloaded Games on Play Store in 2024
Top 10 Most Downloaded Games on Play Store in 2024SynarionITSolutions
 
Apidays New York 2024 - Scaling API-first by Ian Reasor and Radu Cotescu, Adobe
Apidays New York 2024 - Scaling API-first by Ian Reasor and Radu Cotescu, AdobeApidays New York 2024 - Scaling API-first by Ian Reasor and Radu Cotescu, Adobe
Apidays New York 2024 - Scaling API-first by Ian Reasor and Radu Cotescu, Adobeapidays
 
Automating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps ScriptAutomating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps Scriptwesley chun
 
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...apidays
 
Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024The Digital Insurer
 
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost Saving
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost SavingRepurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost Saving
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost SavingEdi Saputra
 
Data Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonData Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonAnna Loughnan Colquhoun
 
presentation ICT roal in 21st century education
presentation ICT roal in 21st century educationpresentation ICT roal in 21st century education
presentation ICT roal in 21st century educationjfdjdjcjdnsjd
 
A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)Gabriella Davis
 
Partners Life - Insurer Innovation Award 2024
Partners Life - Insurer Innovation Award 2024Partners Life - Insurer Innovation Award 2024
Partners Life - Insurer Innovation Award 2024The Digital Insurer
 
Why Teams call analytics are critical to your entire business
Why Teams call analytics are critical to your entire businessWhy Teams call analytics are critical to your entire business
Why Teams call analytics are critical to your entire businesspanagenda
 
Powerful Google developer tools for immediate impact! (2023-24 C)
Powerful Google developer tools for immediate impact! (2023-24 C)Powerful Google developer tools for immediate impact! (2023-24 C)
Powerful Google developer tools for immediate impact! (2023-24 C)wesley chun
 

Recently uploaded (20)

Artificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsArtificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and Myths
 
Boost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityBoost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivity
 
+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
 
Top 5 Benefits OF Using Muvi Live Paywall For Live Streams
Top 5 Benefits OF Using Muvi Live Paywall For Live StreamsTop 5 Benefits OF Using Muvi Live Paywall For Live Streams
Top 5 Benefits OF Using Muvi Live Paywall For Live Streams
 
2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...
 
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers:  A Deep Dive into Serverless Spatial Data and FMECloud Frontiers:  A Deep Dive into Serverless Spatial Data and FME
Cloud Frontiers: A Deep Dive into Serverless Spatial Data and FME
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected Worker
 
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot TakeoffStrategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
 
Top 10 Most Downloaded Games on Play Store in 2024
Top 10 Most Downloaded Games on Play Store in 2024Top 10 Most Downloaded Games on Play Store in 2024
Top 10 Most Downloaded Games on Play Store in 2024
 
Apidays New York 2024 - Scaling API-first by Ian Reasor and Radu Cotescu, Adobe
Apidays New York 2024 - Scaling API-first by Ian Reasor and Radu Cotescu, AdobeApidays New York 2024 - Scaling API-first by Ian Reasor and Radu Cotescu, Adobe
Apidays New York 2024 - Scaling API-first by Ian Reasor and Radu Cotescu, Adobe
 
Automating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps ScriptAutomating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps Script
 
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
 
Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024
 
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost Saving
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost SavingRepurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost Saving
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost Saving
 
Data Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonData Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt Robison
 
presentation ICT roal in 21st century education
presentation ICT roal in 21st century educationpresentation ICT roal in 21st century education
presentation ICT roal in 21st century education
 
A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)
 
Partners Life - Insurer Innovation Award 2024
Partners Life - Insurer Innovation Award 2024Partners Life - Insurer Innovation Award 2024
Partners Life - Insurer Innovation Award 2024
 
Why Teams call analytics are critical to your entire business
Why Teams call analytics are critical to your entire businessWhy Teams call analytics are critical to your entire business
Why Teams call analytics are critical to your entire business
 
Powerful Google developer tools for immediate impact! (2023-24 C)
Powerful Google developer tools for immediate impact! (2023-24 C)Powerful Google developer tools for immediate impact! (2023-24 C)
Powerful Google developer tools for immediate impact! (2023-24 C)
 

Performance Portability Through Descriptive Parallelism

  • 1. Jeff Larkin, April 20, 2016 Performance Portability Through Descriptive Parallelism
  • 2. 2 My definition of performance portability The same source code will run productively on a variety of different architectures.
  • 3. 3 Two Types of Portability FUNCTIONAL PORTABILITY The ability for a single code to run anywhere. PERFORMANCE PORTABILITY The ability for a single code to run well anywhere.
  • 4. 4 Performance Portability is Not Best == Best Optimal Everywhere Optimal Anywhere?
  • 5. 5 I Assert Descriptive Parallelism enables performance portable code.
  • 6. 6 Descriptive or Prescriptive Parallelism Programmer explicitly parallelizes the code, compiler obeys Requires little/no analysis by the compiler Substantially different architectures require different directives Fairly consistent behavior between implementations Prescriptive Compiler parallelizes the code with guidance from the programmer Compiler must make decisions from available information Compiler uses information from the programmer and heuristics about the architecture to make decisions Quality of implementation greatly affects results. Descriptive
  • 7. 7 Prescribing vs. Describing while ( error > tol && iter < iter_max ) { error = 0.0; #pragma omp parallel for reduction(max:error) for( int j = 1; j < n-1; j++) { #pragma omp simd for( int i = 1; i < m-1; i++ ) { Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i]); error = fmax( error, fabs(Anew[j][i] - A[j][i])); } } #pragma omp parallel for for( int j = 1; j < n-1; j++) { #pragma omp simd for( int i = 1; i < m-1; i++ ) { A[j][i] = Anew[j][i]; } } if(iter++ % 100 == 0) printf("%5d, %0.6fn", iter, error); } while ( error > tol && iter < iter_max ) { error = 0.0; #pragma acc parallel loop reduction(max:error) for( int j = 1; j < n-1; j++) { #pragma acc loop reduction(max:error) for( int i = 1; i < m-1; i++ ) { Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i]); error = fmax( error, fabs(Anew[j][i] - A[j][i])); } } #pragma acc parallel loop for( int j = 1; j < n-1; j++) { #pragma acc loop for( int i = 1; i < m-1; i++ ) { A[j][i] = Anew[j][i]; } } if(iter++ % 100 == 0) printf("%5d, %0.6fn", iter, error); }
  • 8. 8 Prescribing vs. Describing while ( error > tol && iter < iter_max ) { error = 0.0; #pragma omp parallel for reduction(max:error) for( int j = 1; j < n-1; j++) { #pragma omp simd for( int i = 1; i < m-1; i++ ) { Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i]); error = fmax( error, fabs(Anew[j][i] - A[j][i])); } } #pragma omp parallel for for( int j = 1; j < n-1; j++) { #pragma omp simd for( int i = 1; i < m-1; i++ ) { A[j][i] = Anew[j][i]; } } if(iter++ % 100 == 0) printf("%5d, %0.6fn", iter, error); } while ( error > tol && iter < iter_max ) { error = 0.0; #pragma acc parallel loop reduction(max:error) for( int j = 1; j < n-1; j++) { #pragma acc loop reduction(max:error) for( int i = 1; i < m-1; i++ ) { Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i]); error = fmax( error, fabs(Anew[j][i] - A[j][i])); } } #pragma acc parallel loop for( int j = 1; j < n-1; j++) { #pragma acc loop for( int i = 1; i < m-1; i++ ) { A[j][i] = Anew[j][i]; } } if(iter++ % 100 == 0) printf("%5d, %0.6fn", iter, error); } while ( error > tol && iter < iter_max ) { error = 0.0; #pragma omp target { #pragma omp parallel for reduction(max:error) for( int j = 1; j < n-1; j++) { for( int i = 1; i < m-1; i++ ) { Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i]); error = fmax( error, fabs(Anew[j][i] - A[j][i])); } } #pragma omp parallel for for( int j = 1; j < n-1; j++) { for( int i = 1; i < m-1; i++ ) { A[j][i] = Anew[j][i]; } } } if(iter++ % 100 == 0) printf("%5d, %0.6fn", iter, error); }
  • 9. Prescribing vs. Describing (continued). The previous panels are repeated; a new panel adds a target data region and teams distribute parallel for:

#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
  error = 0.0;
  #pragma omp target teams distribute parallel for reduction(max:error)
  for( int j = 1; j < n-1; j++) {
    for( int i = 1; i < m-1; i++ ) {
      Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                          + A[j-1][i] + A[j+1][i]);
      error = fmax( error, fabs(Anew[j][i] - A[j][i]));
    }
  }
  #pragma omp target teams distribute parallel for
  for( int j = 1; j < n-1; j++) {
    for( int i = 1; i < m-1; i++ ) {
      A[j][i] = Anew[j][i];
    }
  }
(panel truncated on the slide)
  • 10. Prescribing vs. Describing (continued). The previous panels are repeated; a new panel splits the loops, distributing the outer loop to teams and interleaving the inner loop across threads:

#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
  error = 0.0;
  #pragma omp target teams distribute
  for( int j = 1; j < n-1; j++) {
    #pragma omp parallel for reduction(max:error) schedule(static,1)
    for( int i = 1; i < m-1; i++ ) {
      Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                          + A[j-1][i] + A[j+1][i]);
      error = fmax( error, fabs(Anew[j][i] - A[j][i]));
    }
  }
  #pragma omp target teams distribute
  for( int j = 1; j < n-1; j++) {
    #pragma omp parallel for schedule(static,1)
    for( int i = 1; i < m-1; i++ ) {
(panel truncated on the slide)
  • 11. Prescribing vs. Describing (continued). The previous panels are repeated; a new panel collapses the loop nest into a single distributed, parallel loop:

#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
  error = 0.0;
  #pragma omp target teams distribute parallel for reduction(max:error) collapse(2)
  for( int j = 1; j < n-1; j++) {
    for( int i = 1; i < m-1; i++ ) {
      Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                          + A[j-1][i] + A[j+1][i]);
      error = fmax( error, fabs(Anew[j][i] - A[j][i]));
    }
  }
  #pragma omp target teams distribute parallel for collapse(2)
  for( int j = 1; j < n-1; j++) {
(panel truncated on the slide)
  • 12. Prescribing vs. Describing (continued). Repeats the panels from slides 7 through 11, ending with the split teams distribute / parallel for schedule(static,1) version from slide 10 (truncated on the slide).
  • 13. Prescribing vs. Describing (continued). The previous panels are repeated; the final panel combines the collapse with an interleaved schedule:

#pragma omp target data map(alloc:Anew) map(A)
while ( error > tol && iter < iter_max )
{
  error = 0.0;
  #pragma omp target teams distribute parallel for reduction(max:error) collapse(2) schedule(static,1)
  for( int j = 1; j < n-1; j++) {
    for( int i = 1; i < m-1; i++ ) {
      Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                          + A[j-1][i] + A[j+1][i]);
      error = fmax( error, fabs(Anew[j][i] - A[j][i]));
    }
  }
  #pragma omp target teams distribute parallel for collapse(2) schedule(static,1)
(panel truncated on the slide)
  • 14. Execution Time (Smaller is Better). Bar chart of execution time in seconds; speedups relative to the original serial code: Original 1.00X, CPU Threaded 5.12X, GPU Threaded 0.12X (bar labeled 893, off the chart scale), GPU Teams 1.01X, GPU Split 2.47X, GPU Collapse 0.96X, GPU Split Sched 4.00X, GPU Collapse Sched 17.42X, OpenACC (CPU) 4.96X, OpenACC (GPU) 23.03X. Chart annotation: "Same directives (unoptimized)". NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @ 3.00GHz; CLANG pre-3.8, PGI 16.3.
  • 15. Importance of Loop Scheduling. Time in seconds; speedups vs serial: Serial 1.00X, OpenMP (CPU) 5.11X, OpenMP (CPU) interleaved 1.07X, OpenMP (GPU) 0.96X, OpenMP (GPU) interleaved 17.42X. NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @ 3.00GHz; CLANG pre-3.8, PGI 16.3.
  • 16. Importance of Loop Scheduling (continued). The same comparison: Serial 1.00X, OpenMP (CPU) 16.55X, OpenMP (CPU) interleaved 2.36X, OpenMP (GPU) 1.64X, OpenMP (GPU) interleaved 29.84X (chart legend distinguishes ICC 15.0 and CLANG results). NVIDIA Tesla K40, Intel Xeon E5-2690 v2 @ 3.00GHz; Intel C Compiler 15.0, CLANG pre-3.8, PGI 16.3.
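The charts do not spell out what "interleaved" means; assuming it refers to the schedule(static,1) distribution used on slides 10 through 13, here is a minimal sketch (my addition) of the two inner-loop schedules and why each favors a different architecture:

#include <stddef.h>

/* Minimal sketch, my addition: copy one row of a grid two ways.
   Array shapes and function names are illustrative, not from the slides. */
void copy_row_blocked(int m, int j, double A[][4096], double Anew[][4096])
{
    /* Default (blocked) static schedule: each thread gets a contiguous
       chunk of i, which is cache-friendly on a CPU but maps to strided,
       uncoalesced accesses when the threads are GPU lanes. */
    #pragma omp parallel for schedule(static)
    for (int i = 1; i < m-1; i++)
        A[j][i] = Anew[j][i];
}

void copy_row_interleaved(int m, int j, double A[][4096], double Anew[][4096])
{
    /* Interleaved schedule: consecutive i values go to consecutive threads,
       so adjacent GPU threads touch adjacent memory (coalesced), while each
       CPU thread now strides through memory and loses cache locality. */
    #pragma omp parallel for schedule(static,1)
    for (int i = 1; i < m-1; i++)
        A[j][i] = Anew[j][i];
}

This matches the measurements above: interleaving helps the GPU and hurts the CPU, which is exactly the conflict a single prescriptive schedule cannot resolve.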
  • 17. Why Can't OpenMP Do This? (1)

#pragma omp target teams distribute parallel for reduction(max:error) collapse(2) schedule(static,1)
for( int j = 1; j < n-1; j++) {
  for( int i = 1; i < m-1; i++ ) {
    Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                        + A[j-1][i] + A[j+1][i]);
    error = fmax( error, fabs(Anew[j][i] - A[j][i]));
  }
}
#pragma omp target teams distribute parallel for collapse(2) schedule(static,1)
for( int j = 1; j < n-1; j++) {
  for( int i = 1; i < m-1; i++ ) {
    A[j][i] = Anew[j][i];
  }
}

Callouts: the compiler can freely specify the default schedule, but not add the collapse; wrong loop for coalescing on the GPU; the default schedule is implementation defined.
  • 18. Why Can't OpenMP Do This? (2)

#pragma omp target teams distribute
for( int j = 1; j < n-1; j++) {
  #pragma omp parallel for reduction(max:error) schedule(static,1)
  for( int i = 1; i < m-1; i++ ) {
    Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                        + A[j-1][i] + A[j+1][i]);
    error = fmax( error, fabs(Anew[j][i] - A[j][i]));
  }
}
#pragma omp target teams distribute
for( int j = 1; j < n-1; j++) {
  #pragma omp parallel for schedule(static,1)
  for( int i = 1; i < m-1; i++ ) {
    A[j][i] = Anew[j][i];
  }
}

Callouts: the right loop for coalescing on the GPU; it makes no sense to distribute to teams on the CPU; the default schedule is implementation defined.
  • 19. Why Can't OpenMP Do This? (3)

#pragma omp target teams distribute parallel for simd reduction(max:error) collapse(2)
for( int j = 1; j < n-1; j++) {
  for( int i = 1; i < m-1; i++ ) {
    Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                        + A[j-1][i] + A[j+1][i]);
    error = fmax( error, fabs(Anew[j][i] - A[j][i]));
  }
}
#pragma omp target teams distribute parallel for simd collapse(2)
for( int j = 1; j < n-1; j++) {
  for( int i = 1; i < m-1; i++ ) {
    A[j][i] = Anew[j][i];
  }
}

Callouts: the compiler can always choose one team, making this behave as a plain parallel for; maybe the compiler can treat simd as a coalescing hint? Is this the new best practice? The default schedule is implementation defined.
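For comparison, a minimal OpenACC sketch (my addition, not from the deck) of the same stencil expressed descriptively: gang and vector are hints about levels of parallelism, and the compiler decides how, or whether, to map them on a given target, which is the role the slide suggests simd might play as a coalescing hint.

#include <math.h>

/* Sketch only: the stencil sweep written descriptively in OpenACC. */
double jacobi_sweep_acc(int n, int m, double A[n][m], double Anew[n][m])
{
    double error = 0.0;
    /* gang/vector describe levels of parallelism; the compiler chooses the
       concrete mapping (e.g. thread blocks and threads on a GPU, cores and
       SIMD lanes on a CPU). */
    #pragma acc parallel loop gang reduction(max:error)
    for (int j = 1; j < n-1; j++) {
        #pragma acc loop vector reduction(max:error)
        for (int i = 1; i < m-1; i++) {
            Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                                + A[j-1][i] + A[j+1][i]);
            error = fmax(error, fabs(Anew[j][i] - A[j][i]));
        }
    }
    return error;
}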
  • 20. Performance Portability Results. Bar chart of speedup vs a single CPU core for miniGhost (Mantevo), NEMO (Climate & Ocean), and CLOVERLEAF (Physics), showing the CPU: MPI + OpenMP* baseline. PGI 15.10 compiler. Systems: miniGhost CPU: Intel Xeon E5-2698 v3, 2 sockets, 32 cores total, GPU: Tesla K80 (single GPU); NEMO: each socket Intel Xeon E5-2698 v3, 16 cores, GPU: NVIDIA K80 (both GPUs); CLOVERLEAF: dual-socket Intel Xeon E5-2690 v2, 20 cores total, GPU: Tesla K80 (both GPUs). *The NEMO run used all-MPI.
  • 21. Performance Portability Results (continued). The same chart with a second series added: CPU: MPI + OpenACC. (Systems and footnote as on slide 20.)
  • 22. Performance Portability Results (continued). The same chart with a third series added: CPU + GPU: MPI + OpenACC, with one bar annotated 30X (beyond the chart's 0-12 scale). (Systems and footnote as on slide 20.)
  • 23. Challenge to the Community. OpenMP should additionally adopt a descriptive programming style. It is possible to be performance portable by being descriptive first and prescriptive only as necessary. The community must adopt new best practices: parallel for isn't enough anymore.
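As a concrete illustration of "descriptive first, prescriptive as necessary" (my sketch, not from the deck): OpenACC lets the code stay descriptive by default and attach architecture-specific tuning only where measurement shows it helps, via device_type clauses. The vector_length(128) value below is an illustrative choice, not a recommendation from the talk.

#include <math.h>

/* Sketch only: descriptive parallel loop, with a tuning clause that
   applies only when compiling for NVIDIA targets; other targets see
   nothing more than "this loop nest is parallel". */
double jacobi_sweep(int n, int m, double A[n][m], double Anew[n][m])
{
    double error = 0.0;
    #pragma acc parallel loop reduction(max:error) \
                device_type(nvidia) vector_length(128)
    for (int j = 1; j < n-1; j++) {
        #pragma acc loop reduction(max:error)
        for (int i = 1; i < m-1; i++) {
            Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1]
                                + A[j-1][i] + A[j+1][i]);
            error = fmax(error, fabs(Anew[j][i] - A[j][i]));
        }
    }
    return error;
}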