SlideShare une entreprise Scribd logo
1  sur  22
Télécharger pour lire hors ligne
VECTORIZATION ON X86
Roberto A. Vitillo (LBNL)

10/23/2013 Software & Computing workshop
Why do we care?

2.6 GHz SB-EP CPU, 8 cores	

8 add + 8 mul FLOPs/cycle/core

Theoretical performance in
GFLOPS

Theoretical performance /
maximum

multi-threaded and vectorized

166.4

100%

multi-threaded and not
vectorized

20.8

12.5%

serial and vectorized

20.8

12.5%

serial and not vectorized

2.6

1.6%

2
Single Instruction, Multiple Data	

•

processor throughput is increased by handling multiple data in parallel
(MMX, SSE, AVX, ...)	


•

vector of data is packed in one large register and handled in one operation	


•

SIMD instructions are energy-efficient	


•

exploiting SIMD is even more of importance for the Xeon PHI and future
architectures

3

blogs.intel.com
20

	

RT
DA 13

void multiplyMatrices(Float32x4List A, Float32x4List B, Float32x4List R) {
var a0 = A[0];
var a1 = A[1];
var a2 = A[2];
var a3 = A[3];

Vector sizes keep increasing but
programmers continue to use simple
abstractions of hardware registers

var b0
R[0] =
var b1
R[1] =
var b2
R[2] =
var b3
R[3] =

= B[0];
b0.xxxx
= B[1];
b1.xxxx
= B[2];
b2.xxxx
= B[3];
b3.xxxx

* a0 + b0.yyyy * a1 + b0.zzzz * a2 + b0.wwww * a3;
* a0 + b1.yyyy * a1 + b1.zzzz * a2 + b1.wwww * a3;
* a0 + b2.yyyy * a1 + b2.zzzz * a2 + b2.wwww * a3;
* a0 + b3.yyyy * a1 + b3.zzzz * a2 + b3.wwww * a3;

}

#ifdef ID_WIN_X86_SSE2_INTRIN

!
const __m128 mask_keep_last = __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) );
!

__m128
__m128
__m128
__m128
__m128
__m128

m2a0
m2b0
m2c0
m2a1
m2b1
m2c1

__m128
__m128
__m128
__m128
__m128

tj0
tk0
tl0
tj1
tk1

=
=
=
=
=
=
=
=
=
=
=

_mm_load_ps(
_mm_load_ps(
_mm_load_ps(
_mm_load_ps(
_mm_load_ps(
_mm_load_ps(
_mm_and_ps(
_mm_and_ps(
_mm_and_ps(
_mm_and_ps(
_mm_and_ps(

inFloats2
inFloats2
inFloats2
inFloats2
inFloats2
inFloats2

m1a0,
m1b0,
m1c0,
m1a1,
m1b1,

+
+
+
+
+
+

0
0
0
1
1
1

*
*
*
*
*
*

12
12
12
12
12
12

mask_keep_last
mask_keep_last
mask_keep_last
mask_keep_last
mask_keep_last

+= 2 * 12, inFloats2 += 2 * 12, outFloats += 2 * 12 ) {
+ 0 );
+ 4 );
+ 8 );
+ 0 );
+ 4 );
+ 8 );

3	

M
O
O 05
D 20

for ( int i = 0; i  numJoints; i += 2, inFloats1
__m128 m1a0 = _mm_load_ps( inFloats1 + 0 * 12
__m128 m1b0 = _mm_load_ps( inFloats1 + 0 * 12
__m128 m1c0 = _mm_load_ps( inFloats1 + 0 * 12
__m128 m1a1 = _mm_load_ps( inFloats1 + 1 * 12
__m128 m1b1 = _mm_load_ps( inFloats1 + 1 * 12
__m128 m1c1 = _mm_load_ps( inFloats1 + 1 * 12

+
+
+
+
+
+

0
4
8
0
4
8

);
);
);
);
);

4

);
);
);
);
);
);
Technology
Autovectorization
Intel SIMD Directives

PROS
requires minimal code changes


CONS
unreliable*

might change program meaning,
work well for specific use cases
GCC mainline support missing
unreliable*, requires code
restructuring

OpenCL

clear conceptual model

Vectorized Libraries, e.g. MKL,
Eigen

programmer doesn’t need to
know low level details

Cilk Plus Array Notation

mostly reliable

GCC mainline support missing

ISPC


mostly reliable, clear conceptual
model

proper PHI support missing,
requires code restructuring

* unless significant time is invested inspecting the generated assembly and deciphering compiler messages

5
X

Y

Z

X

Y

Z

X

X

Y

Y

Z

Z

Addressing modes	

•

SSEx and AVX1 do not support strided accesses pattern and gather-scatter
accesses which force the compiler to generate scalar instructions	


•

even when “fancy” access patterns are supported (e.g. IMCI and AVX2) a
significant penalty is paid 	


•

convert your arrays of structures to structures of arrays
6
cac.cornell.edu

Memory alignment	

• unaligned

memory access may generate scalar instructions for
otherwise vectorizable code	


• vectorized
• align

loads from unaligned memory suffer from a penalty	


data in memory to the width of the SIMD unit
7
void foo(double *a, double *b)
{
for(int i = 0; i  SIZE; i++)
{
a[i] += b[i];
}
}

what we would like:

SIZE is a multiple of the
vector lane width	

Alignment of arrays
unknown to the
compiler	


vmovapd ymm0,YMMWORD PTR [rdi+rax*1]
vaddpd ymm0,ymm0,YMMWORD PTR [rsi+rax*1]
vmovapd YMMWORD PTR [rdi+rax*1],ymm0
add
rax,0x20
cmp
rax,0x186a0
jne
1a8 foo2+0x8

0:
4:
7:

lea
cmp
jb

9:
b:
10:
15:
1a:
22:
2a:
2e:
33:
3b:
3f:
45:
47:
4a:

xor
eax,eax
nop
DWORD PTR [rax+rax*1+0x0]
vmovupd xmm0,XMMWORD PTR [rsi+rax*1]
vmovupd xmm1,XMMWORD PTR [rdi+rax*1]
vinsertf128 ymm0,ymm0,XMMWORD PTR [rsi+rax*1+0x10],0x1
vinsertf128 ymm1,ymm1,XMMWORD PTR [rdi+rax*1+0x10],0x1
vaddpd ymm0,ymm1,ymm0
vmovupd XMMWORD PTR [rdi+rax*1],xmm0
vextractf128 XMMWORD PTR [rdi+rax*1+0x10],ymm0,0x1
add
rax,0x20
cmp
rax,0x186a0
jne
10 foo+0x10
vzeroupper
ret

4b:
4f:
52:
54:
56:
60:
65:
6a:
6f:
73:
79:

lea
cmp
jae
xor
nop
vmovsd
vaddsd
vmovsd
add
cmp
jne

!
what we actually get:
with autovectorization:

GCC has to check if the
arrays overlap	


!

If they do, scalar addition
is performed	

•

1a8:
1ad:
1b2:
1b7:
1bb:
1c1:

otherwise loop is
partially vectorized
8

rax,[rsi+0x20]
rdi,rax
4b foo+0x4b

rax,[rdi+0x20]
rsi,rax
9 foo+0x9
eax,eax
WORD PTR cs:[rax+rax*1+0x0]
xmm0,QWORD PTR [rdi+rax*1]
xmm0,xmm0,QWORD PTR [rsi+rax*1]
QWORD PTR [rdi+rax*1],xmm0
rax,0x8
rax,0x186a0
60 foo+0x60
void foo(double * restrict a, double * restrict b)
{
double *x = __builtin_assume_aligned(a, 16);
double *y = __builtin_assume_aligned(b, 16);

}

1a8:
1ad:
1b2:
1b7:
1bb:
1c1:

for(int i = 0; i  SIZE; i++)
{
x[i] += y[i];
}

vmovapd ymm0,YMMWORD PTR [rdi+rax*1]
vaddpd ymm0,ymm0,YMMWORD PTR [rsi+rax*1]
vmovapd YMMWORD PTR [rdi+rax*1],ymm0
add
rax,0x20
cmp
rax,0x186a0
jne
1a8 foo2+0x8

GCC finally generates optimal code	

Be ready to babysit the compiler with options and directives if
you want to use autovectorization

9
SIMD directives in ICC help in loop
nests with no dependencies and
branches 	

•

HPC friendly code	


•

Xeon PHI is well supported as
advertised	


Suggested pragmas:	

•

#pragma unroll(N)	


•

for (i=0; icount; i++){
for (y=1; y  height-1; y++) {
int c = 1 + y*WIDTHP+1;
int n = c-WIDTHP;
int s = c+WIDTHP;
int e = c+1;
int w = c-1;
int nw = n-1;
int ne = n+1;
int sw = s-1;
int se = s+1;
#pragma simd
#pragma vector nontemporal
for (x=1; x  width-1; x++) {
fout[c] = diag * fin[nw] +
diag * fin[ne] +
diag * fin[sw] +
diag * fin[se] +
next * fin[w] +
next * fin[e] +
next * fin[n] +
next * fin[s] +
ctr * fin[c];

#pragma vector aligned	


•

#pragma vector nontemporal	


•

}

#pragma simd assert

}

10

// increment to next location
c++;n++;s++;e++;w++;nw++;ne++;sw++;se++;

}
REAL *ftmp = fin;
fin = fout;
fout = ftmp;
How GPUs solved the problem
__global__ void mxm(float *X, float *Y){
const int i = blockIdx.x*blockDim.x + threadIdx.x;

!

if(X[i] != 0)
X[i] = X[i] - Y[i];
else
Y[i] = X[i] * Y[i];

}

even though scalar code is generated, the
hardware implicitly vectorizes it (SIMT)
/*0058*/
/*0060*/
/*0068*/
/*0070*/
/*0078*/
/*0080*/

@!P0
@P0
@!P0
@P0

FSETP.NEU.AND P0, PT, R2, RZ, PT;
FMUL R3, R2, R0;
FADD R0, R2, -R0;
ST.E [R6], R3;
ST.E [R4], R0;
EXIT ;

instead of branching, instructions are predicated

Scalar loads/stores in GPUs are automatically coalesced into vector
loads/stores by the hardware	

Predicated instructions are generated for simple branching constructs
11
PUSH

POP

POP

...
if(...){
if(...){

PUSH

}
}else{
!
}
...

Branch Synchronization Stack (BSS)	

•

entries consists of a mask (1bit) that determines which lane commits its
results	


•

comes in handy for nested branching constructs	


Instruction markers	

•

push/pop a mask on the BSS when a branch diverges/converges into/from
multiple execution paths	


In other words, writing a vectorizing compiler for a GPU is an easier task than
writing one for CPU
12
How Intel is learning from GPUs

Opmask registers are expressed using the notation “k1” through “k7”.	

Instructions supporting conditional vector operation are expressed
by attaching the notation {k[1-7]} next to the destination operand	

E.g.: VADDPS	
  zmm1	
  {k1}{z},	
  zmm2,	
  zmm3

13
Let’s emulate a GPU in software

Intel SPMD Program Compiler (ISPC) extends a C-based language
with “single program, multiple data” (SPMD) constructs	

An ISPC program describes the behavior of a single program
instance	

•

even though a “gang” of them is in reality being executed	


•

gang size is usually no more than 2-4x the native SIMD width of
the machine	


For CUDA affectionados	

•

ISPC Program is similar to a CUDA thread	


•

An ISPC gang is similar to a CUDA warp
14
Execution of a SPMD program with a gang size of 4

Observations:	

•

diverging control flow reduces the utilization of vector
instructions	


•

vectorization adds masking overhead
15
uniform variable is shared among program instances
make function available to be called from
application code
foreach expresses a parallel computation

each program instance has a private instance of
a non-uniform variable (a.k.a. varying variable)

!

export void simple(uniform float vin[], uniform float vout[],
uniform int count) {
foreach (index = 0 ... count) {
float v = vin[index];
if (v  3.)
v = v * v;
else
v = sqrt(v);
vout[index] = v;
}
}

simple.ispc, compiled with ispc
#include stdio.h
#include simple.h

!

int main() {
float vin[16], vout[16];
for (int i = 0; i  16; ++i)
vin[i] = i;
simple(vin, vout, 16);

}

for (int i = 0; i  16; ++i)
printf(%d: simple(%f) = %fn, i, vin[i], vout[i]);

main.c, compiled with GCC
16

ispc function is called like any other function
from the C/C++ application
foreach(k = 0 ... 6){
int i = k * 7;

Prints [0, 7, 14, 21, 28, 35, ((42)), ((49))]

print(%n, i);
double* dR
double* dA

= P[i];
= P[i+3];

...
}

gang size of 8
0

7

14

21

28

35

(42) (49)

Inactive Program Instances
17
export void foo(uniform float * uniform A, uniform int n){
foreach(i = 0 ... n){
A[i*8] *= A[i*8];
}
}

18
ISPC vs GCC DGEMM
ISPC vs GCC SGEMM
8

7.6

7

inline void mxm(uniform float * uniform A,
uniform float * uniform B,
uniform float * uniform C,
uniform int M,
uniform int N,
uniform int K,
uniform int nmat,
int idx)
{
for(uniform int i = 0; i  M; i++){
for(uniform int j = 0; j  N; j++){
float sum = 0;
for(uniform int k = 0; k  K; k++){
sum += A[i*K*nmat + k*nmat + idx] * B[k*N*nmat + j*nmat + idx];
}

5

C[i*N*nmat + j*nmat + idx] = sum;
}

4
3
1
0

}

3.7

}

!

export void gemm(uniform float * uniform A,
uniform float * uniform B,
uniform float * uniform C,
uniform int M,
uniform int N,
uniform int K,
uniform int nmat)
{
foreach(i = 0 ... nmat){
mxm(A, B, C, M, N, K, nmat, i);
}
}

xGEMM 5x5 speedup over 1000 matrices (GCC 4.8 -O3, Ivy Bridge)

19
scalar

ISPC

8

export void startFilter(uniform KalmanFilter * uniform filter,
uniform KalmanFilterParameter * uniform param){
foreach(i = 0 ... filter-ntracks){
filterTrack(filter, param, i);
}
}

!

7
6
5

5.3

4

inline void filterTrack(uniform KalmanFilter * uniform filter,
uniform KalmanFilterParameter * uniform param,
int i){
...
for(uniform int h = 0; h  param-max_hit_count; h++){
if(h = param-hit_count[i])
continue;
predictState(filter, param, h, i);
predictCovariance(filter, param, h, i);
if(param-hits[h].is2Dim[i]){
...
correctGain2D(filter, i);
correctState2D(filter, i);
correctCovariance2D(filter, i);
}else{
...
correctGain1D(filter, i);
correctState1D(filter, i);
correctCovariance1D(filter, i);
}

3
2
1

1

0
Toy KalmanFilter speedup (double), Ivy Bridge

}
...
}

same technique can be used for Multi-Track Fitter 

(see S. Fleischmann presentation)
20
What did we learn

Use vectorized math libraries like MKL or Eigen wherever
you can	

On Xeons use ISPC to vectorize your code	

•

compatible across different compilers, architectures and
operating systems	


On Xeons/Xeons PHI use SIMD Directives and Cilk Plus to
vectorize your code	

•

if you can use the ICC compiler	


•

if you have “nice” loop nests (HPC-code like)
21
Vectorization on x86: all you need to know

Contenu connexe

Tendances

A Bit More Deferred Cry Engine3
A Bit More Deferred   Cry Engine3A Bit More Deferred   Cry Engine3
A Bit More Deferred Cry Engine3guest11b095
 
OpenGL NVIDIA Command-List: Approaching Zero Driver Overhead
OpenGL NVIDIA Command-List: Approaching Zero Driver OverheadOpenGL NVIDIA Command-List: Approaching Zero Driver Overhead
OpenGL NVIDIA Command-List: Approaching Zero Driver OverheadTristan Lorach
 
Secrets of CryENGINE 3 Graphics Technology
Secrets of CryENGINE 3 Graphics TechnologySecrets of CryENGINE 3 Graphics Technology
Secrets of CryENGINE 3 Graphics TechnologyTiago Sousa
 
Next generation graphics programming on xbox 360
Next generation graphics programming on xbox 360Next generation graphics programming on xbox 360
Next generation graphics programming on xbox 360VIKAS SINGH BHADOURIA
 
Crysis Next-Gen Effects (GDC 2008)
Crysis Next-Gen Effects (GDC 2008)Crysis Next-Gen Effects (GDC 2008)
Crysis Next-Gen Effects (GDC 2008)Tiago Sousa
 
Rendering Technologies from Crysis 3 (GDC 2013)
Rendering Technologies from Crysis 3 (GDC 2013)Rendering Technologies from Crysis 3 (GDC 2013)
Rendering Technologies from Crysis 3 (GDC 2013)Tiago Sousa
 
Code로 이해하는 RNN
Code로 이해하는 RNNCode로 이해하는 RNN
Code로 이해하는 RNNSANG WON PARK
 
Advanced Scenegraph Rendering Pipeline
Advanced Scenegraph Rendering PipelineAdvanced Scenegraph Rendering Pipeline
Advanced Scenegraph Rendering PipelineNarann29
 
GDC 2014 - Deformable Snow Rendering in Batman: Arkham Origins
GDC 2014 - Deformable Snow Rendering in Batman: Arkham OriginsGDC 2014 - Deformable Snow Rendering in Batman: Arkham Origins
GDC 2014 - Deformable Snow Rendering in Batman: Arkham OriginsColin Barré-Brisebois
 
Graphics Gems from CryENGINE 3 (Siggraph 2013)
Graphics Gems from CryENGINE 3 (Siggraph 2013)Graphics Gems from CryENGINE 3 (Siggraph 2013)
Graphics Gems from CryENGINE 3 (Siggraph 2013)Tiago Sousa
 
Optimizing the graphics pipeline with compute
Optimizing the graphics pipeline with computeOptimizing the graphics pipeline with compute
Optimizing the graphics pipeline with computeWuBinbo
 
Gamma and linear color-space
Gamma and linear color-spaceGamma and linear color-space
Gamma and linear color-space민웅 이
 
Practical Occlusion Culling in Killzone 3
Practical Occlusion Culling in Killzone 3Practical Occlusion Culling in Killzone 3
Practical Occlusion Culling in Killzone 3Guerrilla
 
HPG 2018 - Game Ray Tracing: State-of-the-Art and Open Problems
HPG 2018 - Game Ray Tracing: State-of-the-Art and Open ProblemsHPG 2018 - Game Ray Tracing: State-of-the-Art and Open Problems
HPG 2018 - Game Ray Tracing: State-of-the-Art and Open ProblemsElectronic Arts / DICE
 
FlameWorks GTC 2014
FlameWorks GTC 2014FlameWorks GTC 2014
FlameWorks GTC 2014Simon Green
 

Tendances (20)

A Bit More Deferred Cry Engine3
A Bit More Deferred   Cry Engine3A Bit More Deferred   Cry Engine3
A Bit More Deferred Cry Engine3
 
LLVM
LLVMLLVM
LLVM
 
OpenGL NVIDIA Command-List: Approaching Zero Driver Overhead
OpenGL NVIDIA Command-List: Approaching Zero Driver OverheadOpenGL NVIDIA Command-List: Approaching Zero Driver Overhead
OpenGL NVIDIA Command-List: Approaching Zero Driver Overhead
 
Inferred lighting
Inferred lightingInferred lighting
Inferred lighting
 
Secrets of CryENGINE 3 Graphics Technology
Secrets of CryENGINE 3 Graphics TechnologySecrets of CryENGINE 3 Graphics Technology
Secrets of CryENGINE 3 Graphics Technology
 
Next generation graphics programming on xbox 360
Next generation graphics programming on xbox 360Next generation graphics programming on xbox 360
Next generation graphics programming on xbox 360
 
Crysis Next-Gen Effects (GDC 2008)
Crysis Next-Gen Effects (GDC 2008)Crysis Next-Gen Effects (GDC 2008)
Crysis Next-Gen Effects (GDC 2008)
 
Scope Stack Allocation
Scope Stack AllocationScope Stack Allocation
Scope Stack Allocation
 
Rendering Technologies from Crysis 3 (GDC 2013)
Rendering Technologies from Crysis 3 (GDC 2013)Rendering Technologies from Crysis 3 (GDC 2013)
Rendering Technologies from Crysis 3 (GDC 2013)
 
Code로 이해하는 RNN
Code로 이해하는 RNNCode로 이해하는 RNN
Code로 이해하는 RNN
 
Advanced Scenegraph Rendering Pipeline
Advanced Scenegraph Rendering PipelineAdvanced Scenegraph Rendering Pipeline
Advanced Scenegraph Rendering Pipeline
 
GDC 2014 - Deformable Snow Rendering in Batman: Arkham Origins
GDC 2014 - Deformable Snow Rendering in Batman: Arkham OriginsGDC 2014 - Deformable Snow Rendering in Batman: Arkham Origins
GDC 2014 - Deformable Snow Rendering in Batman: Arkham Origins
 
Graphics Gems from CryENGINE 3 (Siggraph 2013)
Graphics Gems from CryENGINE 3 (Siggraph 2013)Graphics Gems from CryENGINE 3 (Siggraph 2013)
Graphics Gems from CryENGINE 3 (Siggraph 2013)
 
Optimizing the graphics pipeline with compute
Optimizing the graphics pipeline with computeOptimizing the graphics pipeline with compute
Optimizing the graphics pipeline with compute
 
Gamma and linear color-space
Gamma and linear color-spaceGamma and linear color-space
Gamma and linear color-space
 
A Real-time Radiosity Architecture
A Real-time Radiosity ArchitectureA Real-time Radiosity Architecture
A Real-time Radiosity Architecture
 
Apresentação Linguagem C
Apresentação Linguagem CApresentação Linguagem C
Apresentação Linguagem C
 
Practical Occlusion Culling in Killzone 3
Practical Occlusion Culling in Killzone 3Practical Occlusion Culling in Killzone 3
Practical Occlusion Culling in Killzone 3
 
HPG 2018 - Game Ray Tracing: State-of-the-Art and Open Problems
HPG 2018 - Game Ray Tracing: State-of-the-Art and Open ProblemsHPG 2018 - Game Ray Tracing: State-of-the-Art and Open Problems
HPG 2018 - Game Ray Tracing: State-of-the-Art and Open Problems
 
FlameWorks GTC 2014
FlameWorks GTC 2014FlameWorks GTC 2014
FlameWorks GTC 2014
 

Similaire à Vectorization on x86: all you need to know

Дмитрий Вовк: Векторизация кода под мобильные платформы
Дмитрий Вовк: Векторизация кода под мобильные платформыДмитрий Вовк: Векторизация кода под мобильные платформы
Дмитрий Вовк: Векторизация кода под мобильные платформыDevGAMM Conference
 
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...Positive Hack Days
 
Static analysis of C++ source code
Static analysis of C++ source codeStatic analysis of C++ source code
Static analysis of C++ source codeAndrey Karpov
 
Static analysis of C++ source code
Static analysis of C++ source codeStatic analysis of C++ source code
Static analysis of C++ source codePVS-Studio
 
PVS-Studio 5.00, a solution for developers of modern resource-intensive appl...
PVS-Studio 5.00, a solution for developers of modern resource-intensive appl...PVS-Studio 5.00, a solution for developers of modern resource-intensive appl...
PVS-Studio 5.00, a solution for developers of modern resource-intensive appl...Andrey Karpov
 
Code vectorization for mobile devices
Code vectorization for mobile devicesCode vectorization for mobile devices
Code vectorization for mobile devicesSt1X
 
Data Acquisition
Data AcquisitionData Acquisition
Data Acquisitionazhar557
 
2.1 ### uVision Project, (C) Keil Software .docx
2.1   ### uVision Project, (C) Keil Software    .docx2.1   ### uVision Project, (C) Keil Software    .docx
2.1 ### uVision Project, (C) Keil Software .docxtarifarmarie
 
Windbg랑 친해지기
Windbg랑 친해지기Windbg랑 친해지기
Windbg랑 친해지기Ji Hun Kim
 
Georgy Nosenko - An introduction to the use SMT solvers for software security
Georgy Nosenko - An introduction to the use SMT solvers for software securityGeorgy Nosenko - An introduction to the use SMT solvers for software security
Georgy Nosenko - An introduction to the use SMT solvers for software securityDefconRussia
 
Cryptography and secure systems
Cryptography and secure systemsCryptography and secure systems
Cryptography and secure systemsVsevolod Stakhov
 
PVS-Studio, a solution for resource intensive applications development
PVS-Studio, a solution for resource intensive applications developmentPVS-Studio, a solution for resource intensive applications development
PVS-Studio, a solution for resource intensive applications developmentOOO "Program Verification Systems"
 
Exploring Compiler Optimization Opportunities for the OpenMP 4.x Accelerator...
Exploring Compiler Optimization Opportunities for the OpenMP 4.x Accelerator...Exploring Compiler Optimization Opportunities for the OpenMP 4.x Accelerator...
Exploring Compiler Optimization Opportunities for the OpenMP 4.x Accelerator...Akihiro Hayashi
 
Simd programming introduction
Simd programming introductionSimd programming introduction
Simd programming introductionChamp Yen
 

Similaire à Vectorization on x86: all you need to know (20)

Дмитрий Вовк: Векторизация кода под мобильные платформы
Дмитрий Вовк: Векторизация кода под мобильные платформыДмитрий Вовк: Векторизация кода под мобильные платформы
Дмитрий Вовк: Векторизация кода под мобильные платформы
 
Vectorization in ATLAS
Vectorization in ATLASVectorization in ATLAS
Vectorization in ATLAS
 
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...
 
Static analysis of C++ source code
Static analysis of C++ source codeStatic analysis of C++ source code
Static analysis of C++ source code
 
Static analysis of C++ source code
Static analysis of C++ source codeStatic analysis of C++ source code
Static analysis of C++ source code
 
Fedor Polyakov - Optimizing computer vision problems on mobile platforms
Fedor Polyakov - Optimizing computer vision problems on mobile platforms Fedor Polyakov - Optimizing computer vision problems on mobile platforms
Fedor Polyakov - Optimizing computer vision problems on mobile platforms
 
PVS-Studio 5.00, a solution for developers of modern resource-intensive appl...
PVS-Studio 5.00, a solution for developers of modern resource-intensive appl...PVS-Studio 5.00, a solution for developers of modern resource-intensive appl...
PVS-Studio 5.00, a solution for developers of modern resource-intensive appl...
 
Code vectorization for mobile devices
Code vectorization for mobile devicesCode vectorization for mobile devices
Code vectorization for mobile devices
 
Data Acquisition
Data AcquisitionData Acquisition
Data Acquisition
 
2.1 ### uVision Project, (C) Keil Software .docx
2.1   ### uVision Project, (C) Keil Software    .docx2.1   ### uVision Project, (C) Keil Software    .docx
2.1 ### uVision Project, (C) Keil Software .docx
 
Windbg랑 친해지기
Windbg랑 친해지기Windbg랑 친해지기
Windbg랑 친해지기
 
Georgy Nosenko - An introduction to the use SMT solvers for software security
Georgy Nosenko - An introduction to the use SMT solvers for software securityGeorgy Nosenko - An introduction to the use SMT solvers for software security
Georgy Nosenko - An introduction to the use SMT solvers for software security
 
3D-DRESD Lorenzo Pavesi
3D-DRESD Lorenzo Pavesi3D-DRESD Lorenzo Pavesi
3D-DRESD Lorenzo Pavesi
 
Cryptography and secure systems
Cryptography and secure systemsCryptography and secure systems
Cryptography and secure systems
 
Joel Falcou, Boost.SIMD
Joel Falcou, Boost.SIMDJoel Falcou, Boost.SIMD
Joel Falcou, Boost.SIMD
 
PVS-Studio, a solution for resource intensive applications development
PVS-Studio, a solution for resource intensive applications developmentPVS-Studio, a solution for resource intensive applications development
PVS-Studio, a solution for resource intensive applications development
 
Exploiting vectorization with ISPC
Exploiting vectorization with ISPCExploiting vectorization with ISPC
Exploiting vectorization with ISPC
 
Boosting Developer Productivity with Clang
Boosting Developer Productivity with ClangBoosting Developer Productivity with Clang
Boosting Developer Productivity with Clang
 
Exploring Compiler Optimization Opportunities for the OpenMP 4.x Accelerator...
Exploring Compiler Optimization Opportunities for the OpenMP 4.x Accelerator...Exploring Compiler Optimization Opportunities for the OpenMP 4.x Accelerator...
Exploring Compiler Optimization Opportunities for the OpenMP 4.x Accelerator...
 
Simd programming introduction
Simd programming introductionSimd programming introduction
Simd programming introduction
 

Plus de Roberto Agostino Vitillo (12)

Telemetry Onboarding
Telemetry OnboardingTelemetry Onboarding
Telemetry Onboarding
 
Growing a Data Pipeline for Analytics
Growing a Data Pipeline for AnalyticsGrowing a Data Pipeline for Analytics
Growing a Data Pipeline for Analytics
 
Telemetry Datasets
Telemetry DatasetsTelemetry Datasets
Telemetry Datasets
 
Growing a SQL Query
Growing a SQL QueryGrowing a SQL Query
Growing a SQL Query
 
Telemetry Onboarding
Telemetry OnboardingTelemetry Onboarding
Telemetry Onboarding
 
All you need to know about Statistics
All you need to know about StatisticsAll you need to know about Statistics
All you need to know about Statistics
 
Spark meets Telemetry
Spark meets TelemetrySpark meets Telemetry
Spark meets Telemetry
 
Sharing C++ objects in Linux
Sharing C++ objects in LinuxSharing C++ objects in Linux
Sharing C++ objects in Linux
 
Performance tools developments
Performance tools developmentsPerformance tools developments
Performance tools developments
 
GOoDA tutorial
GOoDA tutorialGOoDA tutorial
GOoDA tutorial
 
Callgraph analysis
Callgraph analysisCallgraph analysis
Callgraph analysis
 
Inter-process communication on steroids
Inter-process communication on steroidsInter-process communication on steroids
Inter-process communication on steroids
 

Dernier

QCon London: Mastering long-running processes in modern architectures
QCon London: Mastering long-running processes in modern architecturesQCon London: Mastering long-running processes in modern architectures
QCon London: Mastering long-running processes in modern architecturesBernd Ruecker
 
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptxThe Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptxLoriGlavin3
 
2024 April Patch Tuesday
2024 April Patch Tuesday2024 April Patch Tuesday
2024 April Patch TuesdayIvanti
 
The State of Passkeys with FIDO Alliance.pptx
The State of Passkeys with FIDO Alliance.pptxThe State of Passkeys with FIDO Alliance.pptx
The State of Passkeys with FIDO Alliance.pptxLoriGlavin3
 
[Webinar] SpiraTest - Setting New Standards in Quality Assurance
[Webinar] SpiraTest - Setting New Standards in Quality Assurance[Webinar] SpiraTest - Setting New Standards in Quality Assurance
[Webinar] SpiraTest - Setting New Standards in Quality AssuranceInflectra
 
A Deep Dive on Passkeys: FIDO Paris Seminar.pptx
A Deep Dive on Passkeys: FIDO Paris Seminar.pptxA Deep Dive on Passkeys: FIDO Paris Seminar.pptx
A Deep Dive on Passkeys: FIDO Paris Seminar.pptxLoriGlavin3
 
How to write a Business Continuity Plan
How to write a Business Continuity PlanHow to write a Business Continuity Plan
How to write a Business Continuity PlanDatabarracks
 
How to Effectively Monitor SD-WAN and SASE Environments with ThousandEyes
How to Effectively Monitor SD-WAN and SASE Environments with ThousandEyesHow to Effectively Monitor SD-WAN and SASE Environments with ThousandEyes
How to Effectively Monitor SD-WAN and SASE Environments with ThousandEyesThousandEyes
 
Microsoft 365 Copilot: How to boost your productivity with AI – Part one: Ado...
Microsoft 365 Copilot: How to boost your productivity with AI – Part one: Ado...Microsoft 365 Copilot: How to boost your productivity with AI – Part one: Ado...
Microsoft 365 Copilot: How to boost your productivity with AI – Part one: Ado...Nikki Chapple
 
Arizona Broadband Policy Past, Present, and Future Presentation 3/25/24
Arizona Broadband Policy Past, Present, and Future Presentation 3/25/24Arizona Broadband Policy Past, Present, and Future Presentation 3/25/24
Arizona Broadband Policy Past, Present, and Future Presentation 3/25/24Mark Goldstein
 
Generative Artificial Intelligence: How generative AI works.pdf
Generative Artificial Intelligence: How generative AI works.pdfGenerative Artificial Intelligence: How generative AI works.pdf
Generative Artificial Intelligence: How generative AI works.pdfIngrid Airi González
 
Transcript: New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
Transcript: New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024Transcript: New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
Transcript: New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024BookNet Canada
 
Decarbonising Buildings: Making a net-zero built environment a reality
Decarbonising Buildings: Making a net-zero built environment a realityDecarbonising Buildings: Making a net-zero built environment a reality
Decarbonising Buildings: Making a net-zero built environment a realityIES VE
 
Passkey Providers and Enabling Portability: FIDO Paris Seminar.pptx
Passkey Providers and Enabling Portability: FIDO Paris Seminar.pptxPasskey Providers and Enabling Portability: FIDO Paris Seminar.pptx
Passkey Providers and Enabling Portability: FIDO Paris Seminar.pptxLoriGlavin3
 
React Native vs Ionic - The Best Mobile App Framework
React Native vs Ionic - The Best Mobile App FrameworkReact Native vs Ionic - The Best Mobile App Framework
React Native vs Ionic - The Best Mobile App FrameworkPixlogix Infotech
 
Abdul Kader Baba- Managing Cybersecurity Risks and Compliance Requirements i...
Abdul Kader Baba- Managing Cybersecurity Risks  and Compliance Requirements i...Abdul Kader Baba- Managing Cybersecurity Risks  and Compliance Requirements i...
Abdul Kader Baba- Managing Cybersecurity Risks and Compliance Requirements i...itnewsafrica
 
Design pattern talk by Kaya Weers - 2024 (v2)
Design pattern talk by Kaya Weers - 2024 (v2)Design pattern talk by Kaya Weers - 2024 (v2)
Design pattern talk by Kaya Weers - 2024 (v2)Kaya Weers
 
A Framework for Development in the AI Age
A Framework for Development in the AI AgeA Framework for Development in the AI Age
A Framework for Development in the AI AgeCprime
 
Connecting the Dots for Information Discovery.pdf
Connecting the Dots for Information Discovery.pdfConnecting the Dots for Information Discovery.pdf
Connecting the Dots for Information Discovery.pdfNeo4j
 
The Ultimate Guide to Choosing WordPress Pros and Cons
The Ultimate Guide to Choosing WordPress Pros and ConsThe Ultimate Guide to Choosing WordPress Pros and Cons
The Ultimate Guide to Choosing WordPress Pros and ConsPixlogix Infotech
 

Dernier (20)

QCon London: Mastering long-running processes in modern architectures
QCon London: Mastering long-running processes in modern architecturesQCon London: Mastering long-running processes in modern architectures
QCon London: Mastering long-running processes in modern architectures
 
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptxThe Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
 
2024 April Patch Tuesday
2024 April Patch Tuesday2024 April Patch Tuesday
2024 April Patch Tuesday
 
The State of Passkeys with FIDO Alliance.pptx
The State of Passkeys with FIDO Alliance.pptxThe State of Passkeys with FIDO Alliance.pptx
The State of Passkeys with FIDO Alliance.pptx
 
[Webinar] SpiraTest - Setting New Standards in Quality Assurance
[Webinar] SpiraTest - Setting New Standards in Quality Assurance[Webinar] SpiraTest - Setting New Standards in Quality Assurance
[Webinar] SpiraTest - Setting New Standards in Quality Assurance
 
A Deep Dive on Passkeys: FIDO Paris Seminar.pptx
A Deep Dive on Passkeys: FIDO Paris Seminar.pptxA Deep Dive on Passkeys: FIDO Paris Seminar.pptx
A Deep Dive on Passkeys: FIDO Paris Seminar.pptx
 
How to write a Business Continuity Plan
How to write a Business Continuity PlanHow to write a Business Continuity Plan
How to write a Business Continuity Plan
 
How to Effectively Monitor SD-WAN and SASE Environments with ThousandEyes
How to Effectively Monitor SD-WAN and SASE Environments with ThousandEyesHow to Effectively Monitor SD-WAN and SASE Environments with ThousandEyes
How to Effectively Monitor SD-WAN and SASE Environments with ThousandEyes
 
Microsoft 365 Copilot: How to boost your productivity with AI – Part one: Ado...
Microsoft 365 Copilot: How to boost your productivity with AI – Part one: Ado...Microsoft 365 Copilot: How to boost your productivity with AI – Part one: Ado...
Microsoft 365 Copilot: How to boost your productivity with AI – Part one: Ado...
 
Arizona Broadband Policy Past, Present, and Future Presentation 3/25/24
Arizona Broadband Policy Past, Present, and Future Presentation 3/25/24Arizona Broadband Policy Past, Present, and Future Presentation 3/25/24
Arizona Broadband Policy Past, Present, and Future Presentation 3/25/24
 
Generative Artificial Intelligence: How generative AI works.pdf
Generative Artificial Intelligence: How generative AI works.pdfGenerative Artificial Intelligence: How generative AI works.pdf
Generative Artificial Intelligence: How generative AI works.pdf
 
Transcript: New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
Transcript: New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024Transcript: New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
Transcript: New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
 
Decarbonising Buildings: Making a net-zero built environment a reality
Decarbonising Buildings: Making a net-zero built environment a realityDecarbonising Buildings: Making a net-zero built environment a reality
Decarbonising Buildings: Making a net-zero built environment a reality
 
Passkey Providers and Enabling Portability: FIDO Paris Seminar.pptx
Passkey Providers and Enabling Portability: FIDO Paris Seminar.pptxPasskey Providers and Enabling Portability: FIDO Paris Seminar.pptx
Passkey Providers and Enabling Portability: FIDO Paris Seminar.pptx
 
React Native vs Ionic - The Best Mobile App Framework
React Native vs Ionic - The Best Mobile App FrameworkReact Native vs Ionic - The Best Mobile App Framework
React Native vs Ionic - The Best Mobile App Framework
 
Abdul Kader Baba- Managing Cybersecurity Risks and Compliance Requirements i...
Abdul Kader Baba- Managing Cybersecurity Risks  and Compliance Requirements i...Abdul Kader Baba- Managing Cybersecurity Risks  and Compliance Requirements i...
Abdul Kader Baba- Managing Cybersecurity Risks and Compliance Requirements i...
 
Design pattern talk by Kaya Weers - 2024 (v2)
Design pattern talk by Kaya Weers - 2024 (v2)Design pattern talk by Kaya Weers - 2024 (v2)
Design pattern talk by Kaya Weers - 2024 (v2)
 
A Framework for Development in the AI Age
A Framework for Development in the AI AgeA Framework for Development in the AI Age
A Framework for Development in the AI Age
 
Connecting the Dots for Information Discovery.pdf
Connecting the Dots for Information Discovery.pdfConnecting the Dots for Information Discovery.pdf
Connecting the Dots for Information Discovery.pdf
 
The Ultimate Guide to Choosing WordPress Pros and Cons
The Ultimate Guide to Choosing WordPress Pros and ConsThe Ultimate Guide to Choosing WordPress Pros and Cons
The Ultimate Guide to Choosing WordPress Pros and Cons
 

Vectorization on x86: all you need to know

  • 1. VECTORIZATION ON X86 Roberto A. Vitillo (LBNL) 10/23/2013 Software & Computing workshop
  • 2. Why do we care? 2.6 GHz SB-EP CPU, 8 cores 8 add + 8 mul FLOPs/cycle/core Theoretical performance in GFLOPS Theoretical performance / maximum multi-threaded and vectorized 166.4 100% multi-threaded and not vectorized 20.8 12.5% serial and vectorized 20.8 12.5% serial and not vectorized 2.6 1.6% 2
  • 3. Single Instruction, Multiple Data • processor throughput is increased by handling multiple data in parallel (MMX, SSE, AVX, ...) • vector of data is packed in one large register and handled in one operation • SIMD instructions are energy-efficient • exploiting SIMD is even more of importance for the Xeon PHI and future architectures 3 blogs.intel.com
  • 4. 20 RT DA 13 void multiplyMatrices(Float32x4List A, Float32x4List B, Float32x4List R) { var a0 = A[0]; var a1 = A[1]; var a2 = A[2]; var a3 = A[3]; Vector sizes keep increasing but programmers continue to use simple abstractions of hardware registers var b0 R[0] = var b1 R[1] = var b2 R[2] = var b3 R[3] = = B[0]; b0.xxxx = B[1]; b1.xxxx = B[2]; b2.xxxx = B[3]; b3.xxxx * a0 + b0.yyyy * a1 + b0.zzzz * a2 + b0.wwww * a3; * a0 + b1.yyyy * a1 + b1.zzzz * a2 + b1.wwww * a3; * a0 + b2.yyyy * a1 + b2.zzzz * a2 + b2.wwww * a3; * a0 + b3.yyyy * a1 + b3.zzzz * a2 + b3.wwww * a3; } #ifdef ID_WIN_X86_SSE2_INTRIN ! const __m128 mask_keep_last = __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) ); ! __m128 __m128 __m128 __m128 __m128 __m128 m2a0 m2b0 m2c0 m2a1 m2b1 m2c1 __m128 __m128 __m128 __m128 __m128 tj0 tk0 tl0 tj1 tk1 = = = = = = = = = = = _mm_load_ps( _mm_load_ps( _mm_load_ps( _mm_load_ps( _mm_load_ps( _mm_load_ps( _mm_and_ps( _mm_and_ps( _mm_and_ps( _mm_and_ps( _mm_and_ps( inFloats2 inFloats2 inFloats2 inFloats2 inFloats2 inFloats2 m1a0, m1b0, m1c0, m1a1, m1b1, + + + + + + 0 0 0 1 1 1 * * * * * * 12 12 12 12 12 12 mask_keep_last mask_keep_last mask_keep_last mask_keep_last mask_keep_last += 2 * 12, inFloats2 += 2 * 12, outFloats += 2 * 12 ) { + 0 ); + 4 ); + 8 ); + 0 ); + 4 ); + 8 ); 3 M O O 05 D 20 for ( int i = 0; i numJoints; i += 2, inFloats1 __m128 m1a0 = _mm_load_ps( inFloats1 + 0 * 12 __m128 m1b0 = _mm_load_ps( inFloats1 + 0 * 12 __m128 m1c0 = _mm_load_ps( inFloats1 + 0 * 12 __m128 m1a1 = _mm_load_ps( inFloats1 + 1 * 12 __m128 m1b1 = _mm_load_ps( inFloats1 + 1 * 12 __m128 m1c1 = _mm_load_ps( inFloats1 + 1 * 12 + + + + + + 0 4 8 0 4 8 ); ); ); ); ); 4 ); ); ); ); ); );
  • 5. Technology Autovectorization Intel SIMD Directives PROS requires minimal code changes
 CONS unreliable* might change program meaning, work well for specific use cases GCC mainline support missing unreliable*, requires code restructuring OpenCL clear conceptual model Vectorized Libraries, e.g. MKL, Eigen programmer doesn’t need to know low level details Cilk Plus Array Notation mostly reliable GCC mainline support missing ISPC
 mostly reliable, clear conceptual model proper PHI support missing, requires code restructuring * unless significant time is invested inspecting the generated assembly and deciphering compiler messages 5
  • 6. X Y Z X Y Z X X Y Y Z Z Addressing modes • SSEx and AVX1 do not support strided accesses pattern and gather-scatter accesses which force the compiler to generate scalar instructions • even when “fancy” access patterns are supported (e.g. IMCI and AVX2) a significant penalty is paid • convert your arrays of structures to structures of arrays 6
  • 7. cac.cornell.edu Memory alignment • unaligned memory access may generate scalar instructions for otherwise vectorizable code • vectorized • align loads from unaligned memory suffer from a penalty data in memory to the width of the SIMD unit 7
  • 8. void foo(double *a, double *b) { for(int i = 0; i SIZE; i++) { a[i] += b[i]; } } what we would like: SIZE is a multiple of the vector lane width Alignment of arrays unknown to the compiler vmovapd ymm0,YMMWORD PTR [rdi+rax*1] vaddpd ymm0,ymm0,YMMWORD PTR [rsi+rax*1] vmovapd YMMWORD PTR [rdi+rax*1],ymm0 add rax,0x20 cmp rax,0x186a0 jne 1a8 foo2+0x8 0: 4: 7: lea cmp jb 9: b: 10: 15: 1a: 22: 2a: 2e: 33: 3b: 3f: 45: 47: 4a: xor eax,eax nop DWORD PTR [rax+rax*1+0x0] vmovupd xmm0,XMMWORD PTR [rsi+rax*1] vmovupd xmm1,XMMWORD PTR [rdi+rax*1] vinsertf128 ymm0,ymm0,XMMWORD PTR [rsi+rax*1+0x10],0x1 vinsertf128 ymm1,ymm1,XMMWORD PTR [rdi+rax*1+0x10],0x1 vaddpd ymm0,ymm1,ymm0 vmovupd XMMWORD PTR [rdi+rax*1],xmm0 vextractf128 XMMWORD PTR [rdi+rax*1+0x10],ymm0,0x1 add rax,0x20 cmp rax,0x186a0 jne 10 foo+0x10 vzeroupper ret 4b: 4f: 52: 54: 56: 60: 65: 6a: 6f: 73: 79: lea cmp jae xor nop vmovsd vaddsd vmovsd add cmp jne ! what we actually get: with autovectorization: GCC has to check if the arrays overlap ! If they do, scalar addition is performed • 1a8: 1ad: 1b2: 1b7: 1bb: 1c1: otherwise loop is partially vectorized 8 rax,[rsi+0x20] rdi,rax 4b foo+0x4b rax,[rdi+0x20] rsi,rax 9 foo+0x9 eax,eax WORD PTR cs:[rax+rax*1+0x0] xmm0,QWORD PTR [rdi+rax*1] xmm0,xmm0,QWORD PTR [rsi+rax*1] QWORD PTR [rdi+rax*1],xmm0 rax,0x8 rax,0x186a0 60 foo+0x60
  • 9. void foo(double * restrict a, double * restrict b) { double *x = __builtin_assume_aligned(a, 16); double *y = __builtin_assume_aligned(b, 16); } 1a8: 1ad: 1b2: 1b7: 1bb: 1c1: for(int i = 0; i SIZE; i++) { x[i] += y[i]; } vmovapd ymm0,YMMWORD PTR [rdi+rax*1] vaddpd ymm0,ymm0,YMMWORD PTR [rsi+rax*1] vmovapd YMMWORD PTR [rdi+rax*1],ymm0 add rax,0x20 cmp rax,0x186a0 jne 1a8 foo2+0x8 GCC finally generates optimal code Be ready to babysit the compiler with options and directives if you want to use autovectorization 9
  • 10. SIMD directives in ICC help in loop nests with no dependencies and branches • HPC friendly code • Xeon PHI is well supported as advertised Suggested pragmas: • #pragma unroll(N) • for (i=0; icount; i++){ for (y=1; y height-1; y++) { int c = 1 + y*WIDTHP+1; int n = c-WIDTHP; int s = c+WIDTHP; int e = c+1; int w = c-1; int nw = n-1; int ne = n+1; int sw = s-1; int se = s+1; #pragma simd #pragma vector nontemporal for (x=1; x width-1; x++) { fout[c] = diag * fin[nw] + diag * fin[ne] + diag * fin[sw] + diag * fin[se] + next * fin[w] + next * fin[e] + next * fin[n] + next * fin[s] + ctr * fin[c]; #pragma vector aligned • #pragma vector nontemporal • } #pragma simd assert } 10 // increment to next location c++;n++;s++;e++;w++;nw++;ne++;sw++;se++; } REAL *ftmp = fin; fin = fout; fout = ftmp;
  • 11. How GPUs solved the problem __global__ void mxm(float *X, float *Y){ const int i = blockIdx.x*blockDim.x + threadIdx.x; ! if(X[i] != 0) X[i] = X[i] - Y[i]; else Y[i] = X[i] * Y[i]; } even though scalar code is generated, the hardware implicitly vectorizes it (SIMT) /*0058*/ /*0060*/ /*0068*/ /*0070*/ /*0078*/ /*0080*/ @!P0 @P0 @!P0 @P0 FSETP.NEU.AND P0, PT, R2, RZ, PT; FMUL R3, R2, R0; FADD R0, R2, -R0; ST.E [R6], R3; ST.E [R4], R0; EXIT ; instead of branching, instructions are predicated Scalar loads/stores in GPUs are automatically coalesced into vector loads/stores by the hardware Predicated instructions are generated for simple branching constructs 11
  • 12. PUSH POP POP ... if(...){ if(...){ PUSH } }else{ ! } ... Branch Synchronization Stack (BSS) • entries consists of a mask (1bit) that determines which lane commits its results • comes in handy for nested branching constructs Instruction markers • push/pop a mask on the BSS when a branch diverges/converges into/from multiple execution paths In other words, writing a vectorizing compiler for a GPU is an easier task than writing one for CPU 12
  • 13. How Intel is learning from GPUs Opmask registers are expressed using the notation “k1” through “k7”. Instructions supporting conditional vector operation are expressed by attaching the notation {k[1-7]} next to the destination operand E.g.: VADDPS  zmm1  {k1}{z},  zmm2,  zmm3 13
  • 14. Let’s emulate a GPU in software Intel SPMD Program Compiler (ISPC) extends a C-based language with “single program, multiple data” (SPMD) constructs An ISPC program describes the behavior of a single program instance • even though a “gang” of them is in reality being executed • gang size is usually no more than 2-4x the native SIMD width of the machine For CUDA affectionados • ISPC Program is similar to a CUDA thread • An ISPC gang is similar to a CUDA warp 14
  • 15. Execution of a SPMD program with a gang size of 4 Observations: • diverging control flow reduces the utilization of vector instructions • vectorization adds masking overhead 15
  • 16. uniform variable is shared among program instances make function available to be called from application code foreach expresses a parallel computation each program instance has a private instance of a non-uniform variable (a.k.a. varying variable) ! export void simple(uniform float vin[], uniform float vout[], uniform int count) { foreach (index = 0 ... count) { float v = vin[index]; if (v 3.) v = v * v; else v = sqrt(v); vout[index] = v; } } simple.ispc, compiled with ispc #include stdio.h #include simple.h ! int main() { float vin[16], vout[16]; for (int i = 0; i 16; ++i) vin[i] = i; simple(vin, vout, 16); } for (int i = 0; i 16; ++i) printf(%d: simple(%f) = %fn, i, vin[i], vout[i]); main.c, compiled with GCC 16 ispc function is called like any other function from the C/C++ application
  • 17. foreach(k = 0 ... 6){ int i = k * 7; Prints [0, 7, 14, 21, 28, 35, ((42)), ((49))] print(%n, i); double* dR double* dA = P[i]; = P[i+3]; ... } gang size of 8 0 7 14 21 28 35 (42) (49) Inactive Program Instances 17
  • 18. export void foo(uniform float * uniform A, uniform int n){ foreach(i = 0 ... n){ A[i*8] *= A[i*8]; } } 18
  • 19. ISPC vs GCC DGEMM ISPC vs GCC SGEMM 8 7.6 7 inline void mxm(uniform float * uniform A, uniform float * uniform B, uniform float * uniform C, uniform int M, uniform int N, uniform int K, uniform int nmat, int idx) { for(uniform int i = 0; i M; i++){ for(uniform int j = 0; j N; j++){ float sum = 0; for(uniform int k = 0; k K; k++){ sum += A[i*K*nmat + k*nmat + idx] * B[k*N*nmat + j*nmat + idx]; } 5 C[i*N*nmat + j*nmat + idx] = sum; } 4 3 1 0 } 3.7 } ! export void gemm(uniform float * uniform A, uniform float * uniform B, uniform float * uniform C, uniform int M, uniform int N, uniform int K, uniform int nmat) { foreach(i = 0 ... nmat){ mxm(A, B, C, M, N, K, nmat, i); } } xGEMM 5x5 speedup over 1000 matrices (GCC 4.8 -O3, Ivy Bridge) 19
  • 20. scalar ISPC 8 export void startFilter(uniform KalmanFilter * uniform filter, uniform KalmanFilterParameter * uniform param){ foreach(i = 0 ... filter-ntracks){ filterTrack(filter, param, i); } } ! 7 6 5 5.3 4 inline void filterTrack(uniform KalmanFilter * uniform filter, uniform KalmanFilterParameter * uniform param, int i){ ... for(uniform int h = 0; h param-max_hit_count; h++){ if(h = param-hit_count[i]) continue; predictState(filter, param, h, i); predictCovariance(filter, param, h, i); if(param-hits[h].is2Dim[i]){ ... correctGain2D(filter, i); correctState2D(filter, i); correctCovariance2D(filter, i); }else{ ... correctGain1D(filter, i); correctState1D(filter, i); correctCovariance1D(filter, i); } 3 2 1 1 0 Toy KalmanFilter speedup (double), Ivy Bridge } ... } same technique can be used for Multi-Track Fitter 
 (see S. Fleischmann presentation) 20
  • 21. What did we learn Use vectorized math libraries like MKL or Eigen wherever you can On Xeons use ISPC to vectorize your code • compatible across different compilers, architectures and operating systems On Xeons/Xeons PHI use SIMD Directives and Cilk Plus to vectorize your code • if you can use the ICC compiler • if you have “nice” loop nests (HPC-code like) 21