4. SET UP THE PATH so your compiler knows where to find the libraries
export PATH=/usr/local/cuda-5.0/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda-5.0/lib:/usr/local/cuda-5.0/lib64:$LD_LIBRARY_PATH
[YourAccount@John ~]$ ls -a
[YourAccount@John ~]$ vi .bash_profile
1. Open the bash profile
2. Add these lines to the file
5. CREATE THE MAKEFILE to configure your compilation for the source code
MAIN=filename
${MAIN}.e:
	nvcc ${MAIN}.cu -o ${MAIN}.e -m64 -arch sm_35 -lcublas -O3
Create a makefile something like this:
6. CHECK YOUR MEMORY so the GPU can actually store the data in computation
Global memory available on one card: 5GB.
8. CUBLAS
Declaration of Device Array
Allocation of Device Array
Declaration of Host Array
Copy Array : Host to Device
Copy Array : Device to Host
De-allocation of Device Array
Execution of CUBLAS
Initialization of CUBLAS
Assignment of CUDA Device
Termination of CUBLAS
#include <cuda_runtime.h>
#include <cublas_v2.h>
…
double* M; /* host array */
double* m; /* device array */
/* similar for V & v & A & a */
…
cudaSetDevice(0);
9. CUBLAS
Declaration of Device Array
Allocation of Device Array
Declaration of Host Array
Copy Array : Host to Device
Copy Array : Device to Host
De-allocation of Device Array
Execution of CUBLAS
Initialization of CUBLAS
Assignment of CUDA Device
Termination of CUBLAS
…
TS=sizeof(double);
size=n*n*TS;
cudaMalloc( (void**)&m, size );
/* similar for V & v & A & a */
…
cublasStatus_t status;
cublasHandle_t handle;
status=cublasCreate(&handle);
...
10. CUBLAS
Declaration of Device Array
Allocation of Device Array
Declaration of Host Array
Copy Array : Host to Device
Copy Array : Device to Host
De-allocation of Device Array
Execution of CUBLAS
Initialization of CUBLAS
Assignment of CUDA Device
Termination of CUBLAS
…
cublasSetVector(n*n,TS,M,1,m,1);
/* similar for V & v */
…
cublasDgemv( handle,
CUBLAS_OP_N,
n, n, &alpha, m, n, v, 1, &beta, a, 1
);
11. CUBLAS
Declaration of Device Array
Allocation of Device Array
Declaration of Host Array
Copy Array : Host to Device
Copy Array : Device to Host
De-allocation of Device Array
Execution of CUBLAS
Initialization of CUBLAS
Assignment of CUDA Device
Termination of CUBLAS
…
cublasGetVector(n,TS,a,1,A,1);
…
cublasDestroy(handle);
…
cudaFree(m);
/* similar for v & a */
20. #include <cuda_runtime.h>
#define IJToIdx(i,j,n) (j*n+i)
…
double* M; /* host array */
double* m; /* device array */
/* similar for V & v & A & a */
…
cudaSetDevice(0);
Direct Parallelization
Declaration of Device Array
Allocation of Device Array
Declaration of Host Array
Copy Array : Host to Device
Copy Array : Device to Host
De-allocation of Device Array
Execution of Kernel
Determination of Size for Grid & Block
Assignment of CUDA Device
Allocation of Device Array
Copy Array : Host to Device
Determination of Size for Grid & Block
21. Direct Parallelization
Declaration of Device Array
Allocation of Device Array
Declaration of Host Array
Copy Array : Host to Device
Copy Array : Device to Host
De-allocation of Device Array
Execution of Kernel
Determination of Size for Grid & Block
Assignment of CUDA Device
Allocation of Device Array
Copy Array : Host to Device
Determination of Size for Grid & Block
…
TS=sizeof(double);
size=n*n*TS;
cudaMalloc( (void**)&m, size );
/* similar for V & v & A & a */
…
cudaMemcpy(m, M, size,
cudaMemcpyHostToDevice);
/* similar for V & v */
...
22. Direct Parallelization
Declaration of Device Array
Allocation of Device Array
Declaration of Host Array
Copy Array : Host to Device
Copy Array : Device to Host
De-allocation of Device Array
Execution of Kernel
Determination of Size for Grid & Block
Assignment of CUDA Device
Allocation of Device Array
Copy Array : Host to Device
Determination of Size for Grid & Block
1.Memory assessment
2.Memory alignment
3.Data flow
Use as many threads as possible:
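One thread computes one element a[i] as the product of row i of m with the vector v:
$$a_i = \begin{bmatrix} m_{i1} & \cdots & m_{in} \end{bmatrix} \begin{bmatrix} v_1 \\ \vdots \\ v_j \\ \vdots \\ v_n \end{bmatrix} = \sum_{j=1}^{n} m_{ij}\, v_j$$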
23. Direct Parallelization
Declaration of Device Array
Allocation of Device Array
Declaration of Host Array
Copy Array : Host to Device
Copy Array : Device to Host
De-allocation of Device Array
Execution of Kernel
Determination of Size for Grid & Block
Assignment of CUDA Device
Allocation of Device Array
Copy Array : Host to Device
Determination of Size for Grid & Block
…
My_Dgemv<<<n,1>>>( … );
…
__global__ void My_Dgemv( … ){
/* algorithm for MV */
};
24. Direct Parallelization
Declaration of Device Array
Allocation of Device Array
Declaration of Host Array
Copy Array : Host to Device
Copy Array : Device to Host
De-allocation of Device Array
Execution of Kernel
Determination of Size for Grid & Block
Assignment of CUDA Device
Allocation of Device Array
Copy Array : Host to Device
Determination of Size for Grid & Block
__global__ void My_Dgemv( … ){
…
id=blockIdx.x;
i=id;
a[i]=0;
for(j=0;j<n;j++){
a[i]=a[i]+m[ IJToIdx(i,j,n) ]*v[j];
}
}
25. Direct Parallelization
Declaration of Device Array
Allocation of Device Array
Declaration of Host Array
Copy Array : Host to Device
Copy Array : Device to Host
De-allocation of Device Array
Execution of Kernel
Determination of Size for Grid & Block
Assignment of CUDA Device
Allocation of Device Array
Copy Array : Host to Device
Determination of Size for Grid & Block
…
cudaMemcpy(M, m, size,
cudaMemcpyDeviceToHost);
/* similar for A & a */
...
cudaFree(m);
/* similar for v & a */
…