CUDA Matrix Multiplication
Mike Bailey
Computer Graphics
cudaMatrixMult.pptx
mjb – May 4, 2021

This work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License


Anatomy of the CUDA matrixMult Program: #defines, #includes, and Globals

#include <stdio.h>
#include <cuda_runtime.h>

// NUMT and MATRIX_SIZE can be overridden from the compile line (-DNUMT=..., -DMATRIX_SIZE=...):
#ifndef NUMT
#define NUMT
#endif

#ifndef MATRIX_SIZE
#define MATRIX_SIZE
#endif

#define AROWS        MATRIX_SIZE
#define ACOLS        MATRIX_SIZE
#define BROWS        MATRIX_SIZE
#define BCOLS        MATRIX_SIZE
#define ACOLSBROWS   ACOLS        // ACOLS and BROWS better be the same!
#define CROWS        AROWS
#define CCOLS        BCOLS
float hA[AROWS][ACOLS];
float hB[BROWS][BCOLS];
float hC[CROWS][CCOLS];
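
The slides never show how hA and hB get their values before being copied to the device. A minimal sketch of one way to do it (the InitHostMatrices( ) helper and the constant test values are assumptions, not part of the original program):

// Hypothetical helper (not in the original program): fill the host
// matrices with a simple test pattern before copying them to the device.
void
InitHostMatrices( )
{
    for( int i = 0; i < AROWS; i++ )
        for( int j = 0; j < ACOLS; j++ )
            hA[i][j] = 1.f;

    for( int i = 0; i < BROWS; i++ )
        for( int j = 0; j < BCOLS; j++ )
            hB[i][j] = 2.f;
}

With constant test values like these, every element of the product should come out to ACOLSBROWS * 1.f * 2.f, which makes the copied-back hC easy to verify.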
Anatomy of a CUDA Program: Error-Checking

void CudaCheckError( )
{
    cudaError_t e = cudaGetLastError( );
    if( e != cudaSuccess )
        fprintf( stderr, "CUDA failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e) );
}
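
One subtlety: because __FILE__ and __LINE__ are expanded inside the body of CudaCheckError( ), the message always reports the file and line of the function definition, not of the CUDA call that actually failed. A sketch of a macro-based variant (an assumption, not something these slides use) that expands at the call site:

// Hypothetical macro variant: the expansion happens where the macro is
// used, so __FILE__ and __LINE__ identify the offending call site.
#define CUDA_CHECK_ERROR( )                                               \
    do                                                                    \
    {                                                                     \
        cudaError_t e = cudaGetLastError( );                              \
        if( e != cudaSuccess )                                            \
            fprintf( stderr, "CUDA failure %s:%d: '%s'\n",                \
                __FILE__, __LINE__, cudaGetErrorString( e ) );            \
    } while( 0 )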
Anatomy of a CUDA Program: The Kernel Function

__global__ void MatrixMul( float *A, float *B, float *C )
{
    // [A] is AROWS x ACOLS
    // [B] is BROWS x BCOLS
    // [C] is CROWS x CCOLS = AROWS x BCOLS

    int blockNum = blockIdx.y*gridDim.x + blockIdx.x;
    int blockThreads = blockNum*blockDim.x*blockDim.y;
    int gid = blockThreads + threadIdx.y*blockDim.x + threadIdx.x;

    int crow = gid / CCOLS;
    int ccol = gid % CCOLS;

    int aindex = crow * ACOLS;           // a[i][0]
    int bindex = ccol;                   // b[0][j]
    int cindex = crow * CCOLS + ccol;    // c[i][j]

    float cij = 0.;
    for( int k = 0; k < ACOLSBROWS; k++ )
    {
        cij += A[aindex] * B[bindex];
        aindex++;
        bindex += BCOLS;
    }
    C[cindex] = cij;
    // __syncthreads( );
}
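
To see what the index arithmetic does, here is a small host-side walk-through of one thread (the values 1024 and gid = 2050 are chosen purely for illustration):

// Stand-alone illustration of the kernel's index arithmetic (illustrative values only):
#include <stdio.h>

#define ILL_ACOLS    1024
#define ILL_CCOLS    1024

int
main( )
{
    int gid = 2050;                          // pretend global thread number
    int crow = gid / ILL_CCOLS;              // = 2    -> this thread owns row 2 of [C]
    int ccol = gid % ILL_CCOLS;              // = 2    -> ... and column 2 of [C]
    int aindex = crow * ILL_ACOLS;           // = 2048 -> start of row 2 of [A]
    int bindex = ccol;                       // = 2    -> top of column 2 of [B]
    int cindex = crow * ILL_CCOLS + ccol;    // = 2050 -> flattened c[2][2]
    printf( "crow=%d ccol=%d aindex=%d bindex=%d cindex=%d\n",
        crow, ccol, aindex, bindex, cindex );
    // In the kernel loop, aindex then advances by 1 (across the row of [A])
    // and bindex advances by BCOLS (down the column of [B]).
    return 0;
}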
Anatomy of a CUDA Program: Setting Up the Memory for the Matrices

// allocate device memory:

float *dA, *dB, *dC;

cudaMalloc( (void **)(&dA), sizeof(hA) );
cudaMalloc( (void **)(&dB), sizeof(hB) );
cudaMalloc( (void **)(&dC), sizeof(hC) );
CudaCheckError( );

// copy host memory to device memory:

cudaMemcpy( dA, hA, sizeof(hA), cudaMemcpyHostToDevice );
cudaMemcpy( dB, hB, sizeof(hB), cudaMemcpyHostToDevice );

cudaMemcpyHostToDevice is a defined constant in one of the CUDA .h files.
In cudaMemcpy( ), it's always the second argument getting copied to the first!
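
Note that sizeof(hA) gives the full byte count of the matrix only because hA is declared as a fixed-size global array; if the host matrices were heap-allocated through pointers, the sizes would have to be spelled out. A sketch of the equivalent explicit computation (same #defines as above; the names aBytes, bBytes, and cBytes are assumptions):

// Equivalent explicit byte counts: this is what sizeof(hA), sizeof(hB),
// and sizeof(hC) already evaluate to for the statically-declared arrays.
size_t aBytes = (size_t)AROWS * (size_t)ACOLS * sizeof(float);
size_t bBytes = (size_t)BROWS * (size_t)BCOLS * sizeof(float);
size_t cBytes = (size_t)CROWS * (size_t)CCOLS * sizeof(float);

cudaMalloc( (void **)(&dA), aBytes );
cudaMalloc( (void **)(&dB), bBytes );
cudaMalloc( (void **)(&dC), cBytes );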
Anatomy of a CUDA Program: Getting Ready to Execute

// setup execution parameters:

dim3 threads( NUMT, NUMT, 1 );
if( threads.x > CROWS )
    threads.x = CROWS;
if( threads.y > CCOLS )
    threads.y = CCOLS;

dim3 grid( CROWS / threads.x, CCOLS / threads.y );

// create cuda events for timing:

cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
CudaCheckError( );

// record the start event:

cudaEventRecord( start, NULL );
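
As a worked example (MATRIX_SIZE = 1024 and NUMT = 16 are assumed values, not from the slides): threads becomes ( 16, 16, 1 ), i.e. 256 threads per block, and grid becomes ( 64, 64 ), i.e. 4096 blocks, giving 4096 × 256 = 1,048,576 threads, exactly one per element of [C]. The integer divisions cover the whole matrix only when CROWS and CCOLS are multiples of threads.x and threads.y; a common round-up variant (a sketch, not what these slides do) looks like:

// Round the grid up so a partial block covers any leftover rows/columns;
// the kernel would then also need a guard such as:  if( gid >= CROWS*CCOLS ) return;
dim3 gridUp( ( CROWS + threads.x - 1 ) / threads.x,
             ( CCOLS + threads.y - 1 ) / threads.y );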
Anatomy of a CUDA Program: Executing the Kernel

// execute the kernel:

MatrixMul<<< grid, threads >>>( dA, dB, dC );

Here, grid is the number of blocks, threads is the number of threads per block, and ( dA, dB, dC ) are the function call arguments.

•  The call to MatrixMul( ) returns immediately!
•  If you copy the resulting array (dC) back right away, it could still have garbage in it.
•  To block until the kernel is finished, call:

    cudaDeviceSynchronize( );
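
A common pattern (a sketch; the slides' own CudaCheckError( ) can be used this way) is to check once right after the launch and once after synchronizing, since the two checks catch different kinds of errors:

MatrixMul<<< grid, threads >>>( dA, dB, dC );
CudaCheckError( );           // catches bad launch configurations (grid/block sizes, etc.)

cudaDeviceSynchronize( );    // block until the kernel has actually finished
CudaCheckError( );           // catches errors raised while the kernel was running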
Anatomy of a CUDA Program: Getting the Stop Time and Printing Performance

cudaDeviceSynchronize( );

// record the stop event:
cudaEventRecord( stop, NULL );

// wait for the stop event to complete:
cudaEventSynchronize( stop );

float msecTotal;                              // note: this is in milliseconds
cudaEventElapsedTime( &msecTotal, start, stop );

// performance in multiplies per second:
double secondsTotal = msecTotal / 1000.0;     // change it to seconds
double multipliesTotal = (double)CROWS * (double)CCOLS * (double)ACOLSBROWS;
double gigaMultipliesPerSecond = ( multipliesTotal / 1000000000. ) / secondsTotal;
fprintf( stderr, "%6d\t%6d\t%10.3lf\n", CROWS, CCOLS, gigaMultipliesPerSecond );
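
For a sense of scale (MATRIX_SIZE = 1024 is an assumed value): multipliesTotal = 1024 × 1024 × 1024 ≈ 1.07 × 10⁹, so if the kernel took 10 milliseconds the program would print roughly 107 GigaMultiplies per second.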
Anatomy of a CUDA Program: Copying the Matrix from the Device back to the Host

cudaMemcpy( hC, dC, sizeof(hC), cudaMemcpyDeviceToHost );
CudaCheckError( );

// clean up:

cudaFree( dA );
cudaFree( dB );
cudaFree( dC );
CudaCheckError( );

cudaMemcpyDeviceToHost is a defined constant in one of the CUDA .h files.
In cudaMemcpy( ), it's always the second argument getting copied to the first!
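
Two small follow-ups that the slides do not show (both are sketches): the timing events created earlier can be released, and the copied-back matrix can be spot-checked on the host.

// release the timing events created in the setup step:
cudaEventDestroy( start );
cudaEventDestroy( stop );

// quick host-side sanity check of one element of the result:
fprintf( stderr, "hC[0][0] = %10.2f\n", hC[0][0] );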