CUDA Matrix Multiplication
Mike Bailey
Computer Graphics
cudaMatrixMult.pptx
mjb – May 4, 2021

This work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License


Anatomy of the CUDA matrixMult Program: #defines, #includes, and Globals

#include <stdio.h>
#include <cuda_runtime.h>

// NUMT and MATRIX_SIZE can be overridden from the compile line (-DNUMT=..., -DMATRIX_SIZE=...):
#ifndef NUMT
#define NUMT
#endif

#ifndef MATRIX_SIZE
#define MATRIX_SIZE
#endif

#define AROWS        MATRIX_SIZE
#define ACOLS        MATRIX_SIZE
#define BROWS        MATRIX_SIZE
#define BCOLS        MATRIX_SIZE
#define ACOLSBROWS   ACOLS        // ACOLS and BROWS better be the same!
#define CROWS        AROWS
#define CCOLS        BCOLS
float hA[AROWS][ACOLS];
float hB[BROWS][BCOLS];
float hC[CROWS][CCOLS];
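
The slides never show how hA and hB get their values before being copied to the device. A minimal sketch of one way to do it (the InitHostMatrices( ) helper and the constant test values are assumptions, not part of the original program):

// Hypothetical helper (not in the original program): fill the host
// matrices with a simple test pattern before copying them to the device.
void
InitHostMatrices( )
{
    for( int i = 0; i < AROWS; i++ )
        for( int j = 0; j < ACOLS; j++ )
            hA[i][j] = 1.f;

    for( int i = 0; i < BROWS; i++ )
        for( int j = 0; j < BCOLS; j++ )
            hB[i][j] = 2.f;
}

With constant test values like these, every element of the product should come out to ACOLSBROWS * 1.f * 2.f, which makes the copied-back hC easy to verify.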
Anatomy of a CUDA Program: Error-Checking

void CudaCheckError( )
{
    cudaError_t e = cudaGetLastError( );
    if( e != cudaSuccess )
        fprintf( stderr, "CUDA failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e) );
}
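
One subtlety: because __FILE__ and __LINE__ are expanded inside the body of CudaCheckError( ), the message always reports the file and line of the function definition, not of the CUDA call that actually failed. A sketch of a macro-based variant (an assumption, not something these slides use) that expands at the call site:

// Hypothetical macro variant: the expansion happens where the macro is
// used, so __FILE__ and __LINE__ identify the offending call site.
#define CUDA_CHECK_ERROR( )                                               \
    do                                                                    \
    {                                                                     \
        cudaError_t e = cudaGetLastError( );                              \
        if( e != cudaSuccess )                                            \
            fprintf( stderr, "CUDA failure %s:%d: '%s'\n",                \
                __FILE__, __LINE__, cudaGetErrorString( e ) );            \
    } while( 0 )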
Anatomy of a CUDA Program: The Kernel Function

__global__ void MatrixMul( float *A, float *B, float *C )
{
    // [A] is AROWS x ACOLS
    // [B] is BROWS x BCOLS
    // [C] is CROWS x CCOLS = AROWS x BCOLS

    int blockNum = blockIdx.y*gridDim.x + blockIdx.x;
    int blockThreads = blockNum*blockDim.x*blockDim.y;
    int gid = blockThreads + threadIdx.y*blockDim.x + threadIdx.x;

    int crow = gid / CCOLS;
    int ccol = gid % CCOLS;

    int aindex = crow * ACOLS;           // a[i][0]
    int bindex = ccol;                   // b[0][j]
    int cindex = crow * CCOLS + ccol;    // c[i][j]

    float cij = 0.;
    for( int k = 0; k < ACOLSBROWS; k++ )
    {
        cij += A[aindex] * B[bindex];
        aindex++;
        bindex += BCOLS;
    }
    C[cindex] = cij;
    // __syncthreads( );
}
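
To see what the index arithmetic does, here is a small host-side walk-through of one thread (the values 1024 and gid = 2050 are chosen purely for illustration):

// Stand-alone illustration of the kernel's index arithmetic (illustrative values only):
#include <stdio.h>

#define ILL_ACOLS    1024
#define ILL_CCOLS    1024

int
main( )
{
    int gid = 2050;                          // pretend global thread number
    int crow = gid / ILL_CCOLS;              // = 2    -> this thread owns row 2 of [C]
    int ccol = gid % ILL_CCOLS;              // = 2    -> ... and column 2 of [C]
    int aindex = crow * ILL_ACOLS;           // = 2048 -> start of row 2 of [A]
    int bindex = ccol;                       // = 2    -> top of column 2 of [B]
    int cindex = crow * ILL_CCOLS + ccol;    // = 2050 -> flattened c[2][2]
    printf( "crow=%d ccol=%d aindex=%d bindex=%d cindex=%d\n",
        crow, ccol, aindex, bindex, cindex );
    // In the kernel loop, aindex then advances by 1 (across the row of [A])
    // and bindex advances by BCOLS (down the column of [B]).
    return 0;
}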
Anatomy of a CUDA Program: Setting Up the Memory for the Matrices

// allocate device memory:

float *dA, *dB, *dC;

cudaMalloc( (void **)(&dA), sizeof(hA) );
cudaMalloc( (void **)(&dB), sizeof(hB) );
cudaMalloc( (void **)(&dC), sizeof(hC) );
CudaCheckError( );

// copy host memory to device memory:

cudaMemcpy( dA, hA, sizeof(hA), cudaMemcpyHostToDevice );
cudaMemcpy( dB, hB, sizeof(hB), cudaMemcpyHostToDevice );

cudaMemcpyHostToDevice is a defined constant in one of the CUDA .h files.
In cudaMemcpy( ), it's always the second argument getting copied to the first!
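
Note that sizeof(hA) gives the full byte count of the matrix only because hA is declared as a fixed-size global array; if the host matrices were heap-allocated through pointers, the sizes would have to be spelled out. A sketch of the equivalent explicit computation (same #defines as above; the names aBytes, bBytes, and cBytes are assumptions):

// Equivalent explicit byte counts: this is what sizeof(hA), sizeof(hB),
// and sizeof(hC) already evaluate to for the statically-declared arrays.
size_t aBytes = (size_t)AROWS * (size_t)ACOLS * sizeof(float);
size_t bBytes = (size_t)BROWS * (size_t)BCOLS * sizeof(float);
size_t cBytes = (size_t)CROWS * (size_t)CCOLS * sizeof(float);

cudaMalloc( (void **)(&dA), aBytes );
cudaMalloc( (void **)(&dB), bBytes );
cudaMalloc( (void **)(&dC), cBytes );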
Anatomy of a CUDA Program: Getting Ready to Execute

// setup execution parameters:

dim3 threads( NUMT, NUMT, 1 );
if( threads.x > CROWS )
    threads.x = CROWS;
if( threads.y > CCOLS )
    threads.y = CCOLS;

dim3 grid( CROWS / threads.x, CCOLS / threads.y );

// create cuda events for timing:

cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
CudaCheckError( );

// record the start event:

cudaEventRecord( start, NULL );
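
As a worked example (MATRIX_SIZE = 1024 and NUMT = 16 are assumed values, not from the slides): threads becomes ( 16, 16, 1 ), i.e. 256 threads per block, and grid becomes ( 64, 64 ), i.e. 4096 blocks, giving 4096 × 256 = 1,048,576 threads, exactly one per element of [C]. The integer divisions cover the whole matrix only when CROWS and CCOLS are multiples of threads.x and threads.y; a common round-up variant (a sketch, not what these slides do) looks like:

// Round the grid up so a partial block covers any leftover rows/columns;
// the kernel would then also need a guard such as:  if( gid >= CROWS*CCOLS ) return;
dim3 gridUp( ( CROWS + threads.x - 1 ) / threads.x,
             ( CCOLS + threads.y - 1 ) / threads.y );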
Anatomy of a CUDA Program: Executing the Kernel

// execute the kernel:

MatrixMul<<< grid, threads >>>( dA, dB, dC );

Here, grid is the number of blocks, threads is the number of threads per block, and ( dA, dB, dC ) are the function call arguments.

•  The call to MatrixMul( ) returns immediately!
•  If you copy the resulting array (dC) back right away, it could still have garbage in it.
•  To block until the kernel is finished, call:

    cudaDeviceSynchronize( );
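
A common pattern (a sketch; the slides' own CudaCheckError( ) can be used this way) is to check once right after the launch and once after synchronizing, since the two checks catch different kinds of errors:

MatrixMul<<< grid, threads >>>( dA, dB, dC );
CudaCheckError( );           // catches bad launch configurations (grid/block sizes, etc.)

cudaDeviceSynchronize( );    // block until the kernel has actually finished
CudaCheckError( );           // catches errors raised while the kernel was running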
Anatomy of a CUDA Program: Getting the Stop Time and Printing Performance

cudaDeviceSynchronize( );

// record the stop event:
cudaEventRecord( stop, NULL );

// wait for the stop event to complete:
cudaEventSynchronize( stop );

float msecTotal;                              // note: this is in milliseconds
cudaEventElapsedTime( &msecTotal, start, stop );

// performance in multiplies per second:
double secondsTotal = msecTotal / 1000.0;     // change it to seconds
double multipliesTotal = (double)CROWS * (double)CCOLS * (double)ACOLSBROWS;
double gigaMultipliesPerSecond = ( multipliesTotal / 1000000000. ) / secondsTotal;
fprintf( stderr, "%6d\t%6d\t%10.3lf\n", CROWS, CCOLS, gigaMultipliesPerSecond );
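
For a sense of scale (MATRIX_SIZE = 1024 is an assumed value): multipliesTotal = 1024 × 1024 × 1024 ≈ 1.07 × 10⁹, so if the kernel took 10 milliseconds the program would print roughly 107 GigaMultiplies per second.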
Anatomy of a CUDA Program: Copying the Matrix from the Device back to the Host

cudaMemcpy( hC, dC, sizeof(hC), cudaMemcpyDeviceToHost );
CudaCheckError( );

// clean up:

cudaFree( dA );
cudaFree( dB );
cudaFree( dC );
CudaCheckError( );

cudaMemcpyDeviceToHost is a defined constant in one of the CUDA .h files.
In cudaMemcpy( ), it's always the second argument getting copied to the first!
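
Two small follow-ups that the slides do not show (both are sketches): the timing events created earlier can be released, and the copied-back matrix can be spot-checked on the host.

// release the timing events created in the setup step:
cudaEventDestroy( start );
cudaEventDestroy( stop );

// quick host-side sanity check of one element of the result:
fprintf( stderr, "hC[0][0] = %10.2f\n", hC[0][0] );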