CUDA Array Multiplication
This work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License
Computer Graphics
cudaArrayMult.pptx
mjb – March 27, 2021
Anatomy of the CUDA arrayMult Program: #defines, #includes, and Globals
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include "helper_functions.h"
#include "helper_cuda.h"

#ifndef THREADS_PER_BLOCK
#define THREADS_PER_BLOCK	128			// number of threads in each block
#endif

#ifndef DATASET_SIZE
#define DATASET_SIZE		( 8*1024*1024 )		// size of the array
#endif						// WARNING: DON'T CALL THIS "ARRAYSIZE" !

float hA[ DATASET_SIZE ];
float hB[ DATASET_SIZE ];
float hC[ DATASET_SIZE ];

The defined constant ARRAYSIZE is already used in one of the CUDA .h files, which is why the array-size constant here is called DATASET_SIZE instead.
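Because both #defines are wrapped in #ifndef guards, their values can be overridden on the nvcc command line, which is exactly what the loop script at the end of these notes does for THREADS_PER_BLOCK. Overriding DATASET_SIZE the same way is shown here only as an illustration:

	nvcc -DTHREADS_PER_BLOCK=64 -DDATASET_SIZE='(1024*1024)' -o arrayMul arrayMul.cu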
Anatomy of a CUDA Program: Error-Checking
void
CudaCheckError( )
{
	cudaError_t e = cudaGetLastError( );
	if( e != cudaSuccess )
		fprintf( stderr, "CUDA failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e) );
}
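As written, __FILE__ and __LINE__ expand inside CudaCheckError( ) itself, so every report points at the same line of the .cu file. A common variant (a sketch, not part of these slides) is to make the check a macro instead, so that the caller's file and line number get reported:

	#define CUDA_CHECK_ERROR( )	\
	{	\
		cudaError_t e = cudaGetLastError( );	\
		if( e != cudaSuccess )	\
			fprintf( stderr, "CUDA failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e) );	\
	}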
Anatomy of a CUDA Program: The Kernel Function
// array multiplication on the device: C = A * B

__global__
void
ArrayMul( float *dA, float *dB, float *dC )
{
	int gid = blockIdx.x*blockDim.x + threadIdx.x;
	if( gid < DATASET_SIZE )
		dC[gid] = dA[gid] * dB[gid];
}

Note: "__" is 2 underscore characters
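For example, with THREADS_PER_BLOCK = 128 (so blockDim.x = 128), the thread with blockIdx.x = 3 and threadIdx.x = 10 computes gid = 3*128 + 10 = 394 and multiplies exactly one pair of elements, dA[394]*dB[394]. The if( gid < DATASET_SIZE ) test protects against any extra threads in a partially-full last block.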
Anatomy of a CUDA Program: Setting Up the Memory for the Arrays
// fill host memory:

for( int i = 0; i < DATASET_SIZE; i++ )
{
	hA[ i ] = hB[ i ] = sqrtf( (float)i );
}

// allocate device memory:

float *dA, *dB, *dC;

cudaMalloc( (void **)(&dA), sizeof(hA) );
cudaMalloc( (void **)(&dB), sizeof(hB) );
cudaMalloc( (void **)(&dC), sizeof(hC) );
CudaCheckError( );
Assign values into host (CPU) memory
Allocate storage in device (GPU) memory
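Because hA, hB, and hC are global arrays of DATASET_SIZE floats, sizeof(hA) here evaluates to 8*1024*1024 elements * 4 bytes = 32 MB, so each cudaMalloc( ) call reserves 32 MB of device memory (96 MB total for the three arrays).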
Anatomy of a CUDA Program: Copying the Arrays from the Host to the Device
// copy host memory to the device:

cudaMemcpy( dA, hA, DATASET_SIZE*sizeof(float), cudaMemcpyHostToDevice );
cudaMemcpy( dB, hB, DATASET_SIZE*sizeof(float), cudaMemcpyHostToDevice );
CudaCheckError( );
cudaMemcpyHostToDevice is a defined constant in one of the CUDA .h files.
In cudaMemcpy( ), it's always the second argument getting copied to the first!
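In other words, the argument order is:

	cudaMemcpy( destinationPointer, sourcePointer, numberOfBytes, direction );

so a host-to-device copy puts the device pointer first and the host pointer second.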
Anatomy of a CUDA Program: Getting Ready to Execute
// setup the execution parameters:

dim3 grid( DATASET_SIZE / THREADS_PER_BLOCK, 1, 1 );
dim3 threads( THREADS_PER_BLOCK, 1, 1 );

// create and start the timer:
cudaDeviceSynchronize( );

// allocate the events that we'll use for timing:
cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
CudaCheckError( );

// record the start event:
cudaEventRecord( start, NULL );
CudaCheckError( );
Grid size and block size: grid sets the grid size (the number of blocks), and threads sets the block size (the number of threads per block).
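DATASET_SIZE happens to be an exact multiple of THREADS_PER_BLOCK here. A minimal sketch (not in the original code) of how the grid size could be rounded up when it is not; the if( gid < DATASET_SIZE ) test in the kernel then keeps the extra threads in the last block from writing past the end of the arrays:

	// round up so that a partially-full last block still gets launched:
	int numBlocks = ( DATASET_SIZE + THREADS_PER_BLOCK - 1 ) / THREADS_PER_BLOCK;
	dim3 grid( numBlocks, 1, 1 );
	dim3 threads( THREADS_PER_BLOCK, 1, 1 );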
Anatomy of a CUDA Program: Executing the Kernel
// execute the kernel:

ArrayMul<<< grid, threads >>>( dA, dB, dC );

In the <<< grid, threads >>> launch configuration, grid is the number of blocks and threads is the number of threads per block; ( dA, dB, dC ) are the ordinary function call arguments.
The call to ArrayMul( ) returns immediately! If you copy the resulting array (dC) back to the host right away, it will have garbage in it. To block until the kernel is finished, call:

cudaDeviceSynchronize( );
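Putting those pieces together, a typical launch-and-wait sequence (a sketch assembled from the calls already shown in these notes) looks like:

	ArrayMul<<< grid, threads >>>( dA, dB, dC );	// returns immediately
	CudaCheckError( );				// catches launch-configuration errors
	cudaDeviceSynchronize( );			// blocks until the kernel has finished
	CudaCheckError( );				// catches errors raised while the kernel ran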
Anatomy of a CUDA Program: Getting the Stop Time and Printing Performance
// record the stop event:
cudaEventRecord( stop, NULL );
CudaCheckError( );

// wait for the stop event to complete:
cudaEventSynchronize( stop );
CudaCheckError( );

float msecTotal;
cudaEventElapsedTime( &msecTotal, start, stop );
CudaCheckError( );
// compute and print the performance
double secondsTotal = 0.001 * (double)msecTotal;
double multsPerSecond = (double)DATASET_SIZE / secondsTotal;
double megaMultsPerSecond = multsPerSecond / 1000000.;
fprintf( stderr, "%12d\t%4d\t%10.2lf\n", DATASET_SIZE, THREADS_PER_BLOCK, megaMultsPerSecond );
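As a sanity check on the units: in the single run shown later, DATASET_SIZE = 8,388,608 multiplications at 16,169.75 MegaMults/Second implies a kernel time of roughly 8,388,608 / 16,169,750,000 ≈ 0.00052 seconds, i.e. about half a millisecond.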
Anatomy of a CUDA Program: Copying the Array from the Device to the Host
// copy result from the device to the host:

cudaMemcpy( hC, dC, sizeof(hC), cudaMemcpyDeviceToHost );
CudaCheckError( );

// clean up:

cudaFree( dA );
cudaFree( dB );
cudaFree( dC );
CudaCheckError( );
cudaMemcpyDeviceToHost is a defined constant in one of the CUDA .h files.
In cudaMemcpy( ), it's always the second argument getting copied to the first!
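A minimal verification step (not part of the original slides) could recompute each product on the CPU and compare it with what came back from the GPU; this sketch assumes <math.h> has been included for fabsf( ):

	// verify the device results against a host-side computation:
	int numErrors = 0;
	for( int i = 0; i < DATASET_SIZE; i++ )
	{
		float expected = hA[ i ] * hB[ i ];
		if( fabsf( hC[ i ] - expected ) > 0.00001f )
			numErrors++;
	}
	fprintf( stderr, "Number of mismatches = %d\n", numErrors );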
Anatomy of a CUDA Program: Running the Program
rabbit 139% cat Makefile
CUDA_PATH     =  /usr/local/apps/cuda/cuda-10.1
CUDA_BIN_PATH =  $(CUDA_PATH)/bin
CUDA_NVCC     =  $(CUDA_BIN_PATH)/nvcc

arrayMul:	arrayMul.cu
		$(CUDA_NVCC) -o arrayMul arrayMul.cu

rabbit 140% make arrayMul
/usr/local/apps/cuda/cuda-10.1/bin/nvcc -o arrayMul arrayMul.cu

rabbit 141% ./arrayMul
     8388608	 128	  16169.75
Anatomy of a CUDA Program: Running the Program within a Loop
rabbit 142% cat loop.csh
#!/bin/csh
foreach t ( 32 64 128 256 )
	/usr/local/apps/cuda/cuda-10.1/bin/nvcc -DTHREADS_PER_BLOCK=$t -o arrayMul arrayMul.cu
	./arrayMul
end

rabbit 143% loop.csh
     8388608	  32	   9204.82
     8388608	  64	  13363.10
     8388608	 128	  16576.70
     8388608	 256	  15496.81
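In this particular run, performance peaks at 128 threads per block (16,576.70 MegaMults/Second) and drops off slightly at 256.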