EEEE4115 Advanced Computational Engineering
Introduction to GPU Programming with CUDA®
NVIDIA’s CUDA: A Recap
CUDA Programming Model
▪ The GPU is seen as a ‘compute device’ used to execute all or part of an application that:
  ▪ Can be isolated as a function
  ▪ Has to be executed many times
  ▪ Is highly data parallel
  ▪ Is lightweight
▪ The function, when compiled to run on the ‘device’, is called a ‘kernel’
NVIDIA’s CUDA: A Recap
CUDA Programming Model
▪ Allocate memory on the device
  ▪ Allocated by cudaMalloc()
▪ Copy input arrays from host to device memory
  ▪ cudaMemcpy()
▪ Execute kernel(s) on the device
  ▪ kernel <<< number of blocks, threads per block, shared memory size >>>(parameters);
▪ Copy output arrays from device to host
  ▪ cudaMemcpy()
(a sketch combining these steps follows below)
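A minimal host-side sketch of the four steps; the buffer names h_data and d_data, the element count N, and the kernel myKernel are illustrative assumptions, not part of the recap:

float *h_data = (float*)malloc(N * sizeof(float));  // host input, assumed initialised elsewhere
float *d_data;
cudaMalloc((void**)&d_data, N * sizeof(float));                         // 1. allocate memory on device
cudaMemcpy(d_data, h_data, N * sizeof(float), cudaMemcpyHostToDevice);  // 2. copy input host -> device
int threadsPerBlock = 128;
int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;            // round up to cover all N elements
myKernel<<<numBlocks, threadsPerBlock>>>(d_data, N);                    // 3. execute kernel on device
cudaMemcpy(h_data, d_data, N * sizeof(float), cudaMemcpyDeviceToHost);  // 4. copy output device -> host
cudaFree(d_data);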
NVIDIA’s CUDA: A Recap
Thread/Block Model
▪ Each thread has a unique index within its block (threadIdx).
▪ Each block has a unique index within the grid (blockIdx).
▪ These indices can be combined to index data in arrays.
NVIDIA’s CUDA: A Recap
Thread/Block Model
int data[10] = {2, 4, 6, 8, 10, 12, 14, 16, 18, 20}; creates:

data:  index  0  1  2  3   4   5   6   7   8   9
       value  2  4  6  8  10  12  14  16  18  20

To select the ith element:

kernel <<<1,10>>>:  int i = threadIdx.x;
                    do something with data[i];
kernel <<<10,1>>>:  int i = blockIdx.x;
                    do something with data[i];
kernel <<<2,5>>>:   int i = blockDim.x * blockIdx.x + threadIdx.x;
                    do something with data[i];
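A runnable version of this example might look as follows; the kernel name useData and the doubling operation stand in for “do something with data[i]” and are illustrative only.

__global__ void useData(int *data, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;  // global index, valid for <<<2,5>>>
    if (i < n)
        data[i] *= 2;  // stands in for 'do something with data[i]'
}

int main(void)
{
    int data[10] = {2, 4, 6, 8, 10, 12, 14, 16, 18, 20};
    int *dev_data;
    cudaMalloc((void**)&dev_data, 10 * sizeof(int));
    cudaMemcpy(dev_data, data, 10 * sizeof(int), cudaMemcpyHostToDevice);
    useData<<<2,5>>>(dev_data, 10);  // 2 blocks of 5 threads = 10 threads, one per element
    cudaMemcpy(data, dev_data, 10 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_data);
    return 0;
}

The same kernel also works for <<<1,10>>> and <<<10,1>>>, since the global index formula reduces to threadIdx.x or blockIdx.x in those cases.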
NVIDIA’s CUDA: A Recap
a[] = {1,2,3,4,5,6,7,8,9,…} (number of elements = int length)

a: 1 2 3 4 5 6 7 …

CPU code to access the elements of a[]:

void foo(int *a, int *b) {
    for (int i = 0; i < length; i++) {
        // do something with a[i] (loop body not shown on the slide)
    }
}
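On the GPU the loop disappears: each element is handled by its own thread. A sketch of the equivalent kernel, assuming for illustration that the loop body writes a result into b[i]:

__global__ void foo(int *a, int *b, int length)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;  // one thread per element
    if (i < length)   // guard: some threads may fall beyond the end of the array
        b[i] = a[i];  // placeholder for 'do something with a[i]'
}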
1D TLM Algorithm
Host code: allocate memory on the device, copy memory areas from host to device, then launch the kernels for each time step
//allocate memory on device
cudaMalloc((void**)&dev_VR, N*sizeof(float));
cudaMalloc((void**)&dev_VL, N*sizeof(float));
cudaMalloc((void**)&dev_Vt, NT*sizeof(float));

//zero arrays ...

//copy memory areas from host to device
cudaMemcpy(dev_VR, VR, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_VL, VL, N*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_Vt, Vt, NT*sizeof(float), cudaMemcpyHostToDevice);

//run TLM algorithm for NT time steps
for (int m = 0; m < NT; m++) {
    float source = tlmSource(m*dt, delay, width);
    tlmScatter<<<1,N>>>(dev_VR, dev_VL, N, m, source);
    tlmConnect<<<1,N>>>(dev_VR, dev_VL, dev_Vt, N, m);
}
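The slides omit error handling for brevity, but cudaMalloc, cudaMemcpy and kernel launches can all fail; a common pattern is to wrap API calls in a checking macro. The macro name CHECK here is an illustrative choice, not part of the course code:

#include <stdio.h>
#include <stdlib.h>

#define CHECK(call)                                               \
    do {                                                          \
        cudaError_t err = (call);                                 \
        if (err != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",          \
                    cudaGetErrorString(err), __FILE__, __LINE__); \
            exit(EXIT_FAILURE);                                   \
        }                                                         \
    } while (0)

//e.g. CHECK(cudaMalloc((void**)&dev_VR, N*sizeof(float)));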
Launch kernels on device
Note: for generality, scatter and connect are launched as two separate kernels; successive launches in the same stream execute in order, so the scatter completes before the connect begins.
If a single block of threads is used, both routines could be placed in the same kernel (a sketch follows below).
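A sketch of such a fused kernel, using __syncthreads() so that every thread finishes the scatter before any thread starts the connect; the name tlmScatterConnect is illustrative, and the boundary and output code from tlmConnect is omitted here:

__global__ void tlmScatterConnect(float* VR, float* VL, int N, int m, float source)
{
    unsigned int idx = threadIdx.x;
    if (idx == 0)
        VL[0] += source;  // apply source
    if (idx < N) {        // scatter
        float V = VL[idx] + VR[idx];
        VR[idx] = V - VR[idx];
        VL[idx] = V - VL[idx];
    }
    __syncthreads();           // all scatters complete before any connect begins
    if (idx > 0 && idx < N) {  // connect
        float V = VR[idx-1];
        VR[idx-1] = VL[idx];
        VL[idx] = V;
    }
    //boundaries and output omitted; see tlmConnect below
}

Note that __syncthreads() only synchronises threads within one block, which is why this fusion is safe only for a single-block launch.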
1D TLM Algorithm
__global__ void tlmScatter(float* VR, float* VL, int N, int m, float source)
{
    unsigned int idx = threadIdx.x;
    //apply source
    if (idx == 0)
        VL[0] += source;
    //scatter
    if (idx < N) {
        float V = VL[idx] + VR[idx];
        VR[idx] = V - VR[idx];
        VL[idx] = V - VL[idx];
    }
}
* Kernel launches are asynchronous: the calls for each time step m are queued on the device while the host loop continues.
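Because the launches are queued asynchronously, the host must eventually wait for them; a blocking cudaMemcpy does this implicitly. A sketch of the final step after the time-stepping loop (not shown on the slides):

//copy the output time series back to the host; this blocks until all queued kernels finish
cudaMemcpy(Vt, dev_Vt, NT*sizeof(float), cudaMemcpyDeviceToHost);

//release device memory
cudaFree(dev_VR);
cudaFree(dev_VL);
cudaFree(dev_Vt);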
1D TLM Algorithm
__global__ void tlmConnect(float* VR, float* VL, float* Vt, int N, int m)
{
    unsigned int idx = threadIdx.x;
    //connect: exchange pulses across each section boundary
    if (idx > 0 && idx < N) {
        float V = VR[idx-1];
        VR[idx-1] = VL[idx];
        VL[idx] = V;
    }
    //apply boundaries
    if (idx == 0) {
        VR[N-1] *= -1.f;        // reflect at the right-hand end (reflection coefficient -1)
        VL[0] = 0.f;            // zero the incident pulse at the left-hand end
        Vt[m] = VL[2] + VR[2];  // record the output voltage at node 2
    }
}
1D TLM Algorithm
__global__ void tlmScatter(float* VR, float* VL, int N, int m, float source)
{
    unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;  // global thread index across all blocks
    //apply source
    if (idx == 0)  // only one thread applies the source
        VL[0] += source;
    //scatter
    if (idx < N) {
        float V = VL[idx] + VR[idx];
        VR[idx] = V - VR[idx];
        VL[idx] = V - VL[idx];
    }
}
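With the global index in place, the kernels can be launched with more than one block, so N is no longer limited by the maximum threads per block. A sketch of the matching launch; threadsPerBlock = 256 is an illustrative choice:

int threadsPerBlock = 256;
int numBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;  // round up so all N nodes are covered
tlmScatter<<<numBlocks, threadsPerBlock>>>(dev_VR, dev_VL, N, m, source);
tlmConnect<<<numBlocks, threadsPerBlock>>>(dev_VR, dev_VL, dev_Vt, N, m);

With multiple blocks the two-kernel split is essential: the boundary between the launches is the only grid-wide synchronisation point available here.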
1D TLM Algorithm
N sections (cells) are used to represent the line
CUDA Coursework 2
Porting the 1D TLM Algorithm to CUDA
CUDA Coursework 2
Coursework is worth 25% of the Module
▪ Full submission to include the full application code as a single .cu file and a brief report (approximately 10 pages) that includes:
▪ A benchmark of the provided 2D TLM code on your platform of choice (illustrative result and run time)
▪ A discussion that identifies the areas of the code that are data parallel
▪ A description of the scatter and connect functions as implemented on the GPU, together with the performance results obtained
▪ A discussion of any routes taken to optimise the application
▪ A benchmark of the GPU application (illustrative result, run time and speed-up compared to the CPU version)
▪ Coursework deadline 11th January 2021 at 15:00