cuda_demo.tar.gz
nvcc cuda_tutorial11.cu -o cuda_tutorial11.out
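# The other sources in this archive build the same way, e.g. (my addition,
# following the pattern of the command above):
#   nvcc m_add.cu -o m_add.out
#   nvcc v_add.cu -o v_add.out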
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
/*
This file can be downloaded from supercomputingblog.com.
This is part of a series of tutorials that demonstrate how to use CUDA
The tutorials will also demonstrate the speed of using CUDA
*/
// IMPORTANT NOTE: for this data size, your graphics card should have at least 512 megabytes of memory.
// If your GPU has less memory, then you will need to decrease this data size.
#define MAX_DATA_SIZE (1024*1024*32) // about 32 million elements.
// The max data size must be an integer multiple of 128*256, because each block will have 256 threads,
// and the block grid width will be 128. These are arbitrary numbers I chose.
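// Optional sketch (an addition, not part of the original tutorial): before
// committing to this data size, you can ask the runtime how much device
// memory is actually free. cudaMemGetInfo is a standard CUDA runtime call.
static int deviceHasRoomFor(size_t bytesNeeded)
{
	size_t freeBytes = 0, totalBytes = 0;
	if (cudaMemGetInfo(&freeBytes, &totalBytes) != cudaSuccess)
		return 0; // could not query the device; assume it does not fit
	return freeBytes >= bytesNeeded; // compare against the planned allocations
}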
void get_walltime(double* wcTime) {
struct timeval tp;
gettimeofday(&tp, NULL);
*wcTime = (double)(tp.tv_sec + tp.tv_usec/1000000.0);
}
double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end - d_start);
}
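/* Timing sketch (an addition): CUDA events are an alternative to
   gettimeofday for timing device work; cudaEventElapsedTime reports
   milliseconds. A minimal usage pattern:
     cudaEvent_t t0, t1;
     cudaEventCreate(&t0); cudaEventCreate(&t1);
     cudaEventRecord(t0, 0);
     // ... memcpys and kernel launch ...
     cudaEventRecord(t1, 0);
     cudaEventSynchronize(t1);
     float ms = 0.0f;
     cudaEventElapsedTime(&ms, t0, t1);
     cudaEventDestroy(t0); cudaEventDestroy(t1);
*/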
void GoldenBrick(float *pA, float *pB, float *pResult, int count)
{
for (int i=0; i < count; i++)
{
//pResult[i] = pA[i] * pB[i];
//pResult[i] = pA[i] * pB[i] / 12.34567;
//pResult[i] = sqrt(pA[i] * pB[i] / 12.34567);
pResult[i] = sqrt(pA[i] * pB[i] / 12.34567) * sin(pA[i]);
}
}
__global__ void multiplyNumbersGPU(float *pDataA, float *pDataB, float *pResult)
{
// Because of the simplicity of this tutorial, we are going to assume that
// every block has 256 threads. Each thread simply multiplies two numbers,
// and then stores the result.
// The grid of blocks is 128 blocks long.
int tid = (blockIdx.y * 128 * 256) + blockIdx.x * 256 + threadIdx.x; // This gives every thread a unique ID.
// By no coincidence, we'll be using this thread ID to determine which data elements to multiply.
//pResult[tid] = pDataA[tid] * pDataB[tid]; // Each thread only multiplies one data element.
//pResult[tid] = pDataA[tid] * pDataB[tid] / 12.34567;
//pResult[tid] = sqrt(pDataA[tid] * pDataB[tid] / 12.34567);
pResult[tid] = sqrt(pDataA[tid] * pDataB[tid] / 12.34567) * sin(pDataA[tid]);
}
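// Error-checking sketch (an addition; the tutorial code checks no return
// values). Wrapping runtime calls such as cudaMalloc and cudaMemcpy in a
// macro like this surfaces failures immediately:
#define CUDA_CHECK(call)                                          \
	do {                                                          \
		cudaError_t err = (call);                                 \
		if (err != cudaSuccess) {                                 \
			fprintf(stderr, "CUDA error %s at %s:%d\n",           \
				cudaGetErrorString(err), __FILE__, __LINE__);     \
			exit(1);                                              \
		}                                                         \
	} while (0)
// Example: CUDA_CHECK(cudaMalloc((void **)&d_dataA, sizeof(float) * MAX_DATA_SIZE));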
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv){
float *h_dataA, *h_dataB, *h_resultC;
float *d_dataA, *d_dataB, *d_resultC;
double gpuTime;
int i;
timeval start, end;
printf("Initializing data...\n");
h_dataA = (float *)malloc(sizeof(float) * MAX_DATA_SIZE);
h_dataB = (float *)malloc(sizeof(float) * MAX_DATA_SIZE);
h_resultC = (float *)malloc(sizeof(float) * MAX_DATA_SIZE);
cudaMalloc((void **)&d_dataA, sizeof(float) * MAX_DATA_SIZE);
cudaMalloc((void **)&d_dataB, sizeof(float) * MAX_DATA_SIZE);
cudaMalloc((void **)&d_resultC, sizeof(float) * MAX_DATA_SIZE);
srand(123);
for(i = 0; i < MAX_DATA_SIZE; i++)
{
h_dataA[i] = (float)rand() / (float)RAND_MAX;
h_dataB[i] = (float)rand() / (float)RAND_MAX;
}
int firstRun = 1; // Indicates if it's the first execution of the for loop
const int useGPU = 1; // When 0, only the CPU is used. When 1, only the GPU is used
for (int dataAmount = MAX_DATA_SIZE; dataAmount > 128*256; dataAmount /= 2)
{
int blockGridWidth = 128;
int blockGridHeight = (dataAmount / 256) / blockGridWidth;
dim3 blockGridRows(blockGridWidth, blockGridHeight);
dim3 threadBlockRows(256, 1);
// Start the timer.
// We want to measure copying data, running the kernel, and copying the results back to host
gettimeofday(&start, NULL);
if (useGPU == 1)
{
// Copy the data to the device
cudaMemcpy(d_dataA, h_dataA, sizeof(float) * dataAmount, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataB, h_dataB, sizeof(float) * dataAmount, cudaMemcpyHostToDevice);
// Do the multiplication on the GPU
multiplyNumbersGPU<<<blockGridRows, threadBlockRows>>>(d_dataA, d_dataB, d_resultC);
cudaThreadSynchronize();
// Copy the data back to the host
cudaMemcpy(h_resultC, d_resultC, sizeof(float) * dataAmount, cudaMemcpyDeviceToHost);
}
else
{
// We're using the CPU only
GoldenBrick(h_dataA, h_dataB, h_resultC, dataAmount);
}
// Stop the timer, print the total round trip execution time.
gettimeofday(&end, NULL);
gpuTime = myDiffTime(start, end);
if (!firstRun || !useGPU)
{
printf("Elements: %d - time: %f sec - %f multiplications/sec\n", dataAmount, gpuTime, dataAmount / gpuTime);
}
else
{
firstRun = 0;
// We discard the results of the first run because of the extra overhead incurred
// during the first time a kernel is ever executed.
dataAmount *= 2; // reset to first run value
}
}
printf("Cleaning up...\n");
cudaFree(d_resultC);
cudaFree(d_dataB);
cudaFree(d_dataA);
free(h_resultC);
free(h_dataB);
free(h_dataA);
}
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
/*
This file can be downloaded from supercomputingblog.com.
This is part of a series of tutorials that demonstrate how to use CUDA
The tutorials will also demonstrate the speed of using CUDA
*/
// IMPORTANT NOTE: for this data size, your graphics card should have at least 256 megabytes of memory.
// If your GPU has less memory, then you will need to decrease this data size.
#define MAX_DATA_SIZE (1024*1024*32) // about 32 million elements.
// The max data size must be an integer multiple of 128*256, because each block will have 256 threads,
// and the block grid width will be 128. These are arbitrary numbers I chose.
#define THREADS_PER_BLOCK 256
#define BLOCKS_PER_GRID_ROW 128
double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end - d_start);
}
__global__ void getStats(float *pArray, float *pMaxResults, float *pMinResults, float *pAvgResults)
{
// Declare arrays to be in shared memory.
// 256 elements * (4 bytes / element) * 3 = 3KB.
__shared__ float min[256];
__shared__ float max[256];
__shared__ float avg[256];
// Calculate which element this thread reads from memory
int arrayIndex = 128*256*blockIdx.y + 256*blockIdx.x + threadIdx.x;
min[threadIdx.x] = max[threadIdx.x] = avg[threadIdx.x] = pArray[arrayIndex];
__syncthreads();
int nTotalThreads = blockDim.x; // Total number of active threads
while(nTotalThreads > 1)
{
int halfPoint = (nTotalThreads >> 1); // divide by two
// only the first half of the threads will be active.
if (threadIdx.x < halfPoint)
{
// Get the shared value stored by another thread
float temp = min[threadIdx.x + halfPoint];
if (temp < min[threadIdx.x]) min[threadIdx.x] = temp;
temp = max[threadIdx.x + halfPoint];
if (temp > max[threadIdx.x]) max[threadIdx.x] = temp;
// when calculating the average, sum and divide
avg[threadIdx.x] += avg[threadIdx.x + halfPoint];
avg[threadIdx.x] /= 2;
}
__syncthreads();
nTotalThreads = (nTotalThreads >> 1); // divide by two.
}
// At this point in time, thread zero has the min, max, and average
// It's time for thread zero to write its final results.
// Note that the address structure of pResults is different, because
// there is only one value for every thread block.
if (threadIdx.x == 0)
{
pMaxResults[128*blockIdx.y + blockIdx.x] = max[0];
pMinResults[128*blockIdx.y + blockIdx.x] = min[0];
pAvgResults[128*blockIdx.y + blockIdx.x] = avg[0];
}
}
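/* Aside (an addition): averaging pairs and halving at every step only works
   because blockDim.x is a power of two. An equivalent variant keeps running
   sums and divides once at the end, which is cheaper and loses less
   precision:
     avg[threadIdx.x] += avg[threadIdx.x + halfPoint];   // inside the loop
     ...
     pAvgResults[...] = avg[0] / (float)blockDim.x;      // thread 0 only
*/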
void getStatsCPU(float *pArray, int nElems, float *pMin, float *pMax, float *pAvg)
{
// This function uses the CPU to find the min, max and average of an array
if (nElems <= 0) return;
float min, max, avg;
min = max = avg = pArray[0];
for (int i=1; i < nElems; i++)
{
float temp = pArray[i];
if (temp < min) min = temp;
if (temp > max) max = temp;
avg += temp; // we will divide once after for loop for speed.
}
avg /= (float)nElems;
*pMin = min;
*pMax = max;
*pAvg = avg;
}
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv){
float *h_data, *h_resultMax, *h_resultMin, *h_resultAvg;
float *d_data, *d_resultMax, *d_resultMin, *d_resultAvg;
double gpuTime;
int i;
timeval start, end;
printf("Initializing data...\n");
h_data = (float *)malloc(sizeof(float) * MAX_DATA_SIZE);
h_resultMax = (float *)malloc(sizeof(float) * MAX_DATA_SIZE / THREADS_PER_BLOCK);
h_resultMin = (float *)malloc(sizeof(float) * MAX_DATA_SIZE / THREADS_PER_BLOCK);
h_resultAvg = (float *)malloc(sizeof(float) * MAX_DATA_SIZE / THREADS_PER_BLOCK);
cudaMalloc( (void **)&d_data, sizeof(float) * MAX_DATA_SIZE);
cudaMalloc( (void **)&d_resultMax, sizeof(float) * MAX_DATA_SIZE / THREADS_PER_BLOCK);
cudaMalloc( (void **)&d_resultMin, sizeof(float) * MAX_DATA_SIZE / THREADS_PER_BLOCK);
cudaMalloc( (void **)&d_resultAvg, sizeof(float) * MAX_DATA_SIZE / THREADS_PER_BLOCK);
srand(123);
for(i = 0; i < MAX_DATA_SIZE; i++)
{
h_data[i] = (float)rand() / (float)RAND_MAX;
}
int firstRun = 1; // Indicates if it's the first execution of the for loop
const int useGPU = 1; // When 0, only the CPU is used. When 1, only the GPU is used
for (int dataAmount = MAX_DATA_SIZE; dataAmount > BLOCKS_PER_GRID_ROW*THREADS_PER_BLOCK; dataAmount /= 2)
{
float tempMin,tempMax,tempAvg;
int blockGridWidth = BLOCKS_PER_GRID_ROW;
int blockGridHeight = (dataAmount / THREADS_PER_BLOCK) / blockGridWidth;
dim3 blockGridRows(blockGridWidth, blockGridHeight);
dim3 threadBlockRows(THREADS_PER_BLOCK, 1);
// Start the timer.
// We want to measure copying data, running the kernel, and copying the results back to host
gettimeofday(&start, NULL);
if (useGPU == 1)
{
// Copy the data to the device
cudaMemcpy(d_data, h_data, sizeof(float) * dataAmount, cudaMemcpyHostToDevice);
// Compute the min, max, and average on the GPU
getStats<<<blockGridRows, threadBlockRows>>>(d_data, d_resultMax, d_resultMin, d_resultAvg);
cudaThreadSynchronize();
// Copy the data back to the host
cudaMemcpy(h_resultMin, d_resultMin, sizeof(float) * dataAmount / THREADS_PER_BLOCK, cudaMemcpyDeviceToHost);
cudaMemcpy(h_resultMax, d_resultMax, sizeof(float) * dataAmount / THREADS_PER_BLOCK, cudaMemcpyDeviceToHost);
cudaMemcpy(h_resultAvg, d_resultAvg, sizeof(float) * dataAmount / THREADS_PER_BLOCK, cudaMemcpyDeviceToHost);
// Each block returned one result, so let's finish this off with the CPU.
// By using CUDA, we reduced the CPU's share of the work by a factor of about 256.
tempMin = h_resultMin[0];
tempMax = h_resultMax[0];
tempAvg = h_resultAvg[0];
for (int i=1 ; i < dataAmount / THREADS_PER_BLOCK; i++)
{
if (h_resultMin[i] < tempMin) tempMin = h_resultMin[i];
if (h_resultMax[i] > tempMax) tempMax = h_resultMax[i];
tempAvg += h_resultAvg[i];
}
tempAvg /= (dataAmount / THREADS_PER_BLOCK);
}
else
{
// We're using the CPU only
getStatsCPU(h_data, dataAmount, &tempMin, &tempMax, &tempAvg);
}
printf("Min: %f Max %f Avg %f\n", tempMin, tempMax, tempAvg);
// Stop the timer, print the total round trip execution time.
gettimeofday(&end, NULL);
gpuTime = myDiffTime(start, end);
if (!firstRun || !useGPU)
{
printf("Elements: %d - time: %f sec - %f elements/sec\n", dataAmount, gpuTime, dataAmount / gpuTime);
}
else
{
firstRun = 0;
// We discard the results of the first run because of the extra overhead incurred
// during the first time a kernel is ever executed.
dataAmount *= 2; // reset to first run value
}
}
printf("Cleaning up...\n");
cudaFree(d_resultMin);
cudaFree(d_resultMax);
cudaFree(d_resultAvg);
cudaFree(d_data);
free(h_resultMin);
free(h_resultMax);
free(h_resultAvg);
free(h_data);
}
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#define N 512
#define BLOCK_DIM 16 // 16x16 = 256 threads per block; a 512x512 block would far exceed the 1024-thread hardware limit
double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end - d_start);
}
__global__ void matrixAdd (int *a, int *b, int *c);
void matrixAddCPU(int *a, int *b, int *c);
int main()
{
int a[N*N], b[N*N], c[N*N];
int *dev_a, *dev_b, *dev_c;
timeval start, end;
int size = N * N * sizeof(int);
// initialize a and b with real values (NOT SHOWN)
cudaMalloc((void**)&dev_a, size);
cudaMalloc((void**)&dev_b, size);
cudaMalloc((void**)&dev_c, size);
//gettimeofday(&start, NULL);
cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
dim3 dimGrid((int)ceil((float)N/dimBlock.x), (int)ceil((float)N/dimBlock.y));
gettimeofday(&start, NULL);
matrixAdd<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);
//gettimeofday(&end, NULL);
printf("GPU Time for %i x %i matrix addition: %f sec\n", N, N, myDiffTime(start, end));
gettimeofday(&start, NULL);
matrixAddCPU(a, b, c);
gettimeofday(&end, NULL);
printf("CPU Time for %i x %i matrix addition: %f sec\n", N, N, myDiffTime(start, end));
cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
}
__global__ void matrixAdd (int *a, int *b, int *c)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = col + row * N;
if (col < N && row < N)
{
c[index] = a[index] + b[index];
}
}
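/* Launch-geometry check (an addition): with N = 512 and BLOCK_DIM = 16, the
   grid is 32 x 32 blocks of 16 x 16 threads = 262,144 threads, exactly one
   per matrix element; the col/row guard above absorbs any rounding slack. */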
void matrixAddCPU(int *a, int *b, int *c)
{
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
c[i*N + j] = a[i*N + j] + b[i*N + j];
}
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#define N 32 // matrix dimension; the single N x N thread block below must not exceed 1024 threads
__global__ void matrixMult (int *a, int *b, int *c, int width);
void matrixMultCPU (int a[N][N], int b[N][N], int c[N][N], int width);
double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end - d_start);
}
int main()
{
int a[N][N], b[N][N], c[N][N], g[N][N];
int *dev_a, *dev_b, *dev_c;
timeval start, end;
// initialize matrices a and b with appropriate values
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
a[i][j] = i*N + j;
b[i][j] = i + j;
}
}
int size = N * N * sizeof(int);
cudaMalloc((void **) &dev_a, size);
cudaMalloc((void **) &dev_b, size);
cudaMalloc((void **) &dev_c, size);
gettimeofday(&start, NULL);
cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
dim3 dimGrid(1, 1);
dim3 dimBlock(N, N);
matrixMult<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c, N);
cudaDeviceSynchronize();
cudaMemcpy(g, dev_c, size, cudaMemcpyDeviceToHost);
gettimeofday(&end, NULL);
printf("GPU Time for %i x %i matrix multiplication: %f sec\n", N, N, myDiffTime(start, end));
gettimeofday(&start, NULL);
matrixMultCPU(a, b, c, N);
gettimeofday(&end, NULL);
printf("CPU Time for %i x %i matrix multiplication: %f sec\n", N, N, myDiffTime(start, end));
cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
// print verification
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
if (c[i][j] != g[i][j])
{
printf("Results do not match! %i, %i, c=%i, g=%i\n", i, j, c[i][j], g[i][j]);
exit(1);
}
}
}
}
__global__ void matrixMult (int *a, int *b, int *c, int width)
{
int k, sum = 0;
int col = threadIdx.x + blockDim.x * blockIdx.x;
int row = threadIdx.y + blockDim.y * blockIdx.y;
if(col < width && row < width)
{
for (k = 0; k < width; k++)
sum += a[row * width + k] * b[k * width + col];
c[row * width + col] = sum;
}
}
void matrixMultCPU (int a[N][N], int b[N][N], int c[N][N], int width)
{
for (int i = 0; i < width; i++)
{
for (int j = 0; j < width; j++)
{
int sum = 0;
for (int k = 0; k < width; k++)
{
int m = a[i][k];
int n = b[k][j];
sum += m * n;
}
c[i][j] = sum;
}
}
}
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#define N 512
#define TILE_WIDTH 16
__global__ void matrixMult (int *a, int *b, int *c, int width);
void matrixMultCPU (int a[N][N], int b[N][N], int c[N][N], int width);
double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end - d_start);
}
int main()
{
int a[N][N], b[N][N], c[N][N], g[N][N];
timeval start, end;
int *dev_a, *dev_b, *dev_c;
int size = N * N * sizeof(int);
// initialize matrices a and b with appropriate values
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
a[i][j] = i*N + j;
b[i][j] = i + j;
}
}
// initialize a and b matrices here
cudaMalloc((void **) &dev_a, size);
cudaMalloc((void **) &dev_b, size);
cudaMalloc((void **) &dev_c, size);
gettimeofday(&start, NULL);
cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
dim3 dimGrid((int)ceil((float)N/dimBlock.x), (int)ceil((float)N/dimBlock.y));
matrixMult<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c, N);
cudaDeviceSynchronize();
cudaMemcpy(g, dev_c, size, cudaMemcpyDeviceToHost);
gettimeofday(&end, NULL);
printf("GPU Time for %i x %i matrix multiplication: %f sec\n", N, N, myDiffTime(start, end));
gettimeofday(&start, NULL);
matrixMultCPU(a, b, c, N);
gettimeofday(&end, NULL);
printf("CPU Time for %i x %i matrix multiplication: %f sec\n", N, N, myDiffTime(start, end));
cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
// print verification
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
if (c[i][j] != g[i][j])
{
printf("Results do not match! %i, %i, c=%i, g=%i\n", i, j, c[i][j], g[i][j]);
exit(1);
}
}
}
}
__global__ void matrixMult(int* A, int* B, int* C, int width)
{
int k, sum = 0;
int col = blockIdx.x*TILE_WIDTH + threadIdx.x;
int row = blockIdx.y*TILE_WIDTH + threadIdx.y;
if(col < width && row < width)
{
for (k = 0; k < width; k++)
sum += A[row * width + k] * B[k * width + col];
C[row * width + col] = sum;
}
}
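/* The kernel above uses TILE_WIDTH only to shape the thread blocks; it still
   reads every operand from global memory. A genuinely tiled version stages
   TILE_WIDTH x TILE_WIDTH sub-blocks in shared memory so each element is
   loaded from global memory once per tile. A sketch (an addition, assuming
   width is a multiple of TILE_WIDTH, as it is here with N = 512): */
__global__ void matrixMultShared(int* A, int* B, int* C, int width)
{
	__shared__ int sA[TILE_WIDTH][TILE_WIDTH]; // tile of A for this block
	__shared__ int sB[TILE_WIDTH][TILE_WIDTH]; // tile of B for this block
	int col = blockIdx.x*TILE_WIDTH + threadIdx.x;
	int row = blockIdx.y*TILE_WIDTH + threadIdx.y;
	int sum = 0;
	for (int t = 0; t < width/TILE_WIDTH; t++)
	{
		// each thread stages one element of each tile
		sA[threadIdx.y][threadIdx.x] = A[row*width + t*TILE_WIDTH + threadIdx.x];
		sB[threadIdx.y][threadIdx.x] = B[(t*TILE_WIDTH + threadIdx.y)*width + col];
		__syncthreads(); // tile fully loaded before anyone reads it
		for (int k = 0; k < TILE_WIDTH; k++)
			sum += sA[threadIdx.y][k] * sB[k][threadIdx.x];
		__syncthreads(); // done with this tile before it is overwritten
	}
	C[row*width + col] = sum;
}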
void matrixMultCPU (int a[N][N], int b[N][N], int c[N][N], int width)
{
for (int i = 0; i < width; i++)
{
for (int j = 0; j < width; j++)
{
int sum = 0;
for (int k = 0; k < width; k++)
{
int m = a[i][k];
int n = b[k][j];
sum += m * n;
}
c[i][j] = sum;
}
}
}
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#define N 65535
#define T 1024 // max threads per block
double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end - d_start);
}
__global__ void vecAdd (int *a, int *b, int *c);
void vecAddCPU(int *a, int *b, int *c);
int main()
{
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;
timeval start, end;
// initialize a and b with real values
for (int i = 0; i < N; i++)
{
a[i] = i;
b[i] = N-i;
c[i] = 0;
}
int size = N * sizeof(int);
cudaMalloc((void**)&dev_a, size);
cudaMalloc((void**)&dev_b, size);
cudaMalloc((void**)&dev_c, size);
gettimeofday(&start, NULL);
cudaMemcpy(dev_a, a, size,cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size,cudaMemcpyHostToDevice);
//gettimeofday(&start, NULL);
vecAdd<<<(int)ceil((float)N/T), T>>>(dev_a, dev_b, dev_c); // float division so the block count rounds up and covers all N elements
//gettimeofday(&end, NULL);
cudaMemcpy(c, dev_c, size,cudaMemcpyDeviceToHost);
gettimeofday(&end, NULL);
printf("GPU Time for %i additions: %f sec\n", N, myDiffTime(start, end));
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
gettimeofday(&start, NULL);
vecAddCPU(a, b, c);
gettimeofday(&end, NULL);
printf("CPU Time for %i additions: %f sec\n", N, myDiffTime(start, end));
exit (0);
}
__global__ void vecAdd (int *a, int *b, int *c)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N)
{
c[i] = a[i] + b[i];
}
}
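/* Grid-size note (an addition): (int)ceil((float)N/T) rounds the block count
   up; with N = 65535 and T = 1024 that gives 64 blocks = 65,536 threads, and
   the i < N guard above idles the single surplus thread. The equivalent
   all-integer idiom is (N + T - 1) / T. */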
void vecAddCPU(int *a, int *b, int *c)
{
for (int i = 0; i < N; i++)
c[i] = a[i] + b[i];
}
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#define N 1024
#define NUM_BANKS 16
#define LOG_NUM_BANKS 4
#ifdef ZERO_BANK_CONFLICTS
#define CONFLICT_FREE_OFFSET(n) (((n) >> LOG_NUM_BANKS) + ((n) >> (2 * LOG_NUM_BANKS)))
#else
#define CONFLICT_FREE_OFFSET(n) ((n) >> LOG_NUM_BANKS)
#endif
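/* Worked example (an illustration): with 16 banks and LOG_NUM_BANKS = 4,
   shared-memory index n is padded by n >> 4, so consecutive 16-element
   groups land at shifted positions:
     CONFLICT_FREE_OFFSET(15) == 0  ->  temp[15]
     CONFLICT_FREE_OFFSET(16) == 1  ->  temp[17]
     CONFLICT_FREE_OFFSET(32) == 2  ->  temp[34]
   Threads that would otherwise hit the same bank at stride 16 are spread
   across different banks. */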
__global__ void scan(float *g_odata, float *g_idata, int n);
__global__ void prescan(float *g_odata, float *g_idata, int n);
void scanCPU(float *f_out, float *f_in, int i_n);
double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end - d_start);
}
int main()
{
float a[N], c[N], g[N];
timeval start, end;
float *dev_a, *dev_g;
int size = N * sizeof(float);
double d_gpuTime, d_cpuTime;
// initialize matrices a
for (int i = 0; i < N; i++)
{
a[i] = (float)(rand() % 1000000) / 1000.0;
//printf("a[%i] = %f\n", i, a[i]);
}
// initialize a and b matrices here
cudaMalloc((void **) &dev_a, size);
cudaMalloc((void **) &dev_g, size);
gettimeofday(&start, NULL);
cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
prescan<<<1, N/2, 2*N*sizeof(float)>>>(dev_g, dev_a, N); // N/2 threads: each thread loads and writes two elements
cudaDeviceSynchronize();
cudaMemcpy(g, dev_g, size, cudaMemcpyDeviceToHost);
gettimeofday(&end, NULL);
d_gpuTime = myDiffTime(start, end);
gettimeofday(&start, NULL);
scanCPU(c, a, N);
gettimeofday(&end, NULL);
d_cpuTime = myDiffTime(start, end);
cudaFree(dev_a); cudaFree(dev_g);
for (int i = 0; i < N; i++)
{
printf("c[%i] = %0.3f, g[%i] = %0.3f\n", i, c[i], i, g[i]);
//if (c[i] != g[i])
//{
// printf("Results do not match! c[%i]=%f, g[%i]=%f\n", i, c[i], i, g[i]);
// break;
//}
}
printf("GPU Time for scan size %i: %f\n", N, d_gpuTime);
printf("CPU Time for scan size %i: %f\n", N, d_cpuTime);
}
__global__ void prescan(float *g_odata, float *g_idata, int n)
{
extern __shared__ float temp[];
// allocated on invocation
int thid = threadIdx.x;
int offset = 1;
//A
int ai = thid;
int bi = thid + (n/2);
int bankOffsetA = CONFLICT_FREE_OFFSET(ai);
int bankOffsetB = CONFLICT_FREE_OFFSET(bi);
temp[ai + bankOffsetA] = g_idata[ai];
temp[bi + bankOffsetB] = g_idata[bi];
for (int d = n>>1; d > 0; d >>= 1)
// build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
//B
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
ai += CONFLICT_FREE_OFFSET(ai);
bi += CONFLICT_FREE_OFFSET(bi);
temp[bi] += temp[ai];
}
offset *= 2;
}
//C
if (thid==0)
{
int z = CONFLICT_FREE_OFFSET(n - 1);
temp[n - 1 + z] = 0;
}
// clear the last element
for (int d = 1; d < n; d *= 2)
// traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
//D
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
ai += CONFLICT_FREE_OFFSET(ai);
bi += CONFLICT_FREE_OFFSET(bi);
float t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
//E
g_odata[ai] = temp[ai + bankOffsetA];
g_odata[bi] = temp[bi + bankOffsetB];
}
void scanCPU(float *f_out, float *f_in, int i_n)
{
f_out[0] = 0;
for (int i = 1; i < i_n; i++)
f_out[i] = f_out[i-1] + f_in[i-1];
}
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#define N 1024
__global__ void scan(float *g_odata, float *g_idata, int n);
__global__ void prescan(float *g_odata, float *g_idata, int n);
void scanCPU(float *f_out, float *f_in, int i_n);
double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end - d_start);
}
int main()
{
float a[N], c[N], g[N];
timeval start, end;
float *dev_a, *dev_g;
int size = N * sizeof(float);
double d_gpuTime, d_cpuTime;
// initialize matrices a
for (int i = 0; i < N; i++)
{
a[i] = (float)(rand() % 1000000) / 1000.0;
//printf("a[%i] = %f\n", i, a[i]);
}
// initialize a and b matrices here
cudaMalloc((void **) &dev_a, size);
cudaMalloc((void **) &dev_g, size);
gettimeofday(&start, NULL);
cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
//scan<<<1,N,2*N*sizeof(float)>>>(dev_g, dev_a, N);
prescan<<<1, N/2, 2*N*sizeof(float)>>>(dev_g, dev_a, N); // N/2 threads: prescan handles two elements per thread
cudaDeviceSynchronize();
cudaMemcpy(g, dev_g, size, cudaMemcpyDeviceToHost);
gettimeofday(&end, NULL);
d_gpuTime = myDiffTime(start, end);
gettimeofday(&start, NULL);
scanCPU(c, a, N);
gettimeofday(&end, NULL);
d_cpuTime = myDiffTime(start, end);
cudaFree(dev_a); cudaFree(dev_g);
for (int i = 0; i < N; i++)
{
printf("c[%i] = %0.3f, g[%i] = %0.3f\n", i, c[i], i, g[i]);
//if (c[i] != g[i])
//{
// printf("Results do not match! c[%i]=%f, g[%i]=%f\n", i, c[i], i, g[i]);
// break;
//}
}
printf("GPU Time for scan size %i: %f\n", N, d_gpuTime);
printf("CPU Time for scan size %i: %f\n", N, d_cpuTime);
}
__global__ void scan(float *g_odata, float *g_idata, int n)
{
extern __shared__ float temp[]; // allocated on invocation
int thid = threadIdx.x;
int pout = 0, pin = 1;
// Load input into shared memory.
// This is exclusive scan, so shift right by one
// and set first element to 0
temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;
__syncthreads();
for (int offset = 1; offset < n; offset *= 2)
{
pout = 1 - pout; // swap double buffer indices
pin = 1 - pout;
if (thid >= offset)
temp[pout*n+thid] += temp[pin*n+thid - offset];
else
temp[pout*n+thid] = temp[pin*n+thid];
__syncthreads();
}
g_odata[thid] = temp[pout*n+thid]; // write output
}
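/* Worked example (an illustration) of the exclusive scan this kernel
   computes: input [3, 1, 7, 0] -> output [0, 3, 4, 11]; each output element
   is the sum of all inputs strictly before it. This double-buffered version
   performs O(n log n) additions, versus O(n) for the work-efficient prescan
   below. */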
__global__ void prescan(float *g_odata, float *g_idata, int n)
{
extern __shared__ float temp[];
// allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid];
// load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1)
// build sum in place up the tree
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
if (thid == 0)
{
temp[n - 1] = 0;
}
// clear the last element
for (int d = 1; d < n; d *= 2)
// traverse down tree & build scan
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
float t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
g_odata[2*thid] = temp[2*thid];
// write results to device memory
g_odata[2*thid+1] = temp[2*thid+1];
}
void scanCPU(float *f_out, float *f_in, int i_n)
{
f_out[0] = 0;
for (int i = 1; i < i_n; i++)
f_out[i] = f_out[i-1] + f_in[i-1];
}
commands.txt
cuda_tutorial11.cu
CUDA_Tutorial_3.cu
m_add.cu
m_mult.cu
m_multTile.cu
v_add.cu
v_bankScan.cu
v_naiveScan.cu