CS计算机代考程序代写 cuda AI GPU b’cuda_demo.tar.gz’

b’cuda_demo.tar.gz’

nvcc cuda_tutorial11.cu -o cuda_tutorial11.out

#include
#include

/*
This file can be downloaded from supercomputingblog.com.
This is part of a series of tutorials that demonstrate how to use CUDA
The tutorials will also demonstrate the speed of using CUDA
*/

// IMPORTANT NOTE: for this data size, your graphics card should have at least 512 megabytes of memory.
// If your GPU has less memory, then you will need to decrease this data size.

#define MAX_DATA_SIZE 1024*1024*32 // about 32 million elements.
// The max data size must be an integer multiple of 128*256, because each block will have 256 threads,
// and the block grid width will be 128. These are arbitrary numbers I choose.

void get_walltime(double* wcTime) {

struct timeval tp;

gettimeofday(&tp, NULL);

*wcTime = (double)(tp.tv_sec + tp.tv_usec/1000000.0);

}

double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end – d_start);
}

void GoldenBrick(float *pA, float *pB, float *pResult, int count)
{
for (int i=0; i < count; i++) { //pResult[count] = pA[count] * pB[count]; //pResult[count] = pA[count] * pB[count] / 12.34567; //pResult[count] = sqrt(pA[count] * pB[count] / 12.34567); pResult[count] = sqrt(pA[count] * pB[count] / 12.34567) * sin(pA[count]); } } __global__ void multiplyNumbersGPU(float *pDataA, float *pDataB, float *pResult) { // Because of the simplicity of this tutorial, we are going to assume that // every block has 256 threads. Each thread simply multiplies two numbers, // and then stores the result. // The grid of blocks is 128 blocks long. int tid = (blockIdx.y * 128 * 256) + blockIdx.x * 256 + threadIdx.x; // This gives every thread a unique ID. // By no coincidence, we'll be using this thread ID to determine which data elements to multiply. //pResult[tid] = pDataA[tid] * pDataB[tid]; // Each thread only multiplies one data element. //pResult[tid] = pDataA[tid] * pDataB[tid] / 12.34567; //pResult[tid] = sqrt(pDataA[tid] * pDataB[tid] / 12.34567); pResult[tid] = sqrt(pDataA[tid] * pDataB[tid] / 12.34567) * sin(pDataA[tid]); } //////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv){ float *h_dataA, *h_dataB, *h_resultC; float *d_dataA, *d_dataB, *d_resultC; double gpuTime; int i; timeval start, end; printf("Initializing data...\n"); h_dataA = (float *)malloc(sizeof(float) * MAX_DATA_SIZE); h_dataB = (float *)malloc(sizeof(float) * MAX_DATA_SIZE); h_resultC = (float *)malloc(sizeof(float) * MAX_DATA_SIZE); cudaMalloc( (void **)&d_dataA, sizeof(float) * MAX_DATA_SIZE) ; cudaMalloc( (void **)&d_dataB, sizeof(float) * MAX_DATA_SIZE) ; cudaMalloc( (void **)&d_resultC , sizeof(float) * MAX_DATA_SIZE) ; srand(123); for(i = 0; i < MAX_DATA_SIZE; i++) { h_dataA[i] = (float)rand() / (float)RAND_MAX; h_dataB[i] = (float)rand() / (float)RAND_MAX; } int firstRun = 1; // Indicates if it's the first execution of the for loop const int useGPU = 1; // When 0, only the CPU is used. When 1, only the GPU is used for (int dataAmount = MAX_DATA_SIZE; dataAmount > 128*256; dataAmount /= 2)
{
int blockGridWidth = 128;
int blockGridHeight = (dataAmount / 256) / blockGridWidth;

dim3 blockGridRows(blockGridWidth, blockGridHeight);
dim3 threadBlockRows(256, 1);

// Start the timer.
// We want to measure copying data, running the kernel, and copying the results back to host
gettimeofday(&start, NULL);

if (useGPU == 0)
{

// Copy the data to the device
cudaMemcpy(d_dataA, h_dataA, sizeof(float) * dataAmount, cudaMemcpyHostToDevice) ;
cudaMemcpy(d_dataB, h_dataB, sizeof(float) * dataAmount, cudaMemcpyHostToDevice) ;

// Do the multiplication on the GPU
multiplyNumbersGPU<<>>(d_dataA, d_dataB, d_resultC);
cudaThreadSynchronize() ;

// Copy the data back to the host
cudaMemcpy(h_resultC, d_resultC, sizeof(float) * dataAmount, cudaMemcpyDeviceToHost) ;
}
else
{
// We’re using the CPU only
GoldenBrick(h_dataA, h_dataB, h_resultC, dataAmount);
}

// Stop the timer, print the total round trip execution time.
gettimeofday(&end, NULL);
gpuTime = myDiffTime(start, end);
if (!firstRun || !useGPU)
{
printf(“Elements: %d – convolution time : %f msec – %f Multiplications/sec\n”, dataAmount, gpuTime, blockGridHeight * 128 * 256 / (gpuTime * 0.001));
}
else
{
firstRun = 0;
// We discard the results of the first run because of the extra overhead incurred
// during the first time a kernel is ever executed.
dataAmount *= 2; // reset to first run value
}
}

printf(“Cleaning up…\n”);
cudaFree(d_resultC ) ;
cudaFree(d_dataB) ;
cudaFree(d_dataA) ;
free(h_resultC);
free(h_dataB);
free(h_dataA);

}

#include
#include
#include

/*
This file can be downloaded from supercomputingblog.com.
This is part of a series of tutorials that demonstrate how to use CUDA
The tutorials will also demonstrate the speed of using CUDA
*/

// IMPORTANT NOTE: for this data size, your graphics card should have at least 256 megabytes of memory.
// If your GPU has less memory, then you will need to decrease this data size.

#define MAX_DATA_SIZE 1024*1024*32 // about 32 million elements.
// The max data size must be an integer multiple of 128*256, because each block will have 256 threads,
// and the block grid width will be 128. These are arbitrary numbers I choose.
#define THREADS_PER_BLOCK 256
#define BLOCKS_PER_GRID_ROW 128

double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end – d_start);
}

__global__ void getStats(float *pArray, float *pMaxResults, float *pMinResults, float *pAvgResults)
{
// Declare arrays to be in shared memory.
// 256 elements * (4 bytes / element) * 3 = 3KB.
__shared__ float min[256];
__shared__ float max[256];
__shared__ float avg[256];

// Calculate which element this thread reads from memory
int arrayIndex = 128*256*blockIdx.y + 256*blockIdx.x + threadIdx.x;
min[threadIdx.x] = max[threadIdx.x] = avg[threadIdx.x] = pArray[arrayIndex];
__syncthreads();
int nTotalThreads = blockDim.x; // Total number of active threads

while(nTotalThreads > 1)
{
int halfPoint = (nTotalThreads >> 1); // divide by two
// only the first half of the threads will be active.
if (threadIdx.x < halfPoint) { // Get the shared value stored by another thread float temp = min[threadIdx.x + halfPoint]; if (temp < min[threadIdx.x]) min[threadIdx.x] = temp; temp = max[threadIdx.x + halfPoint]; if (temp > max[threadIdx.x]) max[threadIdx.x] = temp;

// when calculating the average, sum and divide
avg[threadIdx.x] += avg[threadIdx.x + halfPoint];
avg[threadIdx.x] /= 2;
}
__syncthreads();

nTotalThreads = (nTotalThreads >> 1); // divide by two.
}

// At this point in time, thread zero has the min, max, and average
// It’s time for thread zero to write it’s final results.
// Note that the address structure of pResults is different, because
// there is only one value for every thread block.

if (threadIdx.x == 0)
{
pMaxResults[128*blockIdx.y + blockIdx.x] = max[0];
pMinResults[128*blockIdx.y + blockIdx.x] = min[0];
pAvgResults[128*blockIdx.y + blockIdx.x] = avg[0];
}
}

void getStatsCPU(float *pArray, int nElems, float *pMin, float *pMax, float *pAvg)
{
// This function uses the CPU to find the min, max and average of an array

if (nElems <= 0) return; float min, max, avg; min = max = avg = pArray[0]; for (int i=1; i < nElems; i++) { float temp = pArray[i]; if (temp < min) min = temp; if (temp > max) max = temp;
avg += temp; // we will divide once after for loop for speed.
}
avg /= (float)nElems;
*pMin = min;
*pMax = max;
*pAvg = avg;
}

////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv){
float *h_data, *h_resultMax, *h_resultMin, *h_resultAvg;
float *d_data, *d_resultMax, *d_resultMin, *d_resultAvg;
double gpuTime;
int i;

timeval start, end;

printf(“Initializing data…\n”);
h_data = (float *)malloc(sizeof(float) * MAX_DATA_SIZE);
h_resultMax = (float *)malloc(sizeof(float) * MAX_DATA_SIZE / THREADS_PER_BLOCK);
h_resultMin = (float *)malloc(sizeof(float) * MAX_DATA_SIZE / THREADS_PER_BLOCK);
h_resultAvg = (float *)malloc(sizeof(float) * MAX_DATA_SIZE / THREADS_PER_BLOCK);

cudaMalloc( (void **)&d_data, sizeof(float) * MAX_DATA_SIZE);
cudaMalloc( (void **)&d_resultMax, sizeof(float) * MAX_DATA_SIZE / THREADS_PER_BLOCK);
cudaMalloc( (void **)&d_resultMin, sizeof(float) * MAX_DATA_SIZE / THREADS_PER_BLOCK);
cudaMalloc( (void **)&d_resultAvg, sizeof(float) * MAX_DATA_SIZE / THREADS_PER_BLOCK);

srand(123);
for(i = 0; i < MAX_DATA_SIZE; i++) { h_data[i] = (float)rand() / (float)RAND_MAX; } int firstRun = 1; // Indicates if it's the first execution of the for loop const int useGPU = 1; // When 0, only the CPU is used. When 1, only the GPU is used for (int dataAmount = MAX_DATA_SIZE; dataAmount > BLOCKS_PER_GRID_ROW*THREADS_PER_BLOCK; dataAmount /= 2)
{
float tempMin,tempMax,tempAvg;

int blockGridWidth = BLOCKS_PER_GRID_ROW;
int blockGridHeight = (dataAmount / THREADS_PER_BLOCK) / blockGridWidth;

dim3 blockGridRows(blockGridWidth, blockGridHeight);
dim3 threadBlockRows(THREADS_PER_BLOCK, 1);

// Start the timer.
// We want to measure copying data, running the kernel, and copying the results back to host
gettimeofday(&start, NULL);

if (useGPU == 1)
{
// Copy the data to the device
cudaMemcpy(d_data, h_data, sizeof(float) * dataAmount, cudaMemcpyHostToDevice);

// Do the multiplication on the GPU
getStats<<>>(d_data, d_resultMax, d_resultMin, d_resultAvg);

cudaThreadSynchronize();

// Copy the data back to the host
cudaMemcpy(h_resultMin, d_resultMin, sizeof(float) * dataAmount / THREADS_PER_BLOCK, cudaMemcpyDeviceToHost);
cudaMemcpy(h_resultMax, d_resultMax, sizeof(float) * dataAmount / THREADS_PER_BLOCK, cudaMemcpyDeviceToHost);
cudaMemcpy(h_resultAvg, d_resultAvg, sizeof(float) * dataAmount / THREADS_PER_BLOCK, cudaMemcpyDeviceToHost);

// Each block returned one result, so lets finish this off with the cpu.
// By using CUDA, we basically reduced how much the CPU would have to work by about 256 times.

tempMin = h_resultMin[0];
tempMax = h_resultMax[0];
tempAvg = h_resultAvg[0];
for (int i=1 ; i < dataAmount / THREADS_PER_BLOCK; i++) { if (h_resultMin[i] < tempMin) tempMin = h_resultMin[i]; if (h_resultMax[i] > tempMax) tempMax = h_resultMax[i];
tempAvg += h_resultAvg[i];
}
tempAvg /= (dataAmount / THREADS_PER_BLOCK);
}
else
{
// We’re using the CPU only
getStatsCPU(h_data, dataAmount, &tempMin, &tempMax, &tempAvg);
}
printf(“Min: %f Max %f Avg %f\n”, tempMin, tempMax, tempAvg);

// Stop the timer, print the total round trip execution time.
gettimeofday(&end, NULL);
gpuTime = myDiffTime(start, end);
if (!firstRun || !useGPU)
{
printf(“Elements: %d – convolution time : %f msec – %f Multiplications/sec\n”, dataAmount, gpuTime, blockGridHeight * 128 * 256 / (gpuTime * 0.001));
}
else
{
firstRun = 0;
// We discard the results of the first run because of the extra overhead incurred
// during the first time a kernel is ever executed.
dataAmount *= 2; // reset to first run value
}
}

printf(“Cleaning up…\n”);
cudaFree(d_resultMin );
cudaFree(d_resultMax );
cudaFree(d_resultAvg );
cudaFree(d_data);
free(h_resultMin);
free(h_resultMax);
free(h_resultAvg);
free(h_data);

}

#include
#include

#define N 512
#define BLOCK_DIM 512

double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end – d_start);
}

__global__ void matrixAdd (int *a, int *b, int *c);
void matrixAddCPU(int *a, int *b, int *c);

int main()
{
int a[N*N], b[N*N], c[N*N];
int *dev_a, *dev_b, *dev_c;
timeval start, end;

int size = N * N * sizeof(int);

// initialize a and b with real values (NOT SHOWN)
cudaMalloc((void**)&dev_a, size);
cudaMalloc((void**)&dev_b, size);
cudaMalloc((void**)&dev_c, size);

//gettimeofday(&start, NULL);

cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);

dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
dim3 dimGrid((int)ceil(N/dimBlock.x),(int)ceil(N/dimBlock.y));

gettimeofday(&start, NULL);
matrixAdd<<>>(dev_a,dev_b,dev_c);
cudaDeviceSynchronize();
gettimeofday(&end, NULL);

cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);

//gettimeofday(&end, NULL);
printf(“GPU Time for %i additions: %f\n”, N, myDiffTime(start, end));

gettimeofday(&start, NULL);
matrixAddCPU(a, b, c);
gettimeofday(&end, NULL);
printf(“CPU Time for %i additions: %f\n”, N, myDiffTime(start, end));

cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
}

__global__ void matrixAdd (int *a, int *b, int *c)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int index = col + row * N;
if (col < N && row < N) { c[index] = a[index] + b[index]; } } void matrixAddCPU(int *a, int *b, int *c) { for (int i = 0; i < N; i++) for (int j = 0; j < N; j++) c[i*N + j] = a[i*N + j] + b[i*N + j]; } #include
#include

#define N 64

__global__ void matrixMult (int *a, int *b, int *c, int width);
void matrixMultCPU (int a[N][N], int b[N][N], int c[N][N], int width);

double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end – d_start);
}

int main()
{
int a[N][N], b[N][N], c[N][N], g[N][N];
int *dev_a, *dev_b, *dev_c;
timeval start, end;

// initialize matrices a and b with appropriate values
for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { a[i][j] = i*N + j; b[i][j] = i + j; } } int size = N * N * sizeof(int); cudaMalloc((void **) &dev_a, size); cudaMalloc((void **) &dev_b, size); cudaMalloc((void **) &dev_c, size); gettimeofday(&start, NULL); cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice); cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice); dim3 dimGrid(1, 1); dim3 dimBlock(N, N); matrixMult<<>>(dev_a, dev_b, dev_c, N);
cudaDeviceSynchronize();
cudaMemcpy(g, dev_c, size, cudaMemcpyDeviceToHost);

gettimeofday(&end, NULL);
printf(“GPU Time for %i additions: %f\n”, N, myDiffTime(start, end));

gettimeofday(&start, NULL);
matrixMultCPU(a, b, c, N);

gettimeofday(&end, NULL);
printf(“CPU Time for %i additions: %f\n”, N, myDiffTime(start, end));

cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);

// print verification
for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { if (c[i][j] != g[i][j]) { printf("Results do not match! %i, %i, c=%i, g=%i\n", i, j, c[i][j], g[i][j]); exit(1); } } } } __global__ void matrixMult (int *a, int *b, int *c, int width) { int k, sum = 0; int col = threadIdx.x + blockDim.x * blockIdx.x; int row = threadIdx.y + blockDim.y * blockIdx.y; if(col < width && row < width) { for (k = 0; k < width; k++) sum += a[row * width + k] * b[k * width + col]; c[row * width + col] = sum; } } void matrixMultCPU (int a[N][N], int b[N][N], int c[N][N], int width) { for (int i = 0; i < width; i++) { for (int j = 0; j < width; j++) { int sum = 0; for (int k = 0; k < width; k++) { int m = a[i][k]; int n = b[k][j]; sum += m * n; } c[i][j] = sum; } } } #include
#include

#define N 512
#define TILE_WIDTH 16

__global__ void matrixMult (int *a, int *b, int *c, int width);
void matrixMultCPU (int a[N][N], int b[N][N], int c[N][N], int width);

double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end – d_start);
}

int main()
{
int a[N][N], b[N][N], c[N][N], g[N][N];
timeval start, end;

int *dev_a, *dev_b, *dev_c;
int size = N * N * sizeof(int);

// initialize matrices a and b with appropriate values
for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { a[i][j] = i*N + j; b[i][j] = i + j; } } // initialize a and b matrices here cudaMalloc((void **) &dev_a, size); cudaMalloc((void **) &dev_b, size); cudaMalloc((void **) &dev_c, size); gettimeofday(&start, NULL); cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice); cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice); dim3 dimBlock(TILE_WIDTH, TILE_WIDTH); dim3 dimGrid((int)ceil(N/dimBlock.x), (int)ceil(N/dimBlock.y)); matrixMult<<>>(dev_a, dev_b, dev_c, N);
cudaDeviceSynchronize();

cudaMemcpy(g, dev_c, size, cudaMemcpyDeviceToHost);

gettimeofday(&end, NULL);
printf(“GPU Time for %i additions: %f\n”, N, myDiffTime(start, end));

gettimeofday(&start, NULL);
matrixMultCPU(a, b, c, N);

gettimeofday(&end, NULL);
printf(“CPU Time for %i additions: %f\n”, N, myDiffTime(start, end));

cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);

// print verification
for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { if (c[i][j] != g[i][j]) { printf("Results do not match! %i, %i, c=%i, g=%i\n", i, j, c[i][j], g[i][j]); exit(1); } } } } __global__ void matrixMult(int* A, int* B, int* C, int width) { int k, sum = 0; int col = blockIdx.x*TILE_WIDTH + threadIdx.x; int row = blockIdx.y*TILE_WIDTH + threadIdx.y; if(col < width && row < width) { for (k = 0; k < width; k++) sum += A[row * width + k] * B[k * width + col]; C[row * width + col] = sum; } } void matrixMultCPU (int a[N][N], int b[N][N], int c[N][N], int width) { for (int i = 0; i < width; i++) { for (int j = 0; j < width; j++) { int sum = 0; for (int k = 0; k < width; k++) { int m = a[i][k]; int n = b[k][j]; sum += m * n; } c[i][j] = sum; } } } #include
#include

#define N 65535
#define T 1024 // max threads per block

double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end – d_start);
}

__global__ void vecAdd (int *a, int *b, int *c);
void vecAddCPU(int *a, int *b, int *c);

int main()
{
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;
timeval start, end;
// initialize a and b with real values
for (int i = 0; i < N; i++) { a[i] = i; b[i] = N-i; c[i] = 0; } int size = N * sizeof(int); cudaMalloc((void**)&dev_a, size); cudaMalloc((void**)&dev_b, size); cudaMalloc((void**)&dev_c, size); gettimeofday(&start, NULL); cudaMemcpy(dev_a, a, size,cudaMemcpyHostToDevice); cudaMemcpy(dev_b, b, size,cudaMemcpyHostToDevice); //gettimeofday(&start, NULL); vecAdd<<<(int)ceil(N/T),T>>>(dev_a,dev_b,dev_c);
//gettimeofday(&end, NULL);

cudaMemcpy(c, dev_c, size,cudaMemcpyDeviceToHost);
gettimeofday(&end, NULL);

printf(“GPU Time for %i additions: %f\n”, N, myDiffTime(start, end));

cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);

gettimeofday(&start, NULL);
vecAddCPU(a, b, c);
gettimeofday(&end, NULL);

printf(“CPU Time for %i additions: %f\n”, N, myDiffTime(start, end));

exit (0);
}

__global__ void vecAdd (int *a, int *b, int *c)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < N) { c[i] = a[i] + b[i]; } } void vecAddCPU(int *a, int *b, int *c) { for (int i = 0; i < N; i++) c[i] = a[i] + b[i]; } #include
#include
#include

#define N 1024

#define NUM_BANKS 16
#define LOG_NUM_BANKS 4
#ifdef ZERO_BANK_CONFLICTS
#define CONFLICT_FREE_OFFSET(n) ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS))
#else
#define CONFLICT_FREE_OFFSET(n) ((n) >> LOG_NUM_BANKS)
#endif

__global__ void scan(float *g_odata, float *g_idata, int n);
__global__ void prescan(float *g_odata, float *g_idata, int n);
void scanCPU(float *f_out, float *f_in, int i_n);

double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end – d_start);
}

int main()
{
float a[N], c[N], g[N];
timeval start, end;

float *dev_a, *dev_g;
int size = N * sizeof(float);

double d_gpuTime, d_cpuTime;

// initialize matrices a
for (int i = 0; i < N; i++) { a[i] = (float)(rand() % 1000000) / 1000.0; //printf("a[%i] = %f\n", i, a[i]); } // initialize a and b matrices here cudaMalloc((void **) &dev_a, size); cudaMalloc((void **) &dev_g, size); gettimeofday(&start, NULL); cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice); prescan<<<1,N,2*N*sizeof(float)>>>(dev_g, dev_a, N);
cudaDeviceSynchronize();

cudaMemcpy(g, dev_g, size, cudaMemcpyDeviceToHost);

gettimeofday(&end, NULL);
d_gpuTime = myDiffTime(start, end);

gettimeofday(&start, NULL);
scanCPU(c, a, N);

gettimeofday(&end, NULL);
d_cpuTime = myDiffTime(start, end);

cudaFree(dev_a); cudaFree(dev_g);

for (int i = 0; i < N; i++) { printf("c[%i] = %0.3f, g[%i] = %0.3f\n", i, c[i], i, g[i]); //if (c[i] != g[i]) //{ // printf("Results do not match! c[%i]=%f, g[%i]=%f\n", i, c[i], i, g[i]); // break; //} } printf("GPU Time for scan size %i: %f\n", N, d_gpuTime); printf("CPU Time for scan size %i: %f\n", N, d_cpuTime); } __global__ void prescan(float *g_odata, float *g_idata, int n) { extern __shared__ float temp[]; // allocated on invocation int thid = threadIdx.x; int offset = 1; //A int ai = thid; int bi = thid + (n/2); int bankOffsetA = CONFLICT_FREE_OFFSET(ai); int bankOffsetB = CONFLICT_FREE_OFFSET(ai); temp[ai + bankOffsetA] = g_idata[ai]; temp[bi + bankOffsetB] = g_idata[bi]; for (int d = n>>1; d > 0; d >>= 1)
// build sum in place up the tree
{
__syncthreads();
if (thid < d) { //B int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1; ai += CONFLICT_FREE_OFFSET(ai); bi += CONFLICT_FREE_OFFSET(bi); temp[bi] += temp[ai]; } offset *= 2; } //C if (thid==0) { int z = CONFLICT_FREE_OFFSET(n - 1); temp[n - 1 + z] = 0; } // clear the last element for (int d = 1; d < n; d *= 2) // traverse down tree & build scan { offset >>= 1;
__syncthreads();
if (thid < d) { //D int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1; ai += CONFLICT_FREE_OFFSET(ai); bi += CONFLICT_FREE_OFFSET(bi); float t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; } } __syncthreads(); //E g_odata[ai] = temp[ai + bankOffsetA]; g_odata[bi] = temp[bi + bankOffsetB]; } void scanCPU(float *f_out, float *f_in, int i_n) { f_out[0] = 0; for (int i = 1; i < i_n; i++) f_out[i] = f_out[i-1] + f_in[i-1]; } #include
#include
#include

#define N 1024

__global__ void scan(float *g_odata, float *g_idata, int n);
__global__ void prescan(float *g_odata, float *g_idata, int n);
void scanCPU(float *f_out, float *f_in, int i_n);

double myDiffTime(struct timeval &start, struct timeval &end)
{
double d_start, d_end;
d_start = (double)(start.tv_sec + start.tv_usec/1000000.0);
d_end = (double)(end.tv_sec + end.tv_usec/1000000.0);
return (d_end – d_start);
}

int main()
{
float a[N], c[N], g[N];
timeval start, end;

float *dev_a, *dev_g;
int size = N * sizeof(float);

double d_gpuTime, d_cpuTime;

// initialize matrices a
for (int i = 0; i < N; i++) { a[i] = (float)(rand() % 1000000) / 1000.0; //printf("a[%i] = %f\n", i, a[i]); } // initialize a and b matrices here cudaMalloc((void **) &dev_a, size); cudaMalloc((void **) &dev_g, size); gettimeofday(&start, NULL); cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice); //scan<<<1,N,2*N*sizeof(float)>>>(dev_g, dev_a, N);
prescan<<<1,N,2*N*sizeof(float)>>>(dev_g, dev_a, N);
cudaDeviceSynchronize();

cudaMemcpy(g, dev_g, size, cudaMemcpyDeviceToHost);

gettimeofday(&end, NULL);
d_gpuTime = myDiffTime(start, end);

gettimeofday(&start, NULL);
scanCPU(c, a, N);

gettimeofday(&end, NULL);
d_cpuTime = myDiffTime(start, end);

cudaFree(dev_a); cudaFree(dev_g);

for (int i = 0; i < N; i++) { printf("c[%i] = %0.3f, g[%i] = %0.3f\n", i, c[i], i, g[i]); //if (c[i] != g[i]) //{ // printf("Results do not match! c[%i]=%f, g[%i]=%f\n", i, c[i], i, g[i]); // break; //} } printf("GPU Time for scan size %i: %f\n", N, d_gpuTime); printf("CPU Time for scan size %i: %f\n", N, d_cpuTime); } __global__ void scan(float *g_odata, float *g_idata, int n) { extern __shared__ float temp[]; // allocated on invocation int thid = threadIdx.x; int pout = 0, pin = 1; // Load input into shared memory. // This is exclusive scan, so shift right by one // and set first element to 0 temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;
__syncthreads();
for (int offset = 1; offset < n; offset *= 2) { pout = 1 - pout; // swap double buffer indices pin = 1 - pout; if (thid >= offset)
temp[pout*n+thid] += temp[pin*n+thid – offset];
else
temp[pout*n+thid] = temp[pin*n+thid];

__syncthreads();
}
g_odata[thid] = temp[pout*n+thid]; // write output
}

__global__ void prescan(float *g_odata, float *g_idata, int n)
{
extern __shared__ float temp[];
// allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = g_idata[2*thid];
// load input into shared memory
temp[2*thid+1] = g_idata[2*thid+1];
for (int d = n>>1; d > 0; d >>= 1)
// build sum in place up the tree
{
__syncthreads();
if (thid < d) { int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1; temp[bi] += temp[ai]; } offset *= 2; } if (thid == 0) { temp[n - 1] = 0; } // clear the last element for (int d = 1; d < n; d *= 2) // traverse down tree & build scan { offset >>= 1;
__syncthreads();
if (thid < d) { int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1; float t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; } } __syncthreads(); g_odata[2*thid] = temp[2*thid]; // write results to device memory g_odata[2*thid+1] = temp[2*thid+1]; } void scanCPU(float *f_out, float *f_in, int i_n) { f_out[0] = 0; for (int i = 1; i < i_n; i++) f_out[i] = f_out[i-1] + f_in[i-1]; } commands.txt cuda_tutorial11.cu CUDA_Tutorial_3.cu m_add.cu m_mult.cu m_multTile.cu v_add.cu v_bankScan.cu v_naiveScan.cu