// NOTE(review): the original "#include <...>" header names were stripped
// (likely by HTML extraction eating everything between '<' and '>').
// Reconstructed from the identifiers used below -- printf/fprintf (cstdio),
// malloc/free (cstdlib), the CUDA runtime API (cuda_runtime.h), and the
// gettimeofday-based timestamp() helper (sys/time.h). Confirm against the
// original source.
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <sys/time.h>
#include <cuda_runtime.h>

// Square matrix dimension (N x N) and number of timed kernel launches.
#define N 800
#define ITERATIONS 10
// Thread-block shape: 32 x 8 = 256 threads per block.
#define DIM_THREAD_BLOCK_X 32
#define DIM_THREAD_BLOCK_Y 8
using namespace std;
/*
 * NOTE(review): this region was badly mangled -- lines truncated at every
 * '<' (the kernel guard at "if( (i", the timing-loop condition), the kernel
 * launch lines and the opening of main() missing entirely, and smart quotes
 * inside the printf format strings. Reconstructed below from the visible
 * skeleton: an SGEMM kernel C = a*A*B + b*C, a warm-up launch + readback,
 * a timed loop of ITERATIONS launches, and GFLOPS/bandwidth reporting.
 */

// Tile width along the k (inner-product) dimension. One A slab
// (blockDim.y x TILE_K floats) and one B slab (TILE_K x blockDim.x floats)
// fit inside the shared memory the original reserved (2 x 32*8 floats).
#define TILE_K DIM_THREAD_BLOCK_Y

/*
 * SGEMM kernel: C = a * (A x B) + b * C for row-major n x n matrices.
 *
 * Expected launch layout: blockDim = (DIM_THREAD_BLOCK_X, DIM_THREAD_BLOCK_Y)
 * = (32, 8); gridDim sized so the grid covers all n columns and n rows.
 * Each thread computes one element of C; A and B are staged through shared
 * memory in TILE_K-wide slabs. Out-of-range loads are zero-filled so the
 * __syncthreads() barriers are reached by every thread in the block even
 * when n is not a multiple of the block shape.
 */
__global__ void sgemm(float *A, float *B, float *C, int n, float a, float b) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;  // column of C
    int i = blockIdx.y * blockDim.y + threadIdx.y;  // row of C

    // sA: A sub-tile, blockDim.y rows x TILE_K k-columns.
    // sB: B sub-tile, TILE_K k-rows x blockDim.x columns.
    __shared__ float sA[DIM_THREAD_BLOCK_Y][TILE_K];
    __shared__ float sB[TILE_K][DIM_THREAD_BLOCK_X];

    float sum = 0.0f;
    for (int kk = 0; kk < n; kk += TILE_K) {
        // Stage the next k-slab; zero-fill past the matrix edge.
        if (threadIdx.x < TILE_K)
            sA[threadIdx.y][threadIdx.x] =
                (i < n && kk + threadIdx.x < n) ? A[i * n + kk + threadIdx.x] : 0.0f;
        sB[threadIdx.y][threadIdx.x] =
            (kk + threadIdx.y < n && j < n) ? B[(kk + threadIdx.y) * n + j] : 0.0f;
        __syncthreads();  // slab fully written before anyone reads it

        #pragma unroll
        for (int k = 0; k < TILE_K; ++k)
            sum += sA[threadIdx.y][k] * sB[k][threadIdx.x];
        __syncthreads();  // slab fully consumed before it is overwritten
    }

    if (i < n && j < n)
        C[i * n + j] = a * sum + b * C[i * n + j];
}

// Wall-clock time in seconds (microsecond resolution), used for the
// host-side timing loop below.
static double timestamp() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1e-6;
}

int main() {
    // NOTE(review): alpha/beta and the input initialization were in the
    // lost portion of the file; these values are placeholders -- confirm
    // against the original benchmark.
    const float alpha = 1.5f;
    const float beta = 2.5f;
    const size_t bytes = sizeof(float) * N * N;

    // Host buffers (C_gpu_final receives the device result for inspection).
    float *A = (float *)malloc(bytes);
    float *B = (float *)malloc(bytes);
    float *C = (float *)malloc(bytes);
    float *C_gpu_final = (float *)malloc(bytes);
    if (!A || !B || !C || !C_gpu_final) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    for (int idx = 0; idx < N * N; ++idx) {
        A[idx] = (float)(idx % 100) / 100.0f;
        B[idx] = (float)(idx % 50) / 50.0f;
        C[idx] = 0.0f;
    }

    // Device buffers and host-to-device transfer.
    float *A_gpu, *B_gpu, *C_gpu;
    cudaMalloc((void **)&A_gpu, bytes);
    cudaMalloc((void **)&B_gpu, bytes);
    cudaMalloc((void **)&C_gpu, bytes);
    cudaMemcpy(A_gpu, A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(B_gpu, B, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(C_gpu, C, bytes, cudaMemcpyHostToDevice);

    // One thread per C element; ceil-div so partial edge tiles are covered.
    dim3 block(DIM_THREAD_BLOCK_X, DIM_THREAD_BLOCK_Y);
    dim3 grid((N + block.x - 1) / block.x, (N + block.y - 1) / block.y);

    // Warm-up launch (excluded from timing) and result readback.
    sgemm<<<grid, block>>>(A_gpu, B_gpu, C_gpu, N, alpha, beta);
    cudaDeviceSynchronize();
    cudaMemcpy(C_gpu_final, C_gpu, sizeof(float) * N * N, cudaMemcpyDeviceToHost);

    double time1 = timestamp();
    for (int numOfTimes = 0; numOfTimes < ITERATIONS; numOfTimes++) {
        sgemm<<<grid, block>>>(A_gpu, B_gpu, C_gpu, N, alpha, beta);
    }
    cudaDeviceSynchronize();  // launches are async; wait before stopping the clock
    double time2 = timestamp();
    double time = (time2 - time1) / ITERATIONS;

    // 2*N^3 flops per SGEMM (one mul + one add per inner-product term).
    // Computed in double: the original all-int expression
    // 2*N/1000*N*N/(1000000) truncated to 0 for N=800.
    double gflops = 2.0 * N * N * N / 1e9;
    double gflopsPerSecond = gflops / time;
    double GB = (double)(N)*N * 4 / 1000000000;
    double GBpS = (double)(N)*N * 4 / 1000000000 / time;
    printf("GFLOPS/s=%lf\n", gflopsPerSecond);
    printf("GB/s=%lf\n", GBpS);
    printf("GFLOPS=%lf\n", gflops);
    printf("GB=%lf\n", GB);
    printf("time(s)=%lf\n", time);

    cudaFree(A_gpu);
    cudaFree(B_gpu);
    cudaFree(C_gpu);
    free(A);
    free(B);
    free(C);
    free(C_gpu_final);
    return 0;
}