#include
#include
#include
#include
// Copyright By PowCoder (coursework-writing service; WeChat: powcoder)
#include
#include “helper_cuda.h”
// kernel routine
/**
 * Writes each thread's index within its block into a flat output array.
 *
 * Each thread computes its global 1D index
 * (threadIdx.x + blockDim.x * blockIdx.x) and stores (float)threadIdx.x at
 * that position, so every block's slice of x ends up holding
 * 0, 1, ..., blockDim.x - 1.
 *
 * Launch layout: only the x dimension of the grid and block is used.
 *
 * Precondition: the store is not bounds-checked, so x must be
 * device-accessible and hold at least gridDim.x * blockDim.x floats.
 *
 * @param x  output array, indexed by global thread index
 */
__global__ void my_first_kernel(float *x)
{
  // Global 1D thread index across all blocks of the launch.
  int tid = threadIdx.x + blockDim.x*blockIdx.x;

  // Record the thread's position inside its own block.
  x[tid] = (float)threadIdx.x;
}
// main code
int main(int argc, const char **argv)
// initialise card
findCudaDevice(argc, argv);
// set number of blocks, and threads per block
int nblocks = 2;
int nthreads = 8;
int nsize = nblocks*nthreads ;
// allocate memory for array
checkCudaErrors(cudaMallocManaged(&x, nsize*sizeof(float)));
// execute kernel
my_first_kernel<<
getLastCudaError(“my_first_kernel execution failed\n”);
// synchronize to wait for kernel to finish, and data copied back
cudaDeviceSynchronize();
for (int n=0;n