#include
#include
#include
#include
// Copyright By PowCoder (ghostwriting service; add WeChat: powcoder)
#include
// kernel routine
// Each thread stores its own index within its block (threadIdx.x), converted
// to float, into the element of x selected by its flat 1D global thread index.
//
// Launch layout: only the x dimension of the grid and block is used for
// indexing.
// Precondition: there is no bounds guard, so x must hold at least
// gridDim.x * blockDim.x floats; every launched thread performs a write.
__global__ void my_first_kernel(float *x)
{
  // Flat global index: offset within the block plus the block's starting offset.
  int tid = threadIdx.x + blockDim.x*blockIdx.x;

  // The stored value is the block-local index, not tid, so the same
  // 0..blockDim.x-1 sequence is written once per block.
  x[tid] = (float)threadIdx.x;
}
// main code
int main(int argc, const char **argv)
// set number of blocks, and threads per block
int nblocks = 2;
int nthreads = 8;
int nsize = nblocks*nthreads ;
// allocate memory for array
float *x_d;
cudaMalloc((void **)&x_d, nsize*sizeof(float));
std::vector
// execute kernel
my_first_kernel<<
// copy back results and print them out
cudaMemcpy(&x_h[0], x_d, nsize*sizeof(float), cudaMemcpyDeviceToHost);
for (int n=0;n