CS代考 #include

#include
#include
#include
#include


#include

// kernel routine

// Each thread stores its own block-local thread index, as a float, into x.
//
// Launch layout: only the .x components of threadIdx / blockIdx / blockDim
// are used, so this expects a 1D grid of 1D blocks.
//
// Parameters:
//   x - array written by the kernel; element x[tid] receives
//       (float)threadIdx.x, where tid is the thread's global 1D index.
//
// Precondition: there is no bounds check, so x must hold at least
// gridDim.x * blockDim.x floats or the store below goes out of range.
__global__ void my_first_kernel(float *x)
{
  // Global 1D index: offset within the block plus the block's starting offset.
  int tid = threadIdx.x + blockDim.x*blockIdx.x;

  // The stored value is the index *within the block*, not the global index,
  // so the same sequence 0..blockDim.x-1 repeats for every block.
  x[tid] = (float)threadIdx.x;
}

// main code

int main(int argc, const char **argv)
// set number of blocks, and threads per block
int nblocks = 2;
int nthreads = 8;
int nsize = nblocks*nthreads ;

// allocate memory for array
float *x_d;
cudaMalloc((void **)&x_d, nsize*sizeof(float));
std::vector x_h(nsize);

// execute kernel
my_first_kernel<<>>(x_d);

// copy back results and print them out
cudaMemcpy(&x_h[0], x_d, nsize*sizeof(float), cudaMemcpyDeviceToHost);

for (int n=0;nCS代考 加微信: powcoder QQ: 1823890830 Email: powcoder@163.com