#include "cuda.cuh"

#include <cstdio>
#include <cstdlib>
#include <cstring>

#include "helper.h"

/// Algorithm storage: file-scope state used by the functions in this file.

// Host-side copy of the input image (its struct fields and pixel data are copied in cuda_begin()).
Image cuda_input_image;

// Number of whole tiles along each image dimension (host copies, computed in cuda_begin()).
unsigned int cuda_TILES_X = 0u;
unsigned int cuda_TILES_Y = 0u;

// Device buffer for the per-tile, per-channel sums of the mosaic.
// This is a device pointer: it must be passed to a kernel to be used on device.
unsigned long long* d_mosaic_sum = nullptr;

// Device buffer for the output pixel value of each tile (one byte per tile per channel).
// This is a device pointer: it must be passed to a kernel to be used on device.
unsigned char* d_mosaic_value = nullptr;

// Device buffer holding the input image pixel data.
// This is a device pointer: it must be passed to a kernel to be used on device.
unsigned char* d_input_image_data = nullptr;

// Device buffer holding the output image pixel data.
// This is a device pointer: it must be passed to a kernel to be used on device.
unsigned char* d_output_image_data = nullptr;

// Device buffer for the global pixel average sum (one accumulator per channel).
// This is a device pointer: it must be passed to a kernel to be used on device.
unsigned long long* d_global_pixel_sum = nullptr;

/**
 * Prepare host and device storage for the CUDA implementation.
 *
 * Takes a private host copy of the input image, computes the tile grid
 * (width / TILE_SIZE by height / TILE_SIZE, integer division), and allocates
 * the device buffers used by the stage functions. The input pixel data is
 * uploaded to d_input_image_data and the global pixel sum accumulators are
 * zeroed.
 *
 * @param input_image Image to process; its data pointer must reference
 *                    width * height * channels bytes of pixel data.
 *
 * Note: allocations made here are intended to be released by cuda_end().
 */
void cuda_begin(const Image *input_image) {
    // These are suggested CUDA memory allocations that match the CPU implementation
    // If you would prefer, you can rewrite this function (and cuda_end()) to suit your preference

    cuda_TILES_X = input_image->width / TILE_SIZE;
    cuda_TILES_Y = input_image->height / TILE_SIZE;

    // Widen to size_t BEFORE multiplying so large images cannot overflow
    // narrower integer arithmetic while computing byte counts.
    const size_t channels = static_cast<size_t>(input_image->channels);
    const size_t tile_elements = static_cast<size_t>(cuda_TILES_X) * static_cast<size_t>(cuda_TILES_Y) * channels;
    const size_t image_data_size = static_cast<size_t>(input_image->width) * static_cast<size_t>(input_image->height) * channels * sizeof(unsigned char);

    // Allocate buffer for calculating the sum of each tile mosaic
    CUDA_CALL(cudaMalloc(&d_mosaic_sum, tile_elements * sizeof(unsigned long long)));

    // Allocate buffer for storing the output pixel value of each tile
    CUDA_CALL(cudaMalloc(&d_mosaic_value, tile_elements * sizeof(unsigned char)));

    // Allocate copy of input image (struct copy first, then replace the data pointer with our own buffer)
    cuda_input_image = *input_image;
    cuda_input_image.data = (unsigned char*)malloc(image_data_size);
    if (cuda_input_image.data == nullptr && image_data_size != 0) {
        // Fail loudly instead of dereferencing a null pointer in memcpy below
        fprintf(stderr, "cuda_begin: failed to allocate %zu bytes for the host image copy\n", image_data_size);
        exit(EXIT_FAILURE);
    }
    memcpy(cuda_input_image.data, input_image->data, image_data_size);

    // Allocate and fill device buffer for storing input image data
    CUDA_CALL(cudaMalloc(&d_input_image_data, image_data_size));
    CUDA_CALL(cudaMemcpy(d_input_image_data, input_image->data, image_data_size, cudaMemcpyHostToDevice));

    // Allocate device buffer for storing output image data
    CUDA_CALL(cudaMalloc(&d_output_image_data, image_data_size));

    // Allocate and zero buffer for calculation global pixel average
    // (cudaMalloc does not initialise memory, so the accumulators must be cleared explicitly)
    CUDA_CALL(cudaMalloc(&d_global_pixel_sum, channels * sizeof(unsigned long long)));
    CUDA_CALL(cudaMemset(d_global_pixel_sum, 0, channels * sizeof(unsigned long long)));
}

/**
 * Stage 1 of the CUDA implementation.
 *
 * Currently an unimplemented placeholder: it performs no work. The
 * commented-out calls show the skip/validation hooks that are meant to be
 * wired up (presumably the tile-sum stage, going by skip_tile_sum /
 * validate_tile_sum - confirm against helper.h).
 */
void cuda_stage1() {
    // Optionally during development call the skip function with the correct inputs to skip this stage
    // skip_tile_sum(input_image, mosaic_sum);

#ifdef VALIDATION
    // TODO: Uncomment and call the validation function with the correct inputs
    // You will need to copy the data back to host before passing to these functions
    // (Ensure that data copy is carried out within the ifdef VALIDATION so that it doesn't affect your benchmark results!)
    // validate_tile_sum(&input_image, mosaic_sum);
#endif
}

/**
 * Stage 2 of the CUDA implementation.
 *
 * Currently an unimplemented placeholder: it performs no work and does not
 * write to output_global_average. The commented-out calls show the
 * skip/validation hooks that are meant to be wired up (presumably the
 * compact-mosaic / global-average stage, going by skip_compact_mosaic /
 * validate_compact_mosaic - confirm against helper.h).
 *
 * @param output_global_average Host buffer intended to receive the global
 *                              average pixel value; untouched by this stub.
 */
void cuda_stage2(unsigned char* output_global_average) {
    // Optionally during development call the skip function with the correct inputs to skip this stage
    // skip_compact_mosaic(TILES_X, TILES_Y, mosaic_sum, compact_mosaic, global_pixel_average);

#ifdef VALIDATION
    // TODO: Uncomment and call the validation functions with the correct inputs
    // You will need to copy the data back to host before passing to these functions
    // (Ensure that data copy is carried out within the ifdef VALIDATION so that it doesn't affect your benchmark results!)
    // validate_compact_mosaic(TILES_X, TILES_Y, mosaic_sum, mosaic_value, output_global_average);
#endif
}

/**
 * Stage 3 of the CUDA implementation.
 *
 * Currently an unimplemented placeholder: it performs no work. The
 * commented-out calls show the skip/validation hooks that are meant to be
 * wired up (presumably the broadcast stage, going by skip_broadcast /
 * validate_broadcast - confirm against helper.h).
 */
void cuda_stage3() {
    // Optionally during development call the skip function with the correct inputs to skip this stage
    // skip_broadcast(input_image, compact_mosaic, output_image);

#ifdef VALIDATION
    // TODO: Uncomment and call the validation function with the correct inputs
    // You will need to copy the data back to host before passing to these functions
    // (Ensure that data copy is carried out within the ifdef VALIDATION so that it doesn't affect your benchmark results!)
    // validate_broadcast(&input_image, mosaic_value, &output_image);
#endif
}

/**
 * Copy the result back to the host and release all storage.
 *
 * Fills in the dimensions of output_image from the stored host copy of the
 * input image, copies the device output buffer into output_image->data, then
 * frees the host image copy and every device buffer declared in this file.
 *
 * @param output_image Destination image. Its data pointer must already
 *                     reference at least width * height * channels bytes of
 *                     host memory; this function does not allocate it.
 */
void cuda_end(Image *output_image) {
    // This function matches the provided cuda_begin(), you may change it if desired

    // Store return value
    output_image->width = cuda_input_image.width;
    output_image->height = cuda_input_image.height;
    output_image->channels = cuda_input_image.channels;

    // Widen to size_t before multiplying so large images cannot overflow narrower integer arithmetic
    const size_t image_data_size = static_cast<size_t>(output_image->width) * static_cast<size_t>(output_image->height) * static_cast<size_t>(output_image->channels) * sizeof(unsigned char);
    CUDA_CALL(cudaMemcpy(output_image->data, d_output_image_data, image_data_size, cudaMemcpyDeviceToHost));

    // Release allocations
    free(cuda_input_image.data);
    CUDA_CALL(cudaFree(d_mosaic_value));
    CUDA_CALL(cudaFree(d_mosaic_sum));
    CUDA_CALL(cudaFree(d_input_image_data));
    CUDA_CALL(cudaFree(d_output_image_data));
    // The global pixel sum buffer is also device-allocated and must be released to avoid a leak
    CUDA_CALL(cudaFree(d_global_pixel_sum));

    // Clear the stale pointers so an accidental second call frees null (a no-op) rather than dangling memory
    cuda_input_image.data = nullptr;
    d_mosaic_value = nullptr;
    d_mosaic_sum = nullptr;
    d_input_image_data = nullptr;
    d_output_image_data = nullptr;
    d_global_pixel_sum = nullptr;
}