CS计算机代考程序代写 cuda GPU algorithm b'code (1).tar.gz'

b’code (1).tar.gz’

[ 0.03,
0.029,
0.028,
0.027,
0.026,
0.0251037,
0.0247694,
0.024435,
0.0241007,
0.0237664,
0.023432,
0.0230977,
0.0227634,
0.0224291,
0.0221155,
0.021808 ] // Standard Result for the Small Dataset of PMPH Project.

16 // OUTER
32 // NUM_X
256 // NUM_Y
90 // NUM_T

[ 0.03,
0.029,
0.0283208,
0.0279088,
0.0274968,
0.0270848,
0.0266728,
0.0262608,
0.0258723,
0.0254883,
0.0251043,
0.0247204,
0.0243364,
0.0239561,
0.0236013,
0.0232464,
0.0228915,
0.0225366,
0.0221817,
0.0218386,
0.0215117,
0.0211848,
0.020858,
0.0205311,
0.0202043,
0.0198962,
0.0195957,
0.0192953,
0.0189949,
0.0186945,
0.018394,
0.0181183 ] // Standard Result for the Medium Dataset of PMPH Project.

32 // OUTER
47 // NUM_X
181 // NUM_Y
93 // NUM_T

[ 0.03,
0.0291938,
0.0287513,
0.0283132,
0.0278803,
0.0274529,
0.0270311,
0.026615,
0.0262046,
0.0258003,
0.0254017,
0.0250088,
0.0246216,
0.02424,
0.0238639,
0.0234934,
0.0231283,
0.0227686,
0.0224142,
0.0220651,
0.0217212,
0.0213824,
0.0210487,
0.02072,
0.0203962,
0.0200774,
0.0197633,
0.019454,
0.0191494,
0.0188494,
0.018554,
0.0182632,
0.017977,
0.017695,
0.0174174,
0.0171441,
0.0168749,
0.0166099,
0.0163489,
0.0160919,
0.0158389,
0.0155898,
0.0153445,
0.015103,
0.0148652,
0.0146311,
0.0144006,
0.0141737,
0.0139503,
0.0137304,
0.0135138,
0.0133007,
0.0130909,
0.0128844,
0.0126812,
0.0124812,
0.0122842,
0.0120903,
0.0118995,
0.0117116,
0.0115266,
0.0113446,
0.0111653,
0.0109889,
0.0108152,
0.0106443,
0.010476,
0.0103104,
0.0101473,
0.00998681,
0.00982882,
0.00967331,
0.00952024,
0.00936957,
0.00922132,
0.00907546,
0.00893188,
0.00879056,
0.00865146,
0.00851453,
0.00837976,
0.00824711,
0.00811654,
0.00798802,
0.00786152,
0.00773701,
0.00761445,
0.00749382,
0.00737509,
0.00725823,
0.0071432,
0.00702999,
0.00691855,
0.00680886,
0.00670091,
0.00659464,
0.00649011,
0.00638725,
0.006286,
0.00618635,
0.00608826,
0.00599171,
0.00589667,
0.00580313,
0.00571106,
0.00562043,
0.00553123,
0.00544343,
0.005357,
0.00527193,
0.00518819,
0.00510577,
0.00502464,
0.00494478,
0.00486617,
0.00478879,
0.00471262,
0.00463764,
0.00456389,
0.00449129,
0.00441983,
0.00434948,
0.00428023,
0.00421206,
0.00414494,
0.00407888,
0.00401383,
0.0039498 ] // Standard Result for the Large Dataset of PMPH Project.

128 // OUTER
256 // NUM_X
256 // NUM_Y
128 // NUM_T

To compile and run:
$ make clean; make; make run_small
_medium
_large

Folder `OrigImpl’ contains the original implementation:
— `ProjectMain.cpp’ contains the main function
— `ProjCoreOrig.cpp’ contains the core functions
(to parallelize)
— `ProjHelperFun.cpp’ contains the functions that compute
the input parameters, and
(can be parallelize as well)

Folder `include’ contains
— `ParserC.h’ implements a simple parser
— `ParseInput.h’ reads the input/output data
and provides validation.
— `OpenmpUtil.h’ some OpenMP-related helpers.
— `Constants.h’ currently only sets up REAL
to either double or float
based on the compile-time
parameter WITH_FLOATS.

— `CudaUtilProj.cu.h’ provides stubs for calling
transposition and inclusive
(segmented) scan.
— `TestCudaUtil.cu’ a simple tester for
transposition and scan.

Folder `ParTridagCuda` contains code that demonstrates how TRIDAG can be parallelized by intra-block scans, i.e., it assumes that NUM_X, NUM_Y are a power of 2 less than or equal to 1024.

#include “ProjHelperFun.h”
#include “Constants.h”
#include “TridagPar.h”

void updateParams(const unsigned g, const REAL alpha, const REAL beta, const REAL nu, PrivGlobs& globs)
{
for(unsigned i=0;i& a, // size [n]
const vector& b, // size [n]
const vector& c, // size [n]
const vector& r, // size [n]
const int n,
vector& u, // size [n]
vector& uu // size [n] temporary
) {
int i, offset;
REAL beta;

u[0] = r[0];
uu[0] = b[0];

for(i=1; i=0; i–) {
u[i] = (u[i] – c[i]*u[i+1]) / uu[i];
}
#else
// Hint: X) can be written smth like (once you make a non-constant)
for(i=0; i > u(numY, vector(numX)); // [numY][numX]
vector > v(numX, vector(numY)); // [numX][numY]
vector a(numZ), b(numZ), c(numZ), y(numZ); // [max(numX,numY)]
vector yy(numZ); // temporary used in tridag // [max(numX,numY)]

// explicit x
for(i=0;i 0) {
u[j][i] += 0.5*( 0.5*globs.myVarX[i][j]*globs.myDxx[i][0] )
* globs.myResult[i-1][j];
}
u[j][i] += 0.5*( 0.5*globs.myVarX[i][j]*globs.myDxx[i][1] )
* globs.myResult[i][j];
if(i < numX-1) { u[j][i] += 0.5*( 0.5*globs.myVarX[i][j]*globs.myDxx[i][2] ) * globs.myResult[i+1][j]; } } } // explicit y for(j=0;j 0) {
v[i][j] += ( 0.5*globs.myVarY[i][j]*globs.myDyy[j][0] )
* globs.myResult[i][j-1];
}
v[i][j] += ( 0.5*globs.myVarY[i][j]*globs.myDyy[j][1] )
* globs.myResult[i][j];
if(j < numY-1) { v[i][j] += ( 0.5*globs.myVarY[i][j]*globs.myDyy[j][2] ) * globs.myResult[i][j+1]; } u[j][i] += v[i][j]; } } // implicit x for(j=0;j=0;–i)
{
updateParams(i,alpha,beta,nu,globs);
rollback(i, globs);
}

return globs.myResult[globs.myXindex][globs.myYindex];
}

void run_OrigCPU(
const unsigned int& outer,
const unsigned int& numX,
const unsigned int& numY,
const unsigned int& numT,
const REAL& s0,
const REAL& t,
const REAL& alpha,
const REAL& nu,
const REAL& beta,
REAL* res // [outer] RESULT
) {
REAL strike;
PrivGlobs globs(numX, numY, numT);

for( unsigned i = 0; i < outer; ++ i ) { strike = 0.001*i; res[i] = value( globs, s0, strike, t, alpha, nu, beta, numX, numY, numT ); } } //#endif // PROJ_CORE_ORIG CXX = g++ LIB = -L$(OPENCL_LIBDIR) -lOpenCL CXXFLAGS = -fopenmp -O3 -DWITH_FLOATS=0 INCLUDES += -I ../include GPU_OPTS = -D lgWARP=5 SOURCES_CPP =ProjectMain.cpp ProjHelperFun.cpp ProjCoreOrig.cpp HELPERS =ProhHelperFun.h ../include/Constants.h ../include/ParseInput.h ../include/ParserC.h ../include/OpenmpUtil.h OBJECTS =ProjectMain.o ProjHelperFun.o ProjCoreOrig.o EXECUTABLE =runproject default: cpu .cpp.o: $(SOURCES_CPP) $(HELPERS) $(CXX) $(CXXFLAGS) $(GPU_OPTS) $(INCLUDES) -c -o $@ $< cpu: $(EXECUTABLE) $(EXECUTABLE): $(OBJECTS) $(CXX) $(CXXFLAGS) $(INCLUDES) -o $(EXECUTABLE) $(OBJECTS) run_small: $(EXECUTABLE) cat ../Data/Small/input.data ../Data/Small/output.data | ./$(EXECUTABLE) 2> Debug.txt

run_medium: $(EXECUTABLE)
cat ../Data/Medium/input.data ../Data/Medium/output.data | ./$(EXECUTABLE) 2> Debug.txt

run_large: $(EXECUTABLE)
cat ../Data/Large/input.data ../Data/Large/output.data | ./$(EXECUTABLE) 2> Debug.txt

clean:
rm -f Debug.txt $(EXECUTABLE) $(OBJECTS)

#include “ProjHelperFun.h”

/**************************/
/**** HELPER FUNCTIONS ****/
/**************************/

/**
* Fills in:
* globs.myTimeline of size [0..numT-1]
* globs.myX of size [0..numX-1]
* globs.myY of size [0..numY-1]
* and also sets
* globs.myXindex and globs.myYindex (both scalars)
*/
void initGrid( const REAL s0, const REAL alpha, const REAL nu,const REAL t,
const unsigned numX, const unsigned numY, const unsigned numT, PrivGlobs& globs
) {
for(unsigned i=0;i(s0/dx) % numX;

for(unsigned i=0;i(numY/2.0);

for(unsigned i=0;i& x,
vector >& Dxx
) {
const unsigned n = x.size();

REAL dxl, dxu;

// lower boundary
dxl = 0.0;
dxu = x[1] – x[0];

Dxx[0][0] = 0.0;
Dxx[0][1] = 0.0;
Dxx[0][2] = 0.0;
Dxx[0][3] = 0.0;

// standard case
for(unsigned i=1;i
if 0 < i then {b[i], 0.0-a[i]*c[i-1], 1.0, 0.0} else {1.0, 0.0, 0.0, 1.0} , iota(n) ) in let scmt = scan( fn {real,real,real,real} ( {real,real,real,real} a, {real,real,real,real} b ) =>
let {a0,a1,a2,a3} = a in
let {b0,b1,b2,b3} = b in
let val = 1.0/(a0*b0) in
{ (b0*a0 + b1*a2)*val,
(b0*a1 + b1*a3)*val,
(b2*a0 + b3*a2)*val,
(b2*a1 + b3*a3)*val
}
, {1.0, 0.0, 0.0, 1.0}, mats ) in
let b = map ( fn real ({real,real,real,real} tup) =>
let {t0,t1,t2,t3} = tup in
(t0*b0 + t1) / (t2*b0 + t3)
, scmt ) in
——————————————————
— Recurrence 2: y[i] = y[i] – (a[i]/b[i-1])*y[i-1] —
— solved by scan with linear func comp operator —
——————————————————
let y0 = y[0] in
let lfuns= map ( fn {real,real} (int i) =>
if 0 < i then {y[i], 0.0-a[i]/b[i-1]} else {0.0, 1.0 } , iota(n) ) in let cfuns= scan( fn {real,real} ({real,real} a, {real,real} b) =>
let {a0,a1} = a in
let {b0,b1} = b in
{ b0 + b1*a0, a1*b1 }
, {0.0, 1.0}, lfuns ) in
let y = map ( fn real ({real,real} tup) =>
let {a,b} = tup in
a + b*y0
, cfuns ) in
——————————————————
— Recurrence 3: backward recurrence solved via —
— scan with linear func comp operator —
——————————————————
let yn = y[n-1]/b[n-1] in
let lfuns= map ( fn {real,real} (int k) =>
let i = n-k-1
in if 0 < k then {y[i]/b[i], 0.0-c[i]/b[i]} else {0.0, 1.0 } , iota(n) ) in let cfuns= scan( fn {real,real} ({real,real} a, {real,real} b) =>
let {a0,a1} = a in
let {b0,b1} = b in
{b0 + b1*a0, a1*b1}
, {0.0, 1.0}, lfuns ) in
let y = map ( fn real ({real,real} tup) =>
let {a,b} = tup in
a + b*yn
, cfuns ) in
let y = map (fn real (int i) => y[n-i-1], iota(n)) in
copy(y)
#endif

struct MyReal4 {
REAL x;
REAL y;
REAL z;
REAL w;

// constructors
inline MyReal4() { x = y = z = w = 0.0; }
inline MyReal4(const REAL a, const REAL b, const REAL c, const REAL d) {
x = a; y = b; z = c; w = d;
}
// copy constructor
inline MyReal4(const MyReal4& i4) {
x = i4.x; y = i4.y; z = i4.z; w = i4.w;
}
// assignment operator
inline MyReal4& operator=(const MyReal4& i4) {
x = i4.x; y = i4.y; z = i4.z; w = i4.w;
return *this;
}
};

struct MatMult2b2 {
typedef MyReal4 OpTp;
static MyReal4 apply(const MyReal4 a, const MyReal4 b) {
REAL val = 1.0/(a.x*b.x);
return MyReal4( (b.x*a.x + b.y*a.z)*val,
(b.x*a.y + b.y*a.w)*val,
(b.z*a.x + b.w*a.z)*val,
(b.z*a.y + b.w*a.w)*val );
}
};

struct MyReal2 {
REAL x;
REAL y;
// constructors
inline MyReal2() { x = y = 0.0; }
inline MyReal2(const REAL a, const REAL b) {
x = a; y = b;
}
// copy constructor
inline MyReal2(const MyReal2& i4) {
x = i4.x; y = i4.y;
}
// assignment operator
inline MyReal2& operator=(const MyReal2& i4) {
x = i4.x; y = i4.y;
return *this;
}
};

struct LinFunComp {
typedef MyReal2 OpTp;
static MyReal2 apply(const MyReal2 a, const MyReal2 b) {
return MyReal2( b.x + b.y*a.x, a.y*b.y );
}
};

template
void inplaceScanInc(const int n, vector& inpres) {
typename OP::OpTp acc = inpres[0];
for(int i=1; i& a, // size [n]
const vector& b, // size [n]
const vector& c, // size [n]
const vector& r, // size [n]
const int n,
vector& u, // size [n]
vector& uu // size [n] temporary
) {
int i, offset;

//vector scanres(n); // supposed to also be in shared memory and to reuse the space of mats
//————————————————–
// Recurrence 1: b[i] = b[i] – a[i]*c[i-1]/b[i-1] —
// solved by scan with 2×2 matrix mult operator —
//————————————————–
vector mats(n); // supposed to be in shared memory!
REAL b0 = b[0];
for(int i=0; i(n,mats);
for(int i=0; i uu
//—————————————————-
// Recurrence 2: y[i] = y[i] – (a[i]/b[i-1])*y[i-1] —
// solved by scan with linear func comp operator —
//—————————————————-
vector lfuns(n);
REAL y0 = r[0];
for(int i=0; i(n,lfuns);
for(int i=0; i u

//—————————————————-
// Recurrence 3: backward recurrence solved via —
// scan with linear func comp operator —
//—————————————————-
REAL yn = u[n-1]/uu[n-1];
for(int i=0; i(n,lfuns);
for(int i=0; i
#include
#include
#include
#include “Constants.h”

using namespace std;

struct PrivGlobs {

// grid
vector myX; // [numX]
vector myY; // [numY]
vector myTimeline; // [numT]
unsigned myXindex;
unsigned myYindex;

// variable
vector > myResult; // [numX][numY]

// coeffs
vector > myVarX; // [numX][numY]
vector > myVarY; // [numX][numY]

// operators
vector > myDxx; // [numX][4]
vector > myDyy; // [numY][4]

PrivGlobs( ) {
printf(“Invalid Contructor: need to provide the array sizes! EXITING…!\n”);
exit(0);
}

PrivGlobs( const unsigned int& numX,
const unsigned int& numY,
const unsigned int& numT ) {
this-> myX.resize(numX);
this->myDxx.resize(numX);
for(int k=0; kmyDxx[k].resize(4);
}

this-> myY.resize(numY);
this->myDyy.resize(numY);
for(int k=0; kmyDyy[k].resize(4);
}

this->myTimeline.resize(numT);

this-> myVarX.resize(numX);
this-> myVarY.resize(numX);
this->myResult.resize(numX);
for(unsigned i=0;i myVarX[i].resize(numY);
this-> myVarY[i].resize(numY);
this->myResult[i].resize(numY);
}

}
} __attribute__ ((aligned (128)));

void initGrid( const REAL s0, const REAL alpha, const REAL nu,const REAL t,
const unsigned numX, const unsigned numY, const unsigned numT, PrivGlobs& globs
);

void initOperator( const vector& x,
vector >& Dxx
);

void updateParams(const unsigned g, const REAL alpha, const REAL beta, const REAL nu, PrivGlobs& globs);

void setPayoff(const REAL strike, PrivGlobs& globs );

void tridag(
const vector& a, // size [n]
const vector& b, // size [n]
const vector& c, // size [n]
const vector& r, // size [n]
const int n,
vector& u, // size [n]
vector& uu // size [n] temporary
);

void rollback( const unsigned g, PrivGlobs& globs );

REAL value( PrivGlobs globs,
const REAL s0,
const REAL strike,
const REAL t,
const REAL alpha,
const REAL nu,
const REAL beta,
const unsigned int numX,
const unsigned int numY,
const unsigned int numT
);

#endif // PROJ_HELPER_FUNS

#include “OpenmpUtil.h”
#include “ParseInput.h”

#include “ProjHelperFun.h”

int main()
{
unsigned int OUTER_LOOP_COUNT, NUM_X, NUM_Y, NUM_T;
const REAL s0 = 0.03, strike = 0.03, t = 5.0, alpha = 0.2, nu = 0.6, beta = 0.5;

readDataSet( OUTER_LOOP_COUNT, NUM_X, NUM_Y, NUM_T );

const int Ps = get_CPU_num_threads();
REAL* res = (REAL*)malloc(OUTER_LOOP_COUNT*sizeof(REAL));

{ // Original Program (Sequential CPU Execution)
cout<<"\n// Running Original, Sequential Project Program"<

/***************************************************/
/*********** MATRIX TRANSPOSITION HELPER ***********/
/***************************************************/

/**
* Matrix Transposition Tiled KERNEL
* (only coalesced accesses to global memory)
* ASSUMPTIONS:
* blockDim.y = TILE; blockDim.x = TILE
* each block transposes a square TILE
* INPUT:
* matrix `A’ of height `heightA’ (number of rows),
* and width `widthA’ (number of cols).
* OUTPUT:
* matrix `B’ of height `widthA’ (number of rows),
* and width `heightA’ (number of cols).
*/
template
__global__ void matTransposeTiledKer(T* A, T* B, int heightA, int widthA) {

__shared__ T shtileTR[TILE][TILE+1];

int x = blockIdx.x * TILE + threadIdx.x;
int y = blockIdx.y * TILE + threadIdx.y;

if( x < widthA && y < heightA ) shtileTR[threadIdx.y][threadIdx.x] = A[y*widthA + x]; __syncthreads(); x = blockIdx.y * TILE + threadIdx.x; y = blockIdx.x * TILE + threadIdx.y; if( x < heightA && y < widthA ) B[y*heightA + x] = shtileTR[threadIdx.x][threadIdx.y]; } /** * Matrix Transposition CPU Stub: * INPUT: * `inp_d' input matrix (already in device memory) * `height' number of rows of input matrix `inp_d' * `width' number of columns of input matrix `inp_d' * OUTPUT: * `out_d' the transposed matrix with * `width' rows and `height' columns! */ template
void transpose( float* inp_d,
float* out_d,
const unsigned int height,
const unsigned int width
) {
// 1. setup block and grid parameters
int dimy = ceil( ((float)height)/tile );
int dimx = ceil( ((float) width)/tile );
dim3 block(tile, tile, 1);
dim3 grid (dimx, dimy, 1);

//2. execute the kernel
matTransposeTiledKer <<< grid, block >>>
(inp_d, out_d, height, width);
cudaThreadSynchronize();
}

/***************************************************/
/*********** Data-Structures for Scan **************/
/***************************************************/

template
class Add {
public:
typedef T BaseType;
static __device__ __host__ inline T identity() { return (T)0; }
static __device__ __host__ inline T apply(const T t1, const T t2) { return t1 + t2; }
};

class MyInt4 {
public:
int x; int y; int z; int w;
__device__ __host__ inline MyInt4() {
x = 0; y = 0; z = 0; w = 0;
}
__device__ __host__ inline MyInt4(const int& a, const int& b, const int& c, const int& d) {
x = a; y = b; z = c; w = d;
}
__device__ __host__ inline MyInt4(const MyInt4& i4) {
x = i4.x; y = i4.y; z = i4.z; w = i4.w;
}
volatile __device__ __host__ inline MyInt4& operator=(const MyInt4& i4) volatile {
x = i4.x; y = i4.y; z = i4.z; w = i4.w;
return *this;
}
};

class MsspOp {
public:
typedef MyInt4 BaseType;
static __device__ inline MyInt4 identity() { return MyInt4(0,0,0,0); }
static __device__ inline MyInt4 apply(volatile MyInt4& t1, volatile MyInt4& t2) {
int mss = max(t1.x, max(t2.x,t1.z+t2.y));
int mis = max(t1.y, t1.w+t2.y);
int mcs = max(t2.z, t1.z+t2.w);
int t = t1.w + t2.w;
return MyInt4(mss, mis, mcs, t);
}
};

/***************************************************/
/********* INCLUSIVE SCAN HELPER and KERNEL ********/
/***************************************************/

template
__device__ inline
T scanIncWarp( volatile T* ptr, const unsigned int idx ) {
const unsigned int lane = idx & 31;

// no synchronization needed inside a WARP,
// i.e., SIMD execution
if (lane >= 1) ptr[idx] = OP::apply(ptr[idx-1], ptr[idx]);
if (lane >= 2) ptr[idx] = OP::apply(ptr[idx-2], ptr[idx]);
if (lane >= 4) ptr[idx] = OP::apply(ptr[idx-4], ptr[idx]);
if (lane >= 8) ptr[idx] = OP::apply(ptr[idx-8], ptr[idx]);
if (lane >= 16) ptr[idx] = OP::apply(ptr[idx-16], ptr[idx]);

return const_cast(ptr[idx]);
}

template
__device__ inline
T scanIncBlock(volatile T* ptr, const unsigned int idx) {
const unsigned int lane = idx & 31;
const unsigned int warpid = idx >> 5;

T val = scanIncWarp(ptr,idx);
__syncthreads();

// place the end-of-warp results in
// the first warp. This works because
// warp size = 32, and
// max block size = 32^2 = 1024
if (lane == 31) { ptr[warpid] = const_cast(ptr[idx]); }
__syncthreads();

//
if (warpid == 0) scanIncWarp(ptr, idx);
__syncthreads();

if (warpid > 0) {
val = OP::apply(ptr[warpid-1], val);
}

return val;
}

template
__global__ void
scanIncKernel(T* d_in, T* d_out, unsigned int d_size) {
extern __shared__ char sh_mem_inc_scan[];
volatile T* sh_memT = (volatile T*)sh_mem_inc_scan;
const unsigned int tid = threadIdx.x;
const unsigned int gid = blockIdx.x*blockDim.x + tid;
T el = (gid < d_size) ? d_in[gid] : OP::identity(); sh_memT[tid] = el; __syncthreads(); T res = scanIncBlock < OP, T >(sh_memT, tid);
if (gid < d_size) d_out [gid] = res; } /***********************************************************/ /*** Kernels to copy/distribute the end of block results ***/ /***********************************************************/ template
__global__ void
copyEndOfBlockKernel(T* d_in, T* d_out, unsigned int d_out_size) {
const unsigned int gid = blockIdx.x*blockDim.x + threadIdx.x;

if(gid < d_out_size) d_out[gid] = d_in[ blockDim.x*(gid+1) - 1]; } template
__global__ void
distributeEndBlock(T* d_in, T* d_out, unsigned int d_size) {
const unsigned int gid = blockIdx.x*blockDim.x + threadIdx.x;

if(gid < d_size && blockIdx.x > 0)
d_out[gid] = OP::apply(d_out[gid],d_in[blockIdx.x-1]);
}

template
__global__ void
shiftRightByOne(T* d_in, T* d_out, T ne, unsigned int d_size) {
const unsigned int gid = blockIdx.x*blockDim.x + threadIdx.x;

if (gid == 0) d_out[gid] = ne;
else if (gid < d_size) d_out[gid] = d_in[gid-1]; } /*************************************************/ /*************************************************/ /*** Segmented Inclusive Scan Helpers & Kernel ***/ /*************************************************/ /*************************************************/ template
__device__ inline
T sgmScanIncWarp(volatile T* ptr, volatile F* flg, const unsigned int idx) {
const unsigned int lane = idx & 31;

// no synchronization needed inside a WARP,
// i.e., SIMD execution
if (lane >= 1) {
if(flg[idx] == 0) { ptr[idx] = OP::apply(ptr[idx-1], ptr[idx]); }
flg[idx] = flg[idx-1] | flg[idx];
}
if (lane >= 2) {
if(flg[idx] == 0) { ptr[idx] = OP::apply(ptr[idx-2], ptr[idx]); }
flg[idx] = flg[idx-2] | flg[idx];
}
if (lane >= 4) {
if(flg[idx] == 0) { ptr[idx] = OP::apply(ptr[idx-4], ptr[idx]); }
flg[idx] = flg[idx-4] | flg[idx];
}
if (lane >= 8) {
if(flg[idx] == 0) { ptr[idx] = OP::apply(ptr[idx-8], ptr[idx]); }
flg[idx] = flg[idx-8] | flg[idx];
}
if (lane >= 16) {
if(flg[idx] == 0) { ptr[idx] = OP::apply(ptr[idx-16], ptr[idx]); }
flg[idx] = flg[idx-16] | flg[idx];
}

return const_cast(ptr[idx]);
}

template
__device__ inline
T sgmScanIncBlock(volatile T* ptr, volatile F* flg, const unsigned int idx) {
const unsigned int lane = idx & 31;
const unsigned int warpid = idx >> 5;
const unsigned int warplst= (warpid<<5) + 31; // 1a: record whether this warp begins with an ``open'' segment. bool warp_is_open = (flg[(warpid << 5)] == 0); __syncthreads(); // 1b: intra-warp segmented scan for each warp T val = sgmScanIncWarp(ptr,flg,idx);

// 2a: the last value is the correct partial result
T warp_total = const_cast(ptr[warplst]);

// 2b: warp_flag is the OR-reduction of the flags
// in a warp, and is computed indirectly from
// the mindex in hd[]
bool warp_flag = flg[warplst]!=0 || !warp_is_open;
bool will_accum= warp_is_open && (flg[idx] == 0);

__syncthreads();

// 2c: the last thread in a warp writes partial results
// in the first warp. Note that all fit in the first
// warp because warp = 32 and max block size is 32^2
if (lane == 31) {
ptr[warpid] = warp_total; //ptr[idx];
flg[warpid] = warp_flag;
}
__syncthreads();

//
if (warpid == 0) sgmScanIncWarp(ptr, flg, idx);
__syncthreads();

if (warpid > 0 && will_accum) {
val = OP::apply(ptr[warpid-1], val);
}
return val;
}

template
__global__ void
sgmScanIncKernel(T* d_in, int* flags, T* d_out,
int* f_rec, T* d_rec, unsigned int d_size) {
extern __shared__ char sh_mem_sgm_scan[];
volatile T* vals_sh = (volatile T*)sh_mem_sgm_scan;
volatile int* flag_sh = (int*) (vals_sh + blockDim.x);
const unsigned int tid = threadIdx.x;
const unsigned int gid = blockIdx.x*blockDim.x + tid;
int fl;
if (gid < d_size) { vals_sh[tid] = d_in[gid]; fl = flags[gid]; } else { vals_sh[tid] = OP::identity(); fl = 0; } flag_sh[tid] = fl; __syncthreads(); T res = sgmScanIncBlock (vals_sh, flag_sh, tid);
if (gid < d_size) d_out [gid] = res; // set the flags and data for the recursive step! if(tid == 0) { f_rec[blockIdx.x] = 0; } __syncthreads(); if(fl > 0) { f_rec[blockIdx.x] = 1; }
if(tid == (blockDim.x – 1)) { d_rec[blockIdx.x] = res; }
}

template
__global__ void
sgmDistributeEndBlock(T* d_rec_in, T* d_out, int* f_inds, unsigned int d_size) {
const unsigned int gid = blockIdx.x*blockDim.x + threadIdx.x;

if(gid < d_size && blockIdx.x > 0) {
if(f_inds[gid] == 0)
d_out[gid] = OP::apply(d_out[gid], d_rec_in[blockIdx.x-1]);
}
}

/**********************************************************************/
/******* Host/CPU Stubs for Inclusive Scan and Segmented Scan *********/
/**********************************************************************/
/**
* block_size is the size of the cuda block (must be a multiple
* of 32 less than 1025)
* d_size is the size of both the input and output arrays.
* d_in is the device array; it is supposably
* allocated and holds valid values (input).
* d_out is the output GPU array — if you want
* its data on CPU needs to copy it back to host.
*
* OP class denotes the associative binary operator
* and should have an implementation similar to
* `class Add’ in ScanUtil.cu, i.e., exporting
* `identity’ and `apply’ functions.
* T denotes the type on which OP operates,
* e.g., float or int.
*/
template
void scanInc( unsigned int block_size,
unsigned long d_size,
T* d_in, // device
T* d_out // device
) {
unsigned int num_blocks;

num_blocks = ( (d_size % block_size) == 0) ?
d_size / block_size :
d_size / block_size + 1 ;

//Assumes 32 bytes per thread of shared memory is sufficient!
scanIncKernel<<< num_blocks, block_size, block_size * 32 >>>(d_in, d_out, d_size);
cudaThreadSynchronize();

if (block_size >= d_size) { return; }

/**********************/
/*** Recursive Case ***/
/**********************/

// 1. allocate new device input & output array of size num_blocks
T *d_rec_in, *d_rec_out;
cudaMalloc((void**)&d_rec_in , num_blocks*sizeof(T));
cudaMalloc((void**)&d_rec_out, num_blocks*sizeof(T));

unsigned int num_blocks_rec = ( (num_blocks % block_size) == 0 ) ?
num_blocks / block_size :
num_blocks / block_size + 1 ;

// 2. copy in the end-of-block results of the previous scan
copyEndOfBlockKernel<<< num_blocks_rec, block_size >>>(d_out, d_rec_in, num_blocks);
cudaThreadSynchronize();

// 3. scan recursively the last elements of each CUDA block
scanInc( block_size, num_blocks, d_rec_in, d_rec_out );

// 4. distribute the the corresponding element of the
// recursively scanned data to all elements of the
// corresponding original block
distributeEndBlock<<< num_blocks, block_size >>>(d_rec_out, d_out, d_size);
cudaThreadSynchronize();

// 5. clean up
cudaFree(d_rec_in );
cudaFree(d_rec_out);
}

/**
* block_size is the size of the cuda block (must be a multiple
* of 32 less than 1025)
* d_size is the size of both the input and output arrays.
* d_in is the input array; it is supposably allocated on
* device and holds valid values (GPU device).
* flags is the flag array, in which !=0 indicates
* start of a segment.
* Allocated and holds valid values on device (GPU).
* d_out is the output GPU array on device (GPU) — if you want
* its data on CPU you need to copy it back to host.
*
* OP class denotes the associative binary operator
* and should have an implementation similar to
* `class Add’ in ScanUtil.cu, i.e., exporting
* `identity’ and `apply’ functions.
* T denotes the type on which OP operates,
* e.g., float or int.
*/
template
void sgmScanInc( const unsigned int block_size,
const unsigned long d_size,
T* d_in, //device
int* flags, //device
T* d_out //device
) {
unsigned int num_blocks;
//unsigned int val_sh_size = block_size * sizeof(T );
unsigned int flg_sh_size = block_size * sizeof(int);

num_blocks = ( (d_size % block_size) == 0) ?
d_size / block_size :
d_size / block_size + 1 ;

T *d_rec_in;
int *f_rec_in;
cudaMalloc((void**)&d_rec_in, num_blocks*sizeof(T ));
cudaMalloc((void**)&f_rec_in, num_blocks*sizeof(int));

// Assumes 32 bytes per thread of shared memory is sufficient!
sgmScanIncKernel <<< num_blocks, block_size, 32*block_size >>>
(d_in, flags, d_out, f_rec_in, d_rec_in, d_size);
cudaThreadSynchronize();
//cudaError_t err = cudaThreadSynchronize();
//if( err != cudaSuccess)
// printf(“cudaThreadSynchronize error: %s\n”, cudaGetErrorString(err));

if (block_size >= d_size) { cudaFree(d_rec_in); cudaFree(f_rec_in); return; }

// 1. allocate new device input & output array of size num_blocks
T *d_rec_out;
int *f_inds;
cudaMalloc((void**)&d_rec_out, num_blocks*sizeof(T ));
cudaMalloc((void**)&f_inds, d_size *sizeof(int ));

// 2. recursive segmented scan on the last elements of each CUDA block
sgmScanInc
( block_size, num_blocks, d_rec_in, f_rec_in, d_rec_out );

// 3. create an index array that is non-zero for all elements
// that correspond to an open segment that crosses two blocks,
// and different than zero otherwise. This is implemented
// as a CUDA-block level inclusive scan on the flag array,
// i.e., the segment that start the block has zero-flags,
// which will be preserved by the inclusive scan.
scanIncKernel,int> <<< num_blocks, block_size, flg_sh_size >>>
( flags, f_inds, d_size );

// 4. finally, accumulate the recursive result of segmented scan
// to the elements from the first segment of each block (if
// segment is open).
sgmDistributeEndBlock <<< num_blocks, block_size >>>
( d_rec_out, d_out, f_inds, d_size );
cudaThreadSynchronize();

// 5. clean up
cudaFree(d_rec_in );
cudaFree(d_rec_out);
cudaFree(f_rec_in );
cudaFree(f_inds );
}

#endif //CUDA_PROJ_HELPER

#ifndef PARSE_INPUT
#define PARSE_INPUT

#include “ParserC.h”
#include
#include
#include

#include
#include
#include
//#include
#include

using namespace std;

#include
using std::cout;
using std::endl;

#include “Constants.h”

#define MAX_VAL 1024

#if WITH_FLOATS
#define read_real read_float
#else
#define read_real read_double
#endif

const float EPS = 0.00001;

/*******************************************************/
/***** Utilities Related to Time Instrumentation *****/
/*******************************************************/

fprintf(stderr, “Syntax error when reading the dataset, i.e., four ints.\n”);
exit(1);
}

{ // check dataset invariants:
bool atr_ok = true;

atr_ok = outer > 0;
assert(atr_ok && “Outer loop count less than 0!”);

atr_ok = (num_X > 0) && (num_X <= MAX_VAL); assert(atr_ok && "Illegal NUM_X value!"); atr_ok = (num_Y > 0) && (num_Y <= MAX_VAL); assert(atr_ok && "Illegal NUM_X value!"); atr_ok = num_T > 0;
assert(atr_ok && “NUM_T value less or equal to zero!!”);
}
}

REAL* readOutput( const int& N ) {
REAL* result;
int64_t shape[3];

// reading the standard output
if (read_array(sizeof(REAL), read_real, (void**)&result, shape, 1) ) {
fprintf(stderr, “Syntax error when reading the output.\n”);
exit(1);
}

bool ok = ( shape[0] == N );
assert(ok && “Incorrect shape of the standard result!”);

return result;
}

bool validate( const REAL* res, const int& N ) {
bool is_valid = true;

REAL* std_res = readOutput( N );

for ( int i = 0; i < N; i ++ ) { float err = fabs(std_res[i] - res[i]); if ( isnan(res[i]) || isinf(res[i]) || err > EPS ) {
is_valid = false;
fprintf(stderr, “Error[%d] = %f, EPS = %f!\n”, i, err, EPS);
break;
}
}

return is_valid;
}

void writeStatsAndResult( const bool& valid, const REAL* data,
const int & outer, const int & num_X,
const int & num_Y, const int & num_T,
const bool& is_gpu,const int & P,
const unsigned long int& elapsed
) {
// print stats to stdout
fprintf(stdout, “// OUTER=%d, NUM_X=%d, NUM_Y=%d, NUM_T=%d.\n”,
outer, num_X, num_Y, num_T );

if(valid) { fprintf(stdout, “1\t\t// VALID Result,\n”); }
else { fprintf(stdout, “0\t\t// INVALID Result,\n”); }

fprintf(stdout, “%ld\t\t// Runtime in microseconds,\n”, elapsed);
if(is_gpu) fprintf(stdout, “%d\t\t// GPU Threads,\n\n”, P);
else fprintf(stdout, “%d\t\t// CPU Threads,\n\n”, P);

// write the result
write_1Darr( data, static_cast(outer),
“PMPH Project Result” );
}

#if 0
void writeResult( const REAL* res, const unsigned int N, const char* msg ) {
//ofstream fout;
//fout.open (“output.data”);
cout << "\n[ "; for( int k=0; k
#include
#include
#include
#include

struct array_reader {
char* elems;
int64_t n_elems_space;
int64_t elem_size;
int64_t n_elems_used;
int64_t *shape;
int (*elem_reader)(void*);
};

int peekc() {
int c = getchar();
ungetc(c,stdin);
return c;
}

void skipspaces() {
int c = getchar();
if (isspace(c)) {
skipspaces();
} else if (c == ‘/’) {
// Skip to end of line.
for (; c != ‘\n’ && c != EOF; c = getchar())
;
skipspaces(); // Next line may have more spaces.
} else if (c != EOF) {
ungetc(c, stdin);
}
}

int read_elem(struct array_reader *reader) {
int ret;
if (reader->n_elems_used == reader->n_elems_space) {
reader->n_elems_space *= 2;
reader->elems = static_cast (
realloc(reader->elems,
reader->n_elems_space * reader->elem_size) );
}

ret = reader->elem_reader(reader->elems + reader->n_elems_used * reader->elem_size);

if (ret == 0) {
reader->n_elems_used++;
}

return ret;
}

int read_array_elems(struct array_reader *reader, int dims) {
int c;
int ret;
int first = 1;
char *knows_dimsize = static_cast ( calloc(dims,sizeof(char)) );
int cur_dim = dims-1;
int64_t *elems_read_in_dim = static_cast( calloc(dims,sizeof(int64_t)) );
while (1) {
skipspaces();

c = getchar();
if (c == ‘]’) {
if (knows_dimsize[cur_dim]) {
if (reader->shape[cur_dim] != elems_read_in_dim[cur_dim]) {
ret = 1;
break;
}
} else {
knows_dimsize[cur_dim] = 1;
reader->shape[cur_dim] = elems_read_in_dim[cur_dim];
}
if (cur_dim == 0) {
ret = 0;
break;
} else {
cur_dim–;
elems_read_in_dim[cur_dim]++;
}
} else if (c == ‘,’) {
skipspaces();
c = getchar();
if (c == ‘[‘) {
if (cur_dim == dims – 1) {
ret = 1;
break;
}
first = 1;
cur_dim++;
elems_read_in_dim[cur_dim] = 0;
} else if (cur_dim == dims – 1) {
ungetc(c, stdin);
ret = read_elem(reader);
if (ret != 0) {
break;
}
elems_read_in_dim[cur_dim]++;
} else {
ret = 1;
break;
}
} else if (c == EOF) {
ret = 1;
break;
} else if (first) {
if (c == ‘[‘) {
if (cur_dim == dims – 1) {
ret = 1;
break;
}
cur_dim++;
elems_read_in_dim[cur_dim] = 0;
} else {
ungetc(c, stdin);
ret = read_elem(reader);
if (ret != 0) {
break;
}
elems_read_in_dim[cur_dim]++;
first = 0;
}
} else {
ret = 1;
break;
}
}

free(knows_dimsize);
free(elems_read_in_dim);
return ret;
}
////////////////////
/// Entry Points ///
////////////////////
int read_array(int64_t elem_size, int (*elem_reader)(void*),
void **data, int64_t *shape, int64_t dims) {
int ret;
struct array_reader reader;
int64_t read_dims = 0;
while (1) {
int c;
skipspaces();
c = getchar();
if (c=='[‘) {
read_dims++;
} else {
if (c != EOF) {
ungetc(c, stdin);
}
break;
}
}

if (read_dims != dims) {
// printf(“ERROR DIMS: %ld %ld \n\n”, read_dims, dims);
return 1;
}

reader.shape = shape;
reader.n_elems_used = 0;
reader.elem_size = elem_size;
reader.n_elems_space = 16;
reader.elems = static_cast( calloc(elem_size, reader.n_elems_space) );
reader.elem_reader = elem_reader;

ret = read_array_elems(&reader, dims);

*data = reader.elems;

return ret;
}

int read_int(void* dest) {
skipspaces();
if (scanf(“%d”, (int*)dest) == 1) {
return 0;
} else {
return 1;
}
}

int read_char(void* dest) {
skipspaces();
if (scanf(“%c”, (char*)dest) == 1) {
return 0;
} else {
return 1;
}
}

int read_float(void* dest) {
skipspaces();
if (scanf(“%f”, (float*)dest) == 1) {
return 0;
} else {
return 1;
}
}

int read_double(void* dest) {
skipspaces();
if (scanf(“%lf”, (double*)dest) == 1) {
return 0;
} else {
return 1;
}
}

///////////////////////
/// Writing Dataset ///
///////////////////////
void write_scal( const int * i, const char* msg ) {
fprintf(stdout, “%d “, *i);
if( msg ) fprintf(stdout, “\t// %s\n”, msg);
}
void write_scal( const double* r, const char* msg ) {
fprintf(stdout, “%lf “, *r);
if( msg ) fprintf(stdout, “\t// %s\n”, msg);
}
void write_scal( const float * r, const char* msg ) {
fprintf(stdout, “%f “, *r);
if( msg ) fprintf(stdout, “\t// %s\n”, msg);
}

template
void write_1Darr( const T* ptr, const int& N, const char* msg ) {
fprintf(stdout, “\n [ “);
for( int i = 0; i < N-1; i ++ ) { write_scal(&ptr[i], NULL); fprintf(stdout, ", "); } write_scal(&ptr[N-1], NULL); if (msg) fprintf(stdout, " ]\t//%s\n\n", msg); else fprintf(stdout, " ]"); } template
void write_2Darr( const T* ptr, const int& Nouter, const int& Ninner, const char* msg ) {
fprintf(stdout, “\n[ “);
for( int i = 0; i < Nouter-1; i ++ ) { write_1Darr( ptr + i*Ninner, Ninner, NULL ); fprintf(stdout, ","); } write_1Darr( ptr + (Nouter-1)*Ninner, Ninner, NULL ); if (msg) fprintf(stdout, "\n]\t//%s\n\n", msg); else fprintf(stdout, "\n]\n"); } #endif // DATASET_PARSER #ifndef OPENMP_UTILITIES #define OPENMP_UTILITIES //#include
#include
#include

int get_CPU_num_threads() {
int procs;

#pragma omp parallel shared(procs)
{
int th_id = omp_get_thread_num();
if(th_id == 0) { procs = omp_get_num_threads(); }
}

bool valid_procs = (procs > 0) && (procs <= 1024); assert(valid_procs && "Number of threads NOT in {1, ..., 1024}"); return procs; } #endif //OPENMP_UTILITIES #include
#include
#include
#include

#include
#include

int timeval_subtract(struct timeval *result, struct timeval *t2, struct timeval *t1)
{
unsigned int resolution=1000000;
long int diff = (t2->tv_usec + resolution * t2->tv_sec) – (t1->tv_usec + resolution * t1->tv_sec);
result->tv_sec = diff / resolution;
result->tv_usec = diff % resolution;
return (diff<0); } #include "CudaUtilProj.cu.h" __global__ void trivial_map(int* inp_d, MyInt4* inp_lift, int inp_size) { const unsigned int gid = blockIdx.x*blockDim.x + threadIdx.x; if(gid < inp_size) { int el = inp_d[gid]; MyInt4 res(el,el,el,el); if(el < 0) { res.x = 0; res.y = 0; res.z = 0; } inp_lift[gid] = res; } } int MsspProblem(int block_size, int inp_size) { int mem_size = inp_size*sizeof(MyInt4); int *inp_h = (int*)malloc(inp_size*sizeof(int)); int *inp_d; cudaMalloc((void**)&inp_d , inp_size*sizeof(int)); MyInt4 *inp_lift; cudaMalloc((void**)&inp_lift, mem_size); MyInt4 *res_d; cudaMalloc((void**)&res_d, mem_size); for(int i = 0; i < inp_size; i+=9) { inp_h[i ] = -15; inp_h[i+1 ] = 2; inp_h[i+2 ] = -1; inp_h[i+3 ] = 3; inp_h[i+4 ] = -2; inp_h[i+5 ] = 4; inp_h[i+6 ] = 3; inp_h[i+7 ] = 1; inp_h[i+8 ] = -4; } inp_h[(inp_size/9)*9 - 18 + 7] = 2; int num_blocks = ( (inp_size % block_size) == 0) ? inp_size / block_size : inp_size / block_size + 1 ; unsigned long int elapsed; struct timeval t_start, t_end, t_diff; gettimeofday(&t_start, NULL); cudaMemcpy(inp_d, inp_h, inp_size*sizeof(int), cudaMemcpyHostToDevice); { // KERNELS // 1. apply map, i.e., lift each element x to // (max(x,0),max(x,0), max(x,0), x) trivial_map<<< num_blocks, block_size >>>(inp_d, inp_lift, inp_size);
cudaThreadSynchronize();

// 2. apply scan with the given operator, i.e.,
// write the apply operator in class MsspOP in
// ScanKernels.cu.h and call scanInc from ScanHost.cu.h
scanInc< MsspOp,MyInt4 > ( block_size, inp_size, inp_lift, res_d );
cudaThreadSynchronize();
}
MyInt4 res_h(0,0,0,0);
// 3. copy back only the last element of the res_d array (of size sizeof(MyInt4))
cudaMemcpy(&res_h, res_d+inp_size-1, sizeof(MyInt4), cudaMemcpyDeviceToHost);

gettimeofday(&t_end, NULL);
timeval_subtract(&t_diff, &t_end, &t_start);
elapsed = (t_diff.tv_sec*1e6+t_diff.tv_usec);
printf(“MSSP version runs in: %lu microsecs\n”, elapsed);

printf(“RESULT is: %d %d %d %d\n”, res_h.x, res_h.y, res_h.z, res_h.w);

if(res_h.x == 11) {
printf(“MSSP VALID EXECUTION!\n”);
} else {
printf(“MSSP INVALID EXECUTION!\n”);
}

free(inp_h);
cudaFree(inp_d);
cudaFree(inp_lift);
cudaFree(res_d);

return 1;
}

int scanIncTest() {
const unsigned int num_threads = 8353455;
const unsigned int block_size = 512;
unsigned int mem_size = num_threads * sizeof(int);

int* h_in = (int*) malloc(mem_size);
int* h_out = (int*) malloc(mem_size);
int* flags_h = (int*) malloc(num_threads*sizeof(int));

int sgm_size = 123;
{ // init segments and flags
for(unsigned int i=0; i,int > ( block_size, num_threads, d_in, flags_d, d_out );

// copy host memory to device
cudaMemcpy(h_out, d_out, mem_size, cudaMemcpyDeviceToHost);

// cleanup memory
cudaFree(d_in );
cudaFree(d_out);
cudaFree(flags_d);
}

gettimeofday(&t_end, NULL);
timeval_subtract(&t_diff, &t_end, &t_start);
elapsed = (t_diff.tv_sec*1e6+t_diff.tv_usec);
printf(“Scan Inclusive on GPU runs in: %lu microsecs\n”, elapsed);

{ // validation
bool success = true;
int accum = 0;
for(int i=0; i
bool validateTranspose(float* A,float* trA, unsigned int rowsA, unsigned int colsA){
bool valid = true;
for(unsigned int i = 0; i < rowsA; i++) { for(unsigned int j = 0; j < colsA; j++) { if(trA[j*rowsA + i] != A[i*colsA + j]) { printf("row: %d, col: %d, A: %.4f, trA: %.4f\n", i, j, A[i*colsA + j], trA[j*rowsA + i] ); valid = false; break; } } if(!valid) break; } if (valid) printf("GPU TRANSPOSITION VALID!\n"); else printf("GPU TRANSPOSITION INVALID!\n"); return valid; } void testTranspose() { // set seed for rand() srand(2006); const unsigned int HEIGHT_A = 1024*8; const unsigned int WIDTH_A = 1024*8; // 1. allocate host memory for the two matrices size_t size_A = WIDTH_A * HEIGHT_A; size_t mem_size_A = sizeof(float) * size_A; float* h_A = (float*) malloc(mem_size_A); float* h_B = (float*) malloc(mem_size_A); // 2. initialize host memory randomInit(h_A, size_A); // 3. allocate device memory float* d_A; float* d_B; cudaMalloc((void**) &d_A, mem_size_A); cudaMalloc((void**) &d_B, mem_size_A); // 4. copy host memory to device cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); { unsigned long int elapsed; struct timeval t_start, t_end, t_diff; gettimeofday(&t_start, NULL); //transpose( d_A, d_B, HEIGHT_A, WIDTH_A );
transpose( d_A, d_B, HEIGHT_A, WIDTH_A );

gettimeofday(&t_end, NULL);
timeval_subtract(&t_diff, &t_end, &t_start);
elapsed = (t_diff.tv_sec*1e6+t_diff.tv_usec);
printf(“Transpose on GPU runs in: %lu microsecs\n”, elapsed);

// copy result from device to host
cudaMemcpy(h_B, d_B, mem_size_A, cudaMemcpyDeviceToHost);

// 12. validate
//validateTranspose( h_A, h_B, HEIGHT_A, WIDTH_A );
validateTranspose( h_A, h_B, HEIGHT_A, WIDTH_A );
}

// clean up memory
free(h_A);
free(h_B);
cudaFree(d_A);
cudaFree(d_B);
}

int main(int argc, char** argv) {
const unsigned int mssp_list_size = 8353455;
const unsigned int block_size = 256;

scanIncTest();
MsspProblem(block_size, mssp_list_size);
testTranspose();
}

#ifndef CONSTANTS
#define CONSTANTS

#if (WITH_FLOATS==0)
typedef double REAL;
#else
typedef float REAL;
#endif

#endif // CONSTANTS

#include
#include
#include
#include
#include
#include
#include “TridagKernel.cu.h”
#include “TridagPar.h”

int timeval_subtract(struct timeval *result, struct timeval *t2, struct timeval *t1)
{
unsigned int resolution=1000000;
long int diff = (t2->tv_usec + resolution * t2->tv_sec) – (t1->tv_usec + resolution * t1->tv_sec);
result->tv_sec = diff / resolution;
result->tv_usec = diff % resolution;
return (diff<0); } /** * solves a segmented tridag, i.e., * solves `n/sgm_size` independent tridag problems. * Logically, the arrays should be of size [n/sgm_size][sgm_size], * and the segmented tridag corresponds to a map on the outer * dimension which is applying tridag to its inner dimension. * This is the CUDA parallel implementation, which uses * block-level segmented scans. This version assumes that * `n` is a multiple of `sgm_sz` and also that `block_size` is * a multiple of `sgm_size`, i.e., such that segments do not * cross block boundaries. */ void tridagCUDAWrapper( const unsigned int block_size, REAL* a, REAL* b, REAL* c, REAL* r, const unsigned int n, const unsigned int sgm_sz, REAL* u, REAL* uu ) { unsigned int num_blocks; unsigned int sh_mem_size = block_size * 32; // assumes sgm_sz divides block_size if((block_size % sgm_sz)!=0) { printf("Invalid segment or block size. Exiting!\n\n!"); exit(0); } if((n % sgm_sz)!=0) { printf("Invalid total size (not a multiple of segment size). Exiting!\n\n!"); exit(0); } num_blocks = (n + (block_size - 1)) / block_size; TRIDAG_SOLVER<<< num_blocks, block_size, sh_mem_size >>>(a, b, c, r, n, sgm_sz, u, uu);
cudaThreadSynchronize();
}

/**
* solves a segmented tridag, i.e.,
* solves `n/sgm_size` independent tridag problems.
* Logically, the arrays should be of size [n/sgm_size][sgm_size],
* and the segmented tridag corresponds to a map on the outer
* dimension which is applying tridag to its inner dimension.
* This is the CPU sequential implementation, but morally the
* code is re-written to use (sequential) scans.
*/
void
goldenSeqTridagPar(
const REAL* a, // size [n]
const REAL* b, // size [n]
const REAL* c, // size [n]
const REAL* r, // size [n]
const int n,
const int sgm_size,
REAL* u, // size [n]
REAL* uu // size [n] temporary
) {
if((n % sgm_size)!=0) {
printf(“Invalid total size (not a multiple of segment size). Exiting!\n\n!”);
exit(0);
}

for(int i=0; i EPS ) {
printf(“INVALID Result at index %d, %f %f diff: %f. Exiting!\n\n”, i, cpu[i], gpu[i], diff);
exit(0);
}
}
}

int main(int argc, char** argv) {
const unsigned int mem_size = N * sizeof(REAL);

// allocate arrays on CPU:
REAL* a = (REAL*) malloc(mem_size);
REAL* b = (REAL*) malloc(mem_size);
REAL* c = (REAL*) malloc(mem_size);
REAL* r = (REAL*) malloc(mem_size);
REAL* gpu_u = (REAL*) malloc(mem_size);
REAL* cpu_u = (REAL*) malloc(mem_size);
REAL* gpu_uu = (REAL*) malloc(mem_size);
REAL* cpu_uu = (REAL*) malloc(mem_size);

// init a, b, c, y
init(BLOCK_SIZE, N, a, b, c, r);

// allocate gpu arrays
REAL *d_a, *d_b, *d_c, *d_r, *d_uu, *d_u;
cudaMalloc((void**)&d_a, mem_size);
cudaMalloc((void**)&d_b, mem_size);
cudaMalloc((void**)&d_c, mem_size);
cudaMalloc((void**)&d_r, mem_size);
cudaMalloc((void**)&d_uu, mem_size);
cudaMalloc((void**)&d_u, mem_size);

// Host-To-Device Copy
cudaMemcpy(d_a, a, mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_c, c, mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_r, r, mem_size, cudaMemcpyHostToDevice);

// execute on CPU
goldenSeqTridagPar(a,b,c,r, N,SGM_SIZE, cpu_u,cpu_uu);

// execute on GPU
tridagCUDAWrapper(BLOCK_SIZE, d_a,d_b,d_c,d_r, N,SGM_SIZE, d_u,d_uu);

// transfer back to CPU
cudaMemcpy(gpu_u, d_u, mem_size, cudaMemcpyDeviceToHost);
cudaMemcpy(gpu_uu, d_uu, mem_size, cudaMemcpyDeviceToHost);

// free gpu memory
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); cudaFree(d_r);
cudaFree(d_u); cudaFree(d_uu);

// validate
validate(N, cpu_u, gpu_u);

printf(“It Amazingly Validates!!!\n\n”);

// deallocate cpu arrays
free(a); free(b); free(c); free(r);
free(gpu_uu); free(cpu_uu);
free(gpu_u); free(cpu_u);

return 0;
}

CXX = nvcc

SOURCES_CPP =TestCudaTridag.cu
HELPERS =TridagPar.h TridagKernel.cu.h
EXECUTABLE =test_tridag

default: compile

.cu.o: $(SOURCES_CPP) $(HELPERS)
$(CXX) -c $@ $< compile: $(EXECUTABLE) $(EXECUTABLE): $(CXX) -o $(EXECUTABLE) $(SOURCES_CPP) run: $(EXECUTABLE) ./$(EXECUTABLE) clean: rm -f $(EXECUTABLE) template
void inplaceScanInc(const int n, typename OP::BaseType* inpres) {
typename OP::BaseType acc = OP::identity();//inpres[0];
for(int i=0; i scanres(n); // supposed to also be in shared memory and to reuse the space of mats
//————————————————–
// Recurrence 1: b[i] = b[i] – a[i]*c[i-1]/b[i-1] —
// solved by scan with 2×2 matrix mult operator —
//————————————————–
MyReal4* mats = (MyReal4*)malloc(n*sizeof(MyReal4)); // supposed to be in shared memory!
REAL b0 = b[0];
for(int i=0; i(n,mats);
for(int i=0; i uu
//—————————————————-
// Recurrence 2: y[i] = y[i] – (a[i]/b[i-1])*y[i-1] —
// solved by scan with linear func comp operator —
//—————————————————-
MyReal2* lfuns = (MyReal2*)malloc(n*sizeof(MyReal2));
REAL y0 = r[0];
for(int i=0; i(n,lfuns);
for(int i=0; i u
#if 1
//—————————————————-
// Recurrence 3: backward recurrence solved via —
// scan with linear func comp operator —
//—————————————————-
REAL yn = u[n-1]/uu[n-1];
for(int i=0; i(n,lfuns);
for(int i=0; i

typedef float REAL;

class MyReal2 {
public:
REAL x; REAL y;

__device__ __host__ inline MyReal2() {
x = 0.0; y = 0.0;
}
__device__ __host__ inline MyReal2(const REAL& a, const REAL& b) {
x = a; y = b;
}
__device__ __host__ inline MyReal2(const MyReal2& i4) {
x = i4.x; y = i4.y;
}
volatile __device__ __host__ inline MyReal2& operator=(const MyReal2& i4) volatile {
x = i4.x; y = i4.y;
return *this;
}
__device__ __host__ inline MyReal2& operator=(const MyReal2& i4) {
x = i4.x; y = i4.y;
return *this;
}
};

class MyReal4 {
public:
REAL x; REAL y; REAL z; REAL w;

__device__ __host__ inline MyReal4() {
x = 0.0; y = 0.0; z = 0.0; w = 0.0;
}
__device__ __host__ inline MyReal4(const REAL& a, const REAL& b, const REAL& c, const REAL& d) {
x = a; y = b; z = c; w = d;
}
__device__ __host__ inline MyReal4(const MyReal4& i4) {
x = i4.x; y = i4.y; z = i4.z; w = i4.w;
}
volatile __device__ __host__ inline MyReal4& operator=(const MyReal4& i4) volatile {
x = i4.x; y = i4.y; z = i4.z; w = i4.w;
return *this;
}
__device__ __host__ inline MyReal4& operator=(const MyReal4& i4) {
x = i4.x; y = i4.y; z = i4.z; w = i4.w;
return *this;
}
};

class LinFunComp {
public:
typedef MyReal2 BaseType;

static __device__ __host__ inline
MyReal2 apply(volatile MyReal2& a, volatile MyReal2& b) {
return MyReal2( b.x + b.y*a.x, a.y*b.y );
}

static __device__ __host__ inline
MyReal2 identity() {
return MyReal2(0.0, 1.0);
}
};

class MatMult2b2 {
public:
typedef MyReal4 BaseType;

static __device__ __host__ inline
MyReal4 apply(volatile MyReal4& a, volatile MyReal4& b) {
REAL val = 1.0/(a.x*b.x);
return MyReal4( (b.x*a.x + b.y*a.z)*val,
(b.x*a.y + b.y*a.w)*val,
(b.z*a.x + b.w*a.z)*val,
(b.z*a.y + b.w*a.w)*val );
}

static __device__ __host__ inline
MyReal4 identity() {
return MyReal4(1.0, 0.0, 0.0, 1.0);
}
};

/***************************************/
/*** Scan Inclusive Helpers & Kernel ***/
/***************************************/
template
__device__ inline
T scanIncWarp( volatile T* ptr, const unsigned int idx ) {
const unsigned int lane = idx & 31;

return const_cast(ptr[idx]);
}

template
__device__ inline
T scanIncBlock(volatile T* ptr, const unsigned int idx) {
const unsigned int lane = idx & 31;
const unsigned int warpid = idx >> 5;

T val = scanIncWarp(ptr,idx);
__syncthreads();

//
if (warpid == 0) scanIncWarp(ptr, idx);
__syncthreads();

if (warpid > 0) {
val = OP::apply(ptr[warpid-1], val);
}

return val;
}

/*************************************************/
/*************************************************/
/*** Segmented Inclusive Scan Helpers & Kernel ***/
/*************************************************/
/*************************************************/
template
__device__ inline
T sgmScanIncWarp(volatile T* ptr, volatile F* flg, const unsigned int idx) {
const unsigned int lane = idx & 31;

return const_cast(ptr[idx]);
}

// 2a: the last value is the correct partial result
T warp_total = const_cast(ptr[warplst]);

__syncthreads();

//
if (warpid == 0) sgmScanIncWarp(ptr, flg, idx);
__syncthreads();

if (warpid > 0 && will_accum) {
val = OP::apply(ptr[warpid-1], val);
}
return val;
}

/*********************/
/*** Tridag Kernel ***/
/*********************/
// Try to optimize it: for example,
// (The allocated shared memory is enough for 8 floats / thread):
// 1. the shared memory space for “mat_sh” can be reused for “lin_sh”
// 2. with 1., now you have space to hold “u” and “uu” in shared memory.
// 3. you may hold “a[gid]” in a register, since it is accessed twice, etc.
__global__ void
TRIDAG_SOLVER( REAL* a,
REAL* b,
REAL* c,
REAL* r,
const unsigned int n,
const unsigned int sgm_sz,
REAL* u,
REAL* uu
) {
const unsigned int tid = threadIdx.x;
const unsigned int gid = blockIdx.x*blockDim.x + tid;

// total shared memory (declared outside)
extern __shared__ char sh_mem[];
// shared memory space for the 2×2 matrix multiplication SCAN
volatile MyReal4* mat_sh = (volatile MyReal4*)sh_mem;
// shared memory space for the linear-function composition SCAN
volatile MyReal2* lin_sh = (volatile MyReal2*) (mat_sh + blockDim.x);
// shared memory space for the flag array
volatile int* flg_sh = (volatile int* ) (lin_sh + blockDim.x);

// make the flag array
flg_sh[tid] = (tid % sgm_sz == 0) ? 1 : 0;
__syncthreads();

//————————————————–
// Recurrence 1: b[i] = b[i] – a[i]*c[i-1]/b[i-1] —
// solved by scan with 2×2 matrix mult operator —
//————————————————–
// 1.a) first map
const unsigned int beg_seg_ind = (gid / sgm_sz) * sgm_sz;
REAL b0 = (gid < n) ? b[beg_seg_ind] : 1.0; mat_sh[tid] = (gid!=beg_seg_ind && gid < n) ? MyReal4(b[gid], -a[gid]*c[gid-1], 1.0, 0.0) : MyReal4(1.0, 0.0, 0.0, 1.0) ; // 1.b) inplaceScanInc(n,mats);
__syncthreads();
MyReal4 res4 = sgmScanIncBlock (mat_sh, flg_sh, tid);
// 1.c) second map
if(gid < n) { uu[gid] = (res4.x*b0 + res4.y) / (res4.z*b0 + res4.w) ; } __syncthreads(); // make the flag array flg_sh[tid] = (tid % sgm_sz == 0) ? 1 : 0; __syncthreads(); //---------------------------------------------------- // Recurrence 2: y[i] = y[i] - (a[i]/b[i-1])*y[i-1] -- // solved by scan with linear func comp operator -- //---------------------------------------------------- // 2.a) first map REAL y0 = (gid < n) ? r[beg_seg_ind] : 1.0; lin_sh[tid] = (gid!=beg_seg_ind && gid < n) ? MyReal2(r[gid], -a[gid]/uu[gid-1]) : MyReal2(0.0, 1.0 ) ; // 2.b) inplaceScanInc(n,lfuns);
__syncthreads();
MyReal2 res2 = sgmScanIncBlock (lin_sh, flg_sh, tid);
// 2.c) second map
if(gid < n) { u[gid] = res2.x + y0*res2.y; } __syncthreads(); // make the flag array flg_sh[tid] = (tid % sgm_sz == 0) ? 1 : 0; __syncthreads(); #if 1 //---------------------------------------------------- // Recurrence 3: backward recurrence solved via -- // scan with linear func comp operator -- //---------------------------------------------------- // 3.a) first map const unsigned int end_seg_ind = (beg_seg_ind + sgm_sz) - 1; const unsigned int k = (end_seg_ind - gid) + beg_seg_ind ; REAL yn = u[end_seg_ind] / uu[end_seg_ind]; lin_sh[tid] = (gid!=beg_seg_ind && gid < n) ? MyReal2( u[k]/uu[k], -c[k]/uu[k] ) : MyReal2( 0.0, 1.0 ) ; // 3.b) inplaceScanInc(n,lfuns);
__syncthreads();
MyReal2 res3 = sgmScanIncBlock (lin_sh, flg_sh, tid);
__syncthreads();
// 3.c) second map
if(gid < n) { u[k] = res3.x + yn*res3.y; } #endif } #endif //SCAN_KERS code/Data/Small/output.data code/Data/Small/input.data code/Data/Medium/output.data code/Data/Medium/input.data code/Data/Large/output.data code/Data/Large/input.data code/README.txt code/OrigImpl/ProjCoreOrig.cpp code/OrigImpl/Makefile code/OrigImpl/ProjHelperFun.cpp code/OrigImpl/TridagPar.h code/OrigImpl/ProjHelperFun.h code/OrigImpl/ProjectMain.cpp code/include/CudaUtilProj.cu.h code/include/ParseInput.h code/include/ParserC.h code/include/OpenmpUtil.h code/include/TestCudaUtil.cu code/include/Constants.h code/ParTridagCuda/TestCudaTridag.cu code/ParTridagCuda/Makefile code/ParTridagCuda/TridagPar.h code/ParTridagCuda/TridagKernel.cu.h

Related Posts