CS计算机代考程序代写 algorithm b’MatrixTranspose.tar.gz’

b’MatrixTranspose.tar.gz’

#!/bin/bash

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
# sampleACI-B.pbs
#
#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
# Overview
#
# The PBS directives can be used in either the submission script in the
# form listed here (#PBS -X Y) or along with the submission (qsub -X Y)
# with no change.
#
# This job will be submitted on ACI-B using the command:
# qsub benchmark_template.pbs
#
# You can check on the job using the command:
# qstat -u <username>
#
# Note that here #PBS is a directive; a # followed by anything else is
# a comment.
#
#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
# The PBS directives

#-#-# The Allocation being submitted against
#PBS -A open
# You can submit against your allocation if more time is available. See the onboarding document for information regarding how many processors you are able to request at a time for the size of your allocation. This is similar to the -q queuename on the legacy machines. If no allocation is listed, your job will be placed on the open queue until your open limits have been reached. In order to ensure your job goes to the open queue, please use the -A open directive.

#-#-# Name of the job
#PBS -N matrix_transpose_benchmark
# This is the name given to the job. It is used for the name of the output and error files and is visible when you use qstat to check the status of your job.

#-#-# Amount of wall time
#PBS -l walltime=00:02:00
# The time required is in HH:MM:SS format. The wall time is the amount of actual time the job runs and isn't related to computational time (the actual time multiplied by the number of cores being used).

#-#-# Number of processors, their nodal spread and the type of node
#PBS -l nodes=1:ppn=20
# This is the number of processors we ask for (here: 20 processors on a single node). Putting them all on one node can boost the performance of a job, as all communication stays on that node, while allowing the scheduler to spread them among various nodes may shorten the queue wait time. Note that if you are on one node only, you can use the PBS directive -l ncpus=X.

#-#-# Memory Request
#PBS -l pmem=1gb
# We ask for 1 GB of RAM per task (pmem). Also available are mem (total memory), vmem (virtual memory) and pvmem (virtual memory per task). The mem option should only be used on single node jobs.

#-#-# Combine the stdout and stderr into one file
#PBS -j oe

# Export DAPL_DBG_TYPE as the plain string 0. Straight ASCII quotes are
# required: typographic quotes are not shell syntax and would become part
# of the value.
export DAPL_DBG_TYPE="0"

#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
# Prepare for, compile and run the job

#-#-# Echo
# We output the job start time and the processor names to the output file.
# This can be helpful for debugging if something goes wrong.
echo "#-#-#Job started on $(hostname) at $(date) "
echo This job runs on the following processors:
# The command substitution is left unquoted on purpose so that word
# splitting joins the entries of the node file onto a single line.
echo $(cat "$PBS_NODEFILE")

#-#-# Modules
module purge
module load gcc/7.3.1
# We load the modules required. Note that loading the same modules as were used when the code was compiled is required for proper execution. We include the purge, but would comment it out if the -v directive is used with qsub to pass along the environment variables. Please note that there is a default module that would be loaded for gcc, but it is better to use the actual module rather than the default in case the default changes.

# Set the stack size limit for this shell and the processes it starts to 10240 (bash's `ulimit -s` value is in units of 1024 bytes).
ulimit -s 10240

#-#-# Directory
# The directory you are put in to start with is your home directory. You can
# change to the directory directly (cd /storage/home/...) or change to the
# directory you submitted from using the PBS_O_WORKDIR environment variable.
echo "Current directory is $(pwd)"
# Quote the path so directories containing spaces work, and stop the job if
# the directory cannot be entered instead of silently continuing elsewhere.
cd "$PBS_O_WORKDIR" || exit 1
echo "Current directory is $(pwd)"

#-#-# Compile - put the path to your code here
# We compile the code within the submission script here, but this is not
# required. You can compile with a previous job (or on ACI-I) and just run
# code within jobs. Be sure the modules used when compiling and running are
# the same.
# Abort the job if the source directory is missing or the build fails, so a
# stale or absent binary is never benchmarked.
cd ~/cmpsc450/data_locality_demos || exit 1
make || exit 1

#-#-# Echo
# Output the time here for possible debugging purposes.
echo "#-#-#Compilation completed and execution started at $(date)"

#-#-# Run
# Run the job itself; its standard output is appended to mylog.txt in the
# current directory.
~/cmpsc450/data_locality_demos/m_transpose >> mylog.txt
#-#-# Echo
# Output the time here for possible debugging purposes.
echo "#-#-#Job Ended at $(date)"

# Build settings. CC/CFLAGS can be overridden on the command line,
# e.g. `make CC=clang++ CFLAGS="-O2"`.
CC=g++
CFLAGS= -O3 -mavx
# Headers that every object file depends on (none at present).
DEPS =
OBJ = m_transpose.o

# Compile one C++ source file into its object file.
# $@ is the target (.o), $< is the first prerequisite (.cpp).
%.o: %.cpp $(DEPS)
	$(CC) -c -o $@ $< $(CFLAGS)

# Link the benchmark executable. This is the default goal, because a pattern
# rule cannot be one. $^ expands to all prerequisites.
m_transpose: $(OBJ)
	$(CC) -o $@ $^ $(CFLAGS)

# `clean` is a command, not a file: declare it phony so a file named
# "clean" can never make it look up to date.
.PHONY: clean

clean:
	rm -f $(OBJ) m_transpose

// m_transpose.cpp
// g++ -o m_transpose m_transpose.cpp -O3 -mavx
// NOTE(review): the header name on the next line was lost during text
// extraction; <stdlib.h> is assumed -- confirm against the original source.
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>

// Store the current wall-clock time, in seconds, into *wcTime.
// The value is read with gettimeofday(): whole seconds plus the
// microsecond part converted to a fraction of a second.
void get_walltime(double* wcTime) {
     struct timeval now;
     gettimeofday(&now, NULL);

     const double whole_seconds = static_cast<double>(now.tv_sec);
     const double fractional_seconds = now.tv_usec / 1000000.0;

     *wcTime = whole_seconds + fractional_seconds;
}

int main(int argc, char *argv[])
{
    // N and R have been hardcoded here, they work well for demo purposes   
    int i_N = 16384, i_R = 1;
    double *pd_A, *pd_B, *pd_C;
    double d_S,d_E;

    // dynamically allocate memory  
    pd_A = new double[i_N * i_N];
    pd_B = new double[i_N * i_N];

    // populate data… make something up
    for (int i = 0; i < i_N; i++)     {         for (int j = 0; j < i_N; j++)         {             pd_A[i * i_N + j] = i * i_N + j;             pd_B[i * i_N + j] = 0.0;         }     } //#define SPATIAL_BLOCK #define ORIG // use #define to switch between algorithms... this is done at compile time! #ifdef ORIG     printf("ORIG: ");     get_walltime(&d_S);  // get start time     for (int i = 0; i < i_N; i++)     {         for (int j = 0; j < i_N; j++)             pd_B[j * i_N + i] = pd_A[i * i_N + j];     } #endif #ifdef SPATIAL_BLOCK #define M 8               printf("SPATIAL_BLOCK: ");     get_walltime(&d_S);  // get start time     for (int k = 0; k < i_N; k += M)     {         for (int p = 0; p < i_N; p += M)         {             for (int i = k; i < k + M; i++)             {                 for (int j = p; j < p + M; j++)                     pd_B[j * i_N + i] = pd_A[i * i_N + j];             }         }     } #endif     get_walltime(&d_E); // get end time stamp //#define DEBUG  #ifdef DEBUG     // use this to verify results     for (int i = 0; i < 10; i++)     {         printf("%02i: ", i);         for (int j = 0; j < 10; j++)         {             printf("%0.1f, ", pd_B[i * i_N + j]);         }         printf("\n");     } #endif     // display results     printf("R = %i, N = %i, Time = %f\n", i_R, i_N, (d_E - d_S));     // cleanup       delete [] pd_A;     delete [] pd_B;     delete [] pd_C;     return 0; } m_transpose.pbs Makefile m_transpose.cpp