====== CUDA COMPUTATIONS USING THE GPU ======
The following example computes the sum Y[i] = X[i] + Y[i] for 0 ≤ i < N, where X and Y are two arrays of floating-point numbers. Both the sequential and the parallel version of the problem are shown.
**Sequential version**
It uses the function //add//, which is executed 10 times on the CPU. The code //Sequential.cpp// is the following:
#include <cstdio> //printf
#include <ctime>  //clock, clock_t, CLOCKS_PER_SEC
using namespace std;

void add(int N, float *X, float *Y){
    for (int i = 0; i < N; i++){
        Y[i] = X[i] + Y[i];
    }
}

int main(void){
    int N = 1<<27; //1.34217728 * 10^8 elements, 512 MB per array
    //Allocate memory (512*2 = 1 GB)
    float *X = new float[N];
    float *Y = new float[N];
    //Initialize the X and Y arrays on the host
    for (int i = 0; i < N; i++){
        X[i] = 1.0f;
        Y[i] = 2.0f;
    }
    double avg = 0;
    clock_t t;
    //Run add 10 times on the CPU
    for(int i = 0; i < 10; i++){
        t = clock(); //start time
        add(N, X, Y);
        t = clock() - t; //total time = end time - start time
        //time is measured in clock ticks and converted to milliseconds
        printf("CPU RUN-%d time = %f ms.\n", i, (((float)t)/CLOCKS_PER_SEC) * 1000);
        avg += (((float)t)/CLOCKS_PER_SEC) * 1000;
    }
    avg = avg / 10; //average of the 10 runs
    printf("CPU Avg time = %lf ms.\n", avg);
    delete [] X;
    delete [] Y;
    return 0;
}
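Since X is initialized to 1.0f and Y to 2.0f, each run adds 1.0f to every element of Y, so after the 10 runs every element should equal 12.0f. A minimal verification sketch that could go right before the //delete// statements (not part of the original //Sequential.cpp//; the variable //maxError// is introduced here, and <cmath> would be needed for //fabs// and //fmax//):

    //Hypothetical check: Y[i] starts at 2.0f and gains 1.0f per run,
    //so after 10 runs every element should equal 12.0f.
    float maxError = 0.0f;
    for (int i = 0; i < N; i++){
        maxError = fmax(maxError, fabs(Y[i] - 12.0f));
    }
    printf("Max error = %f\n", maxError); //expected: 0.000000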
The submission script //Sequencial.sl// is the following:
#!/bin/bash
#SBATCH --job-name=secuencial
#SBATCH --output=slurm.out
#SBATCH --error=slurm.err
#SBATCH --partition=public
#SBATCH --ntasks=1
cd $SLURM_SUBMIT_DIR
ml load GCCcore
g++ Sequential.cpp -o Sequential
./Sequential | tee Sequential.out
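Assuming //Sequential.cpp// and //Sequencial.sl// sit in the same directory, the job is submitted with //sbatch Sequencial.sl//; the program output then appears both in //slurm.out// and, via //tee//, in //Sequential.out//.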
**Parallel version**
The parallel code //Parallel.cu// is the following:
#include <cstdio>    //printf, fopen, fprintf, fclose
#include <ctime>     //clock, clock_t, CLOCKS_PER_SEC
#include <algorithm> //max
using namespace std;

double data[1024+1];

//saves the results table to a file
void csv(){
    char filename[] = "data.csv";
    FILE *fp;
    fp = fopen(filename, "w+");
    fprintf(fp, "Number of Threads, Average Time (ms)");
    for(int i = 0; i <= 1024; i += 32){
        fprintf(fp, "\n%d", max(i, 1));
        fprintf(fp, ",%lf ", data[i]);
    }
    fclose(fp);
    printf("\n%s file created", filename);
}

// Kernel function to add the elements of two arrays
__global__
void add(int N, float *X, float *Y){
    int t = threadIdx.x; //index of this thread within the block
    int T = blockDim.x;  //total number of threads in the block
    for (int i = t; i < N; i += T){
        Y[i] = X[i] + Y[i];
    }
}

int main(void){
    int N = 1<<27; //1.34217728 * 10^8 elements, 512 MB per array
    float *X, *Y;
    //Allocate unified memory accessible from both GPU and CPU (512*2 = 1 GB)
    cudaMallocManaged(&X, N*sizeof(float));
    cudaMallocManaged(&Y, N*sizeof(float));
    //Initialize the X and Y arrays on the host
    for (int i = 0; i < N; i++){
        X[i] = 1.0f;
        Y[i] = 2.0f;
    }
    clock_t t;
    //Run add with different numbers of threads and save the average time in a table.
    //It is good practice to keep the number of threads a multiple of 32 (the warp size).
    for(int i = 0; i <= 1024; i += 32){
        int T = max(i, 1); //at least 1 thread is needed
        double avg = 0;
        for(int j = 0; j <= 10; j++){ //11 runs: the first is a warm-up
            t = clock();
            add<<<1, T>>>(N, X, Y); //launch 1 block of T threads
            cudaDeviceSynchronize(); //wait for the GPU to finish executing the kernel
            t = clock() - t;
            printf("T = %d, Run = %d Time = %lf\n", T, j, (((double)t)/CLOCKS_PER_SEC)*1000);
            if(j) avg += (((double)t)/CLOCKS_PER_SEC) * 1000; //skip the first run
        }
        avg = avg / 10; //average of the 10 timed runs
        data[i] = avg;
        printf("It took the GPU %lf ms with %d threads.\n", avg, T);
    }
    csv(); //save the data to the output file
    //Free memory
    cudaFree(X);
    cudaFree(Y);
    return 0;
}
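The //add// kernel is launched as a single block of //T// threads, so each thread walks the array with stride //T//. A common way to scale past one block, not used in //Parallel.cu//, is a grid-stride loop; below is a minimal sketch (the kernel name //add_grid// is introduced here for illustration):

__global__
void add_grid(int N, float *X, float *Y){
    //Hypothetical multi-block variant: each thread starts at its global
    //index and strides by the total number of threads in the whole grid.
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;
    for (int i = start; i < N; i += stride){
        Y[i] = X[i] + Y[i];
    }
}

A launch such as add_grid<<<numBlocks, T>>>(N, X, Y) with numBlocks = (N + T - 1) / T would then assign roughly one element per thread.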
The submission script //Parallel.sl// is the following:
#!/bin/bash
#SBATCH --job-name=parallelCU
#SBATCH --output=slurm.out
#SBATCH --error=slurm.err
#SBATCH --partition=gpu
#SBATCH --gres=gpu:m10:2 # GPUs per node (only 4 GPUs available)
#SBATCH --ntasks-per-gpu=1 # 2 MPI processes in total
#SBATCH --cpus-per-task=1 # CPU cores per MPI process (only 4 CPU cores in total)
module load CUDA/10.2
cd $SLURM_SUBMIT_DIR
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
nvcc Parallel.cu -o Parallel
srun ./Parallel | tee Parallel.out
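Note that //clock()// measures time on the host side, so the figures above include kernel-launch overhead. An alternative not used in //Parallel.cu// is the CUDA event API, which times the kernel on the device; a minimal sketch for one launch, assuming the same //add// kernel and variables:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
add<<<1, T>>>(N, X, Y);
cudaEventRecord(stop);
cudaEventSynchronize(stop); //block the host until the stop event is reached
float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop); //elapsed time in milliseconds
printf("Kernel time = %f ms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);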