CALCULATIONS WITH CUDA USING THE GPU

In the following example, the sum Y[i] = X[i] + Y[i] is computed for 0 ≤ i < N, where X and Y are two arrays of floating-point numbers. The sequential and parallel versions of the problem are shown.

Sequential version

It uses the function add, which is run 10 times on the CPU. The code Sequential.cpp is as follows:

#include <bits/stdc++.h>
using namespace std;

void add(int N, float *X, float *Y){
    for (int i = 0; i < N; i++){ 
        Y[i] = X[i] + Y[i];
    }
}

int main(void){
    int N = 1<<27;         //1.34217728 *10^8 elements. 512 MB
    
    //Allocate Memory (512*2=1GB).
    float *X = new float[N];
    float *Y = new float[N];
    
    //Initialize x and y arrays on the host
    for (int i = 0; i < N; i++){
        X[i] = 1.0f;
        Y[i] = 2.0f;
    }
    
    double avg = 0;
    clock_t t;
    
    //Runs add 10 times on CPU
    for(int i = 0; i < 10; i++){
        t = clock();       //start time
        add(N, X, Y);
        t = clock() - t;   //total time = end time-start time
        printf ("CPU RUN-%d time = %f ms.\n", i, (((float)t)/CLOCKS_PER_SEC) * 1000);
        //time is measured in clock ticks; converted to milliseconds
        avg += (((float)t)/CLOCKS_PER_SEC) * 1000;
    }
    
    avg = avg / 10;        //average of the 10 runs
    printf ("CPU Avg time = %lf ms.\n", avg);

    delete [] X;
    delete [] Y;
    return 0;
}
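
For reference, neither version checks the result. With X[i] = 1.0f and Y[i] = 2.0f, each of the 10 calls to add increases Y[i] by 1.0f, so every element should end up as 12.0f. A minimal sanity check (only a sketch, intended to be placed just before the delete [] calls in Sequential.cpp) could be:

    //Optional sanity check: after 10 calls to add, each Y[i] should be 2 + 10*1 = 12.
    float maxError = 0.0f;
    for (int i = 0; i < N; i++){
        maxError = fmaxf(maxError, fabsf(Y[i] - 12.0f));
    }
    printf("Max error = %f\n", maxError);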

The submission script Sequential.sl is as follows:

#!/bin/bash
#SBATCH --job-name=secuencial
#SBATCH --output=slurm.out
#SBATCH --error=slurm.err
#SBATCH --partition=public
#SBATCH --ntasks=1

cd $SLURM_SUBMIT_DIR
ml load GCCcore

g++ Sequential.cpp -o Sequential
./Sequential | tee Sequential.out

Parallel version

The parallel code Parallel.cu is as follows:

#include <bits/stdc++.h>
using namespace std;

double data[1024+1];
// save the timing data to a CSV file
void csv(){
    char filename[11] = "data.csv";
    FILE *fp;
    fp = fopen(filename, "w+");
    fprintf(fp,"Number of Threads, Average Time");

    for(int i = 0; i <= 1024; i += 32){
        fprintf(fp,"\n%d",max(i,1));
        fprintf(fp,",%lf ",data[i]);
    }

    fclose(fp);

    printf("\n%sfile created", filename);
}


// Kernel function to add the elements of two arrays
__global__
void add(int N, float *X, float *Y){
    int t = threadIdx.x;
    int T = blockDim.x;
    for (int i = t; i < N; i += T){
        Y[i] = X[i] + Y[i];
    }
}

int main(void){
    int N = 1<<27;//1.34217728 *10^8 elements. 512 MB
    float *X, *Y;

    //Allocate memory that both the GPU and the CPU can access (512*2 = 1 GB).
    cudaMallocManaged(&X, N*sizeof(float));
    cudaMallocManaged(&Y, N*sizeof(float));

    //Initialize x and y arrays on the host
    for (int i = 0; i < N; i++){
        X[i] = 1.0f;
        Y[i] = 2.0f;
    }
    clock_t t;

    //Run add with different numbers of threads and save the average time in a table.
    //It is good practice to keep the number of threads a multiple of 32.
    for(int i = 0; i <= 1024; i += 32){
        int T = max(i, 1);              //we need at least 1 thread
        double avg = 0;
        for(int j = 0; j <= 10; j++){   //11 runs; the first is a warm-up
            t = clock();
            add<<<dim3(1,1,1), dim3(T,1,1)>>>(N, X, Y);
            cudaDeviceSynchronize();    //Wait for the GPU to finish executing the kernel.
            t = clock() - t;
            printf("T = %d, Run = %d Time = %lf\n", T, j, (((double)t)/CLOCKS_PER_SEC)*1000);
            if(j) avg += ((((double)t)/CLOCKS_PER_SEC) * 1000); //skip the first run
        }
        avg = avg / 10;                 //average of the 10 timed runs
        data[i] = avg;
        printf ("It took GPU %lf ms with %d threads.\n", avg, T);
    }

    csv();  //save data in output file

    //Free memory
    cudaFree(X);
    cudaFree(Y); 
    return 0;
}
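
The kernel above is launched with a single block, dim3(1,1,1), so at most 1024 threads share the whole array. The more common CUDA pattern spreads the work over many blocks with a grid-stride loop. The following is only a sketch of that variant (it is not part of the original example; the block and thread counts are illustrative):

#include <bits/stdc++.h>
using namespace std;

// Grid-stride version of add: each thread handles elements
// i, i + stride, i + 2*stride, ... where stride is the total number of threads in the grid.
__global__
void add_grid(int N, float *X, float *Y){
    int i      = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;
    for (; i < N; i += stride){
        Y[i] = X[i] + Y[i];
    }
}

int main(void){
    int N = 1<<27;
    float *X, *Y;
    cudaMallocManaged(&X, N*sizeof(float));
    cudaMallocManaged(&Y, N*sizeof(float));
    for (int i = 0; i < N; i++){
        X[i] = 1.0f;
        Y[i] = 2.0f;
    }

    int threads = 256;                          //illustrative block size (multiple of 32)
    int blocks  = (N + threads - 1) / threads;  //enough blocks to cover all N elements
    add_grid<<<blocks, threads>>>(N, X, Y);
    cudaDeviceSynchronize();                    //wait for the GPU to finish

    printf("Y[0] = %f (expected 3.0)\n", Y[0]);

    cudaFree(X);
    cudaFree(Y);
    return 0;
}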

The submission script Parallel.sl is as follows:

#!/bin/bash
#SBATCH --job-name=parallelCU
#SBATCH --output=slurm.out
#SBATCH --error=slurm.err
#SBATCH --partition=gpu
#SBATCH --gres=gpu:m10:2     # GPUs per node (only 4 GPUs available)
#SBATCH --ntasks-per-gpu=1   # 1 task per GPU (2 tasks in total)
#SBATCH --cpus-per-task=1    # CPU cores per task (only 4 CPU cores in total)

module load CUDA/10.2

cd $SLURM_SUBMIT_DIR

export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK

nvcc Parallel.cu -o Parallel
srun ./Parallel | tee Parallel.out
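
As a final note, the CUDA calls in Parallel.cu do not check their return status. A minimal error-checking sketch (the CUDA_CHECK macro below is a local helper, not part of the CUDA API) could look like this:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

//Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess){                                     \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",              \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(1);                                                  \
        }                                                             \
    } while (0)

//Usage inside main():
//  CUDA_CHECK(cudaMallocManaged(&X, N*sizeof(float)));
//  add<<<dim3(1,1,1), dim3(T,1,1)>>>(N, X, Y);
//  CUDA_CHECK(cudaGetLastError());        //detects kernel launch errors
//  CUDA_CHECK(cudaDeviceSynchronize());   //detects errors during kernel execution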