====== CUDA CALCULATIONS USING THE GPU ======

The following example computes the sum Y[i] = X[i] + Y[i] for 0 ≤ i < N, where X and Y are two arrays of floating-point numbers. Both the sequential and the parallel version of the problem are shown.

**Sequential version**

It uses the function //add//, which is executed 10 times on the CPU. The code //Sequential.cpp// is as follows:

<code cpp>
#include <cstdio>
#include <ctime>

using namespace std;

void add(int N, float *X, float *Y){
    for (int i = 0; i < N; i++){
        Y[i] = X[i] + Y[i];
    }
}

int main(void){
    int N = 1<<27; // 1.34217728 * 10^8 elements, 512 MB per array

    // Allocate memory (512 MB * 2 = 1 GB)
    float *X = new float[N];
    float *Y = new float[N];

    // Initialize the X and Y arrays on the host
    for (int i = 0; i < N; i++){
        X[i] = 1.0f;
        Y[i] = 2.0f;
    }

    double avg = 0;
    clock_t t;

    // Run add 10 times on the CPU
    for (int i = 0; i < 10; i++){
        t = clock();                 // start time
        add(N, X, Y);
        t = clock() - t;             // total time = end time - start time
        // time is measured in clock ticks and converted to milliseconds
        printf("CPU RUN-%d time = %f ms.\n", i, (((float)t)/CLOCKS_PER_SEC) * 1000);
        avg += (((float)t)/CLOCKS_PER_SEC) * 1000;
    }
    avg = avg / 10; // average of the 10 runs
    printf("CPU Avg time = %lf ms.\n", avg);

    delete [] X;
    delete [] Y;
    return 0;
}
</code>

The submission script //Sequential.sl// is as follows:

<code bash>
#!/bin/bash
#SBATCH --job-name=secuencial
#SBATCH --output=slurm.out
#SBATCH --error=slurm.err
#SBATCH --partition=public
#SBATCH --ntasks=1

cd $SLURM_SUBMIT_DIR
ml load GCCcore

g++ Sequential.cpp -o Sequential
./Sequential | tee Sequential.out
</code>
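Neither version checks the numerical result. Below is a minimal correctness-check sketch, not part of the original example; the helper name //max_error// is hypothetical. It assumes it is called right after a **single** call to //add(N, X, Y)//, in which case every Y[i] should equal 1.0f + 2.0f = 3.0f (after the 10 timed runs above, each Y[i] instead holds 2.0f + 10 × 1.0f = 12.0f).

<code cpp>
#include <cstdio>
#include <cmath>

// Hypothetical helper (not in the original code): returns the largest
// absolute deviation of Y[i] from the expected value.
float max_error(int N, const float *Y, float expected){
    float maxError = 0.0f;
    for (int i = 0; i < N; i++){
        maxError = fmaxf(maxError, fabsf(Y[i] - expected));
    }
    return maxError;
}

// Usage sketch, assuming it runs immediately after one call to add(N, X, Y):
//   printf("Max error = %f\n", max_error(N, Y, 3.0f)); // expected 0.000000
</code>

The same check can be reused after the kernel in the parallel version below, since memory allocated with cudaMallocManaged() is readable from the host once cudaDeviceSynchronize() has returned.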
**Parallel version**

The parallel code //Parallel.cu// is as follows:

<code cpp>
#include <cstdio>
#include <ctime>
#include <algorithm>

using namespace std;

double data[1024+1]; // table of average times, indexed by thread count

// Save the collected data to a CSV file
void csv(){
    char filename[11] = "data.csv";
    FILE *fp;
    fp = fopen(filename, "w+");
    fprintf(fp, "Number of Threads, Average Time (ms)");
    for (int i = 0; i <= 1024; i += 32){
        fprintf(fp, "\n%d", max(i, 1));
        fprintf(fp, ",%lf ", data[i]);
    }
    fclose(fp);
    printf("\n%s file created", filename);
}

// Kernel function to add the elements of two arrays
__global__ void add(int N, float *X, float *Y){
    int t = threadIdx.x;
    int T = blockDim.x;
    for (int i = t; i < N; i += T){
        Y[i] = X[i] + Y[i];
    }
}

int main(void){
    int N = 1<<27; // 1.34217728 * 10^8 elements, 512 MB per array
    float *X, *Y;

    // Allocate managed memory so that both GPU and CPU can access it (512 MB * 2 = 1 GB)
    cudaMallocManaged(&X, N*sizeof(float));
    cudaMallocManaged(&Y, N*sizeof(float));

    // Initialize the X and Y arrays on the host
    for (int i = 0; i < N; i++){
        X[i] = 1.0f;
        Y[i] = 2.0f;
    }

    clock_t t;

    // For each thread count, run add 11 times and save the average of the last
    // 10 runs in the table. It is good practice to keep the number of threads
    // a multiple of 32.
    for (int i = 0; i <= 1024; i += 32){
        int T = max(i, 1); // we need at least 1 thread
        double avg = 0;
        for (int j = 0; j <= 10; j++){
            t = clock();
            add<<<1, T>>>(N, X, Y);
            cudaDeviceSynchronize(); // wait for the GPU to finish executing the kernel
            t = clock() - t;
            printf("T = %d, Run = %d Time = %lf\n", T, j, (((double)t)/CLOCKS_PER_SEC)*1000);
            if (j) avg += (((double)t)/CLOCKS_PER_SEC) * 1000; // skip the first (warm-up) run
        }
        avg = avg / 10;
        data[i] = avg;
        printf("It took the GPU %lf ms with %d threads.\n", avg, T);
    }

    csv(); // save the data to the output file

    // Free memory
    cudaFree(X);
    cudaFree(Y);
    return 0;
}
</code>

The submission script //Parallel.sl// is as follows:

<code bash>
#!/bin/bash
#SBATCH --job-name=parallelCU
#SBATCH --output=slurm.out
#SBATCH --error=slurm.err
#SBATCH --partition=gpu
#SBATCH --gres=gpu:m10:2        # GPUs/node (only 4 GPUs in total)
#SBATCH --ntasks-per-gpu=1      # 2 MPI processes in total
#SBATCH --cpus-per-task=1       # CPU cores/MPI process (only 4 CPU cores in total)

module load CUDA/10.2
cd $SLURM_SUBMIT_DIR
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK

nvcc Parallel.cu -o Parallel
srun ./Parallel | tee Parallel.out
</code>
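//Parallel.cu// does not check the values returned by the CUDA runtime calls. Below is a minimal error-checking sketch, not part of the original code; the helper name //checkCuda// is hypothetical. Kernel launch errors are reported by cudaGetLastError(), while errors that occur during kernel execution are returned by cudaDeviceSynchronize().

<code cpp>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical helper (not in the original code): abort with a readable
// message if a CUDA runtime call did not return cudaSuccess.
void checkCuda(cudaError_t err, const char *msg){
    if (err != cudaSuccess){
        fprintf(stderr, "%s: %s\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Usage sketch inside main() of Parallel.cu:
//   checkCuda(cudaMallocManaged(&X, N*sizeof(float)), "cudaMallocManaged X");
//   add<<<1, T>>>(N, X, Y);
//   checkCuda(cudaGetLastError(), "kernel launch");         // launch-time errors
//   checkCuda(cudaDeviceSynchronize(), "kernel execution"); // execution errors
</code>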