Existen buenos tutoriales y libros que explican cómo programar en Python para usar la GPU.

El código siguiente, add_values.py, suma elemento a elemento dos vectores de cien millones de elementos cada uno:

import numpy as np
from timeit import default_timer as timer
from numba import vectorize
# Vector length used by the benchmark; keep it large. On the original
# author's test machine the CPU path needed 33 seconds and the GPU path
# a little over 3 seconds.
NUM_ELEMENTS = 100_000_000
# This is the CPU version.
def vector_add_cpu(a, b):
  """Add two vectors element by element with a plain Python loop.

  Serves as the CPU baseline for the timing comparison, so the loop is
  kept explicit instead of being replaced by a vectorized expression.

  Args:
    a: First input sequence; its length determines the output length.
    b: Second input sequence; must hold at least len(a) elements.

  Returns:
    A new float32 NumPy array whose i-th entry is a[i] + b[i].

  Raises:
    IndexError: If b is shorter than a.
  """
  # Size the result from the input rather than a module-level constant so
  # the function works for vectors of any length.
  count = len(a)
  c = np.zeros(count, dtype=np.float32)
  for i in range(count):
    c[i] = a[i] + b[i]
  return c
# This is the GPU version. The @vectorize decorator tells numba to turn the
# scalar function below into a vectorized function for the 'cuda' target,
# using the float32(float32, float32) signature given in the decorator.
@vectorize(["float32(float32, float32)"], target='cuda')
def vector_add_gpu(a, b):
  # Scalar kernel: the sum of the two operands.
  return a + b
def main():
  """Time the CPU and GPU vector additions and print both durations.

  Builds two float32 vectors of ones with NUM_ELEMENTS entries each, runs
  vector_add_cpu and then vector_add_gpu on them, and reports the
  wall-clock time of each call.

  Returns:
    0 after both timings have been printed.
  """
  lhs = np.ones(NUM_ELEMENTS, dtype=np.float32)
  rhs = np.ones(NUM_ELEMENTS, dtype=np.float32)

  def seconds_taken(add):
    # Wall-clock duration of a single add(lhs, rhs) call; the result of
    # the addition itself is discarded.
    began = timer()
    add(lhs, rhs)
    return timer() - began

  # CPU first, then GPU.
  vector_add_cpu_time = seconds_taken(vector_add_cpu)
  vector_add_gpu_time = seconds_taken(vector_add_gpu)

  # Report times
  print("CPU function took %f seconds." % vector_add_cpu_time)
  print("GPU function took %f seconds." % vector_add_gpu_time)
  return 0
if __name__ == "__main__":
  # Run the benchmark only when the file is executed as a script.
  main()

En este caso se usa numba, que sirve para compilar código de Python y ejecutarlo en la GPU con CUDA.

Para su ejecución en el HPC UO se puede usar el script add_values.sl:

#!/bin/bash
#SBATCH --partition=gpu
#SBATCH --job-name=addValue
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --gres=gpu:m10:1
#SBATCH -o slurm.%N.%j.out # STDOUT
#SBATCH -e slurm.%N.%j.err # STDERR

# Change to the directory the job was submitted from
cd "$SLURM_SUBMIT_DIR" || exit 1

module load CUDA/10.2
module load Python/3.7.0-foss-2018b

# Run the program
python add_values.py