experiments (#1)
All checks were successful
MetagraphOptimization_CI / docs (push) Successful in 10m41s
MetagraphOptimization_CI / test (push) Successful in 30m40s

Co-authored-by: Anton Reinhard <anton.reinhard@proton.me>
Reviewed-on: #1
This commit is contained in:
2024-05-08 12:03:27 +02:00
parent 82ed774b7e
commit 87dbaf2c32
155 changed files with 5372 additions and 1029 deletions

View File

@ -0,0 +1,63 @@
Bootstrap: docker
From: nvidia/cuda:12.3.1-devel-ubuntu20.04

%labels
    Requires CUDA driver 470.57+.

%environment
    export LANG=C

%runscript
    nvidia-smi
    ./run.sh

%post
    . /.singularity.d/env/10-docker*.sh

    # File that receives the runtime environment exports written below.
    # Prefer the location advertised by the build tool and fall back to the
    # legacy /environment path this recipe used before.
    ENV_FILE="${SINGULARITY_ENVIRONMENT:-/environment}"

    # Never let a package's configuration prompt block the unattended build.
    export DEBIAN_FRONTEND=noninteractive

    apt-get update
    apt-get install -y pciutils
    apt-get -y -o Dpkg::Options::='--force-confdef' -o Dpkg::Options::='--force-confold' install build-essential cuda-compat-12-3 libibverbs-dev ibverbs-utils gcc wget git libcap2-bin
    apt-get -y autoremove
    apt-get -y clean
    # The package index is only needed while installing; drop it to keep the image small.
    rm -rf /var/lib/apt/lists/*

    # install slurm (only the PMI2 contrib library is built and installed)
    cd /tmp
    : "${SLURM_VERSION:=17-02-11-1}"
    wget "https://github.com/SchedMD/slurm/archive/slurm-${SLURM_VERSION}.tar.gz"
    tar -xf "slurm-${SLURM_VERSION}.tar.gz"
    cd "slurm-slurm-${SLURM_VERSION}"
    ./configure --prefix=/usr/ --sysconfdir=/etc/slurm --localstatedir=/var --disable-debug
    make -C contribs/pmi2 -j"$(nproc)" install
    cd ..
    rm -rf slurm-*

    # install julia
    cd ~
    wget https://julialang-s3.julialang.org/bin/linux/x64/1.9/julia-1.9.4-linux-x86_64.tar.gz
    tar -xzf julia-1.9.4-linux-x86_64.tar.gz
    # The archive is no longer needed once unpacked; do not ship it in the image.
    rm -f julia-1.9.4-linux-x86_64.tar.gz
    mv julia-1.9.4/ /opt/julia-1.9.4
    ln -s /opt/julia-1.9.4/bin/julia /usr/local/bin/julia

    # Add nvidia driver paths to the environment variables.
    # printf is used for the headers because `echo "\n"` only emits a newline
    # in some shells; in others it would write a literal backslash-n into a
    # file that is later sourced.
    printf '\n #Nvidia driver paths \n\n' >> "$ENV_FILE"
    echo 'export PATH="/nvbin:$PATH"' >> "$ENV_FILE"
    echo 'export LD_LIBRARY_PATH="/nvlib:$LD_LIBRARY_PATH"' >> "$ENV_FILE"

    # Add CUDA paths
    printf '\n #Cuda paths \n\n' >> "$ENV_FILE"
    echo 'export CPATH="/usr/local/cuda/include:$CPATH"' >> "$ENV_FILE"
    echo 'export PATH="/usr/local/cuda/bin:$PATH"' >> "$ENV_FILE"
    echo 'export LD_LIBRARY_PATH="/usr/local/cuda/lib64:$LD_LIBRARY_PATH"' >> "$ENV_FILE"
    echo 'export CUDA_HOME="/usr/local/cuda"' >> "$ENV_FILE"

    # install likwid
    LIKWID_VERSION=5.3.0
    wget "http://ftp.fau.de/pub/likwid/likwid-${LIKWID_VERSION}.tar.gz"
    tar -xaf "likwid-${LIKWID_VERSION}.tar.gz"
    cd "likwid-${LIKWID_VERSION}"
    # accessdaemon doesn't work because of permissions
    sed -i 's/ACCESSMODE = accessdaemon/ACCESSMODE = perf_event/g' config.mk
    make -j4
    make -j4 install
    # Remove the build tree and archive after installation.
    cd ..
    rm -rf "likwid-${LIKWID_VERSION}" "likwid-${LIKWID_VERSION}.tar.gz"
    echo 'export LD_LIBRARY_PATH="/usr/local/lib:$LD_LIBRARY_PATH"' >> "$ENV_FILE"

View File

@ -0,0 +1,22 @@
#!/bin/bash
# Slurm array job for the QED benchmark on the `intel` partition.
# Array task N (1-32) runs experiments/run_qed_exec.sh inside the Singularity
# container and passes N as its argument (the number of threads).
#SBATCH --array=1-32
#SBATCH --job-name=qed_bench
#SBATCH --partition=intel
#SBATCH --time=16:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=16GB
#SBATCH --output=simulation-%A-%a.out
#SBATCH --error=simulation-%A-%a.err

# Abort instead of running git/singularity from the wrong directory.
cd "$HOME/repos/metagraph_optimization" || exit 1

module load singularity
module load git

mkdir -p results

# Every array task records the same file. Build it in a per-task temporary
# file and rename it into place so concurrently running tasks cannot
# interleave or truncate each other's output.
git_info_tmp="results/git.txt.${SLURM_ARRAY_TASK_ID}.tmp"
{
    printf "Current git commit hash: "
    git rev-parse HEAD
    git status
} > "$git_info_tmp"
mv -f "$git_info_tmp" results/git.txt

singularity exec experiments/CUDA_container.sif ./experiments/run_qed_exec.sh "$SLURM_ARRAY_TASK_ID"

View File

@ -0,0 +1,24 @@
#!/bin/bash
# Slurm job for the QED benchmark on one GPU of the `casus_a100` partition.
# Runs experiments/run_qed_exec.sh with 32 as its argument inside the
# Singularity container, with NVIDIA support enabled (--nv).
#SBATCH --job-name=qed_bench
#SBATCH --partition=casus_a100
#SBATCH --account=casus
#SBATCH --time=8:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:1
#SBATCH --mem=256GB
# This is not an array job, so the array-index pattern %a has no meaningful
# value here; name the logs after the job id instead.
#SBATCH --output=simulation-%j.out
#SBATCH --error=simulation-%j.err

# Abort instead of running git/singularity from the wrong directory.
cd "$HOME/repos/metagraph_optimization" || exit 1

module load singularity
module load git
module load cuda/12.1

mkdir -p results

# Record which revision of the repository produced the results.
{
    printf "Current git commit hash: "
    git rev-parse HEAD
    git status
} > results/git.txt

singularity exec --nv experiments/CUDA_container.sif ./experiments/run_qed_exec.sh 32

View File

@ -0,0 +1,24 @@
#!/bin/bash
# Slurm job for the full-node benchmark on the `casus_a100` partition
# (128 CPUs, 4 GPUs). Runs experiments/full_node.sh inside the Singularity
# container, with NVIDIA support enabled (--nv).
#SBATCH --job-name=qed_bench
#SBATCH --partition=casus_a100
#SBATCH --account=casus
#SBATCH --time=8:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=128
#SBATCH --gres=gpu:4
#SBATCH --mem=2048GB
# This is not an array job, so the array-index pattern %a has no meaningful
# value here; name the logs after the job id instead.
#SBATCH --output=simulation-%j.out
#SBATCH --error=simulation-%j.err

# Abort instead of running git/singularity from the wrong directory.
cd "$HOME/repos/metagraph_optimization" || exit 1

module load singularity
module load git
module load cuda/12.1

mkdir -p results

# Record which revision of the repository produced the results. The file
# uses the same `_full_node` suffix as the outputs of full_node.sh, so it no
# longer overwrites the git record of the GPU reduction benchmark.
{
    printf "Current git commit hash: "
    git rev-parse HEAD
    git status
} > results/git_full_node.txt

singularity exec --nv experiments/CUDA_container.sif ./experiments/full_node.sh

View File

@ -0,0 +1,22 @@
#!/bin/bash
# Slurm array job for the diagram generation benchmark on the `intel`
# partition. Array task N (1-8) runs experiments/run_gen_diagram.sh inside the
# Singularity container and passes N as its argument (the number of threads).
#SBATCH --array=1-8
#SBATCH --job-name=qed_diag_gen
#SBATCH --partition=intel
#SBATCH --time=4:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64GB
#SBATCH --output=simulation-%A-%a.out
#SBATCH --error=simulation-%A-%a.err

# Abort instead of running git/singularity from the wrong directory.
cd "$HOME/repos/metagraph_optimization" || exit 1

module load singularity
module load git

mkdir -p results

# Every array task records the same file. Build it in a per-task temporary
# file and rename it into place so concurrently running tasks cannot
# interleave or truncate each other's output.
git_info_tmp="results/git.txt.${SLURM_ARRAY_TASK_ID}.tmp"
{
    printf "Current git commit hash: "
    git rev-parse HEAD
    git status
} > "$git_info_tmp"
mv -f "$git_info_tmp" results/git.txt

singularity exec experiments/CUDA_container.sif ./experiments/run_gen_diagram.sh "$SLURM_ARRAY_TASK_ID"

View File

@ -0,0 +1,21 @@
#!/bin/bash
# Slurm job for the reduction benchmark on the `intel` partition.
# Runs experiments/run_reduce_bench.sh inside the Singularity container.
#SBATCH --job-name=qed_bench
#SBATCH --partition=intel
#SBATCH --time=48:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=24GB
# This is not an array job, so the array-index pattern %a has no meaningful
# value here; name the logs after the job id instead.
#SBATCH --output=simulation-%j.out
#SBATCH --error=simulation-%j.err

# Abort instead of running git/singularity from the wrong directory.
cd "$HOME/repos/metagraph_optimization" || exit 1

module load singularity
module load git

mkdir -p results

# Record which revision of the repository produced the results.
{
    printf "Current git commit hash: "
    git rev-parse HEAD
    git status
} > results/git_reduce_bench.txt

singularity exec experiments/CUDA_container.sif ./experiments/run_reduce_bench.sh

View File

@ -0,0 +1,24 @@
#!/bin/bash
# Slurm job for the GPU reduction benchmark on one GPU of the `casus_a100`
# partition. Runs experiments/run_reduce_bench_gpu.sh inside the Singularity
# container, with NVIDIA support enabled (--nv).
#SBATCH --job-name=qed_bench
#SBATCH --partition=casus_a100
#SBATCH --account=casus
#SBATCH --time=16:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:1
#SBATCH --mem=256GB
# This is not an array job, so the array-index pattern %a has no meaningful
# value here; name the logs after the job id instead.
#SBATCH --output=simulation-%j.out
#SBATCH --error=simulation-%j.err

# Abort instead of running git/singularity from the wrong directory.
cd "$HOME/repos/metagraph_optimization" || exit 1

module load singularity
module load git
module load cuda/12.1

mkdir -p results

# Record which revision of the repository produced the results.
{
    printf "Current git commit hash: "
    git rev-parse HEAD
    git status
} > results/git_reduce_bench_gpu.txt

singularity exec --nv experiments/CUDA_container.sif ./experiments/run_reduce_bench_gpu.sh

25
experiments/full_node.sh Executable file
View File

@ -0,0 +1,25 @@
#!/bin/bash
# Full-node benchmark driver.
#
# Writes system information to results/*_full_node.txt, then runs
# examples/full_node_bench.jl with 128 Julia threads. Julia's output is
# appended to julia_full_node.log in the repository root.
#
# Exit status: 0 on success, 1 if the repository root cannot be entered or the
# benchmark fails.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
LOG_FILE="$SCRIPT_DIR/../julia_full_node.log"

# Quoted so a checkout path containing spaces works; abort rather than
# scattering output files in an unrelated directory.
cd "$SCRIPT_DIR/.." || exit 1
mkdir -p results

echo "Writing system info..."
# collect some information of the used node and system
uname -a > results/system_full_node.txt
julia --version > results/julia_full_node.txt
lscpu > results/cpu_full_node.txt
nvidia-smi > results/cuda_gpu_full_node.txt
lsblk > results/storage_full_node.txt
lspci > results/pci_full_node.txt

#echo "Initiating julia..."
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
#julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"

echo "Benchmarking Full Node 128 Threads + *GPUs*"
# Report the failure as before, but also propagate it so the calling job does
# not finish with a success status after a failed benchmark.
julia --project -O3 --threads=128 examples/full_node_bench.jl >> "$LOG_FILE" 2>&1 || {
    echo "-- Something went wrong, check logs --"
    exit 1
}

27
experiments/run_gen_diagram.sh Executable file
View File

@ -0,0 +1,27 @@
#!/bin/bash
# Diagram generation benchmark driver.
#
# Usage: run_gen_diagram.sh <threads>
#   threads  value passed to julia's --threads option for the benchmark run
#
# Writes system information to results/*_<threads>.txt, sets up the Julia
# packages, then runs examples/qed_gen_bench.jl. Julia's output is appended to
# julia_<threads>.log in the repository root. The <threads> suffix follows the
# naming used by run_qed_exec.sh and keeps concurrently running invocations
# (one per Slurm array task) from writing to the same files.
#
# Exit status: 0 on success, 1 on a missing argument, a failed setup step or a
# failed benchmark.

# first arg = number of threads
i=$1
if [ -z "$i" ]; then
    echo "Usage: $0 <number of threads>" >&2
    exit 1
fi

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
LOG_FILE="$SCRIPT_DIR/../julia_$i.log"

# Quoted so a checkout path containing spaces works; abort rather than
# scattering output files in an unrelated directory.
cd "$SCRIPT_DIR/.." || exit 1
mkdir -p results

echo "Writing system info..."
# collect some information of the used node and system
uname -a > "results/system_$i.txt"
julia --version > "results/julia_$i.txt"
lscpu > "results/cpu_$i.txt"
lsblk > "results/storage_$i.txt"
lspci > "results/pci_$i.txt"

echo "Initiating julia..."
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> "$LOG_FILE" 2>&1 || exit 1 # need current dev version of QEDprocesses
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> "$LOG_FILE" 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("BenchmarkTools"); Pkg.add("StatsBase")' >> "$LOG_FILE" 2>&1 || exit 1 # add requirements for the bench script

echo "Benchmarking with $i threads..."
# Report the failure as before, but also propagate it so the calling job does
# not finish with a success status after a failed benchmark.
julia --project -O3 --threads="$i" examples/qed_gen_bench.jl >> "$LOG_FILE" 2>&1 || {
    echo "-- Something went wrong, check logs --"
    exit 1
}

31
experiments/run_qed_exec.sh Executable file
View File

@ -0,0 +1,31 @@
#!/bin/bash
# QED execution benchmark driver.
#
# Usage: run_qed_exec.sh <threads>
#   threads  value passed to julia's --threads option for both benchmark runs
#
# Writes system information to results/*_<threads>.txt, then runs
# examples/qed_bench.jl followed by examples/qed_bench_tape.jl. Julia's output
# is appended to julia_<threads>.log in the repository root.
#
# Exit status: 0 if both benchmarks succeed, 1 on a missing argument or if
# either benchmark fails. The second benchmark still runs when the first fails.

# first arg = number of threads
i=$1
if [ -z "$i" ]; then
    echo "Usage: $0 <number of threads>" >&2
    exit 1
fi

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
LOG_FILE="$SCRIPT_DIR/../julia_$i.log"

# Quoted so a checkout path containing spaces works; abort rather than
# scattering output files in an unrelated directory.
cd "$SCRIPT_DIR/.." || exit 1
mkdir -p results

echo "Writing system info..."
# collect some information of the used node and system
uname -a > "results/system_$i.txt"
julia --version > "results/julia_$i.txt"
lscpu > "results/cpu_$i.txt"
nvidia-smi > "results/cuda_gpu_$i.txt"
lsblk > "results/storage_$i.txt"
lspci > "results/pci_$i.txt"

echo "Initiating julia..."
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
#julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"

# Remember whether any benchmark failed. Both are always attempted, and the
# script reports the combined result through its exit status.
status=0

echo "Benchmarking $i Threads"
julia --project -O3 --threads="$i" examples/qed_bench.jl >> "$LOG_FILE" 2>&1 || {
    echo "-- Something went wrong, check logs --"
    status=1
}

echo "Benchmarking Tape variant $i Threads"
julia --project -O3 --threads="$i" examples/qed_bench_tape.jl >> "$LOG_FILE" 2>&1 || {
    echo "-- Something went wrong, check logs --"
    status=1
}

exit "$status"

24
experiments/run_reduce_bench.sh Executable file
View File

@ -0,0 +1,24 @@
#!/bin/bash
# Reduction benchmark driver (CPU).
#
# Writes system information to results/*_bench_reduce.txt, then runs
# examples/qed_bench_reduction_steps.jl with 32 Julia threads. Julia's output
# is appended to julia_bench_reduce.log in the repository root.
#
# Exit status: 0 on success, 1 if the repository root cannot be entered or the
# benchmark fails.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
LOG_FILE="$SCRIPT_DIR/../julia_bench_reduce.log"

# Quoted so a checkout path containing spaces works; abort rather than
# scattering output files in an unrelated directory.
cd "$SCRIPT_DIR/.." || exit 1
mkdir -p results

echo "Writing system info..."
# collect some information of the used node and system
uname -a > results/system_bench_reduce.txt
julia --version > results/julia_bench_reduce.txt
lscpu > results/cpu_bench_reduce.txt
nvidia-smi > results/cuda_gpu_bench_reduce.txt
lsblk > results/storage_bench_reduce.txt
lspci > results/pci_bench_reduce.txt

#echo "Initiating julia..."
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script

echo "Benchmarking Reduction 32 Threads"
# Report the failure as before, but also propagate it so the calling job does
# not finish with a success status after a failed benchmark.
julia --project -O3 --threads=32 examples/qed_bench_reduction_steps.jl >> "$LOG_FILE" 2>&1 || {
    echo "-- Something went wrong, check logs --"
    exit 1
}

View File

@ -0,0 +1,25 @@
#!/bin/bash
# Reduction benchmark driver (GPU).
#
# Writes system information to results/*_bench_reduce_gpu.txt, then runs
# examples/qed_bench_reduction_steps_gpu.jl with 32 Julia threads. Julia's
# output is appended to julia_bench_reduce_gpu.log in the repository root.
#
# Exit status: 0 on success, 1 if the repository root cannot be entered or the
# benchmark fails.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
LOG_FILE="$SCRIPT_DIR/../julia_bench_reduce_gpu.log"

# Quoted so a checkout path containing spaces works; abort rather than
# scattering output files in an unrelated directory.
cd "$SCRIPT_DIR/.." || exit 1
mkdir -p results

echo "Writing system info..."
# collect some information of the used node and system
uname -a > results/system_bench_reduce_gpu.txt
julia --version > results/julia_bench_reduce_gpu.txt
lscpu > results/cpu_bench_reduce_gpu.txt
nvidia-smi > results/cuda_gpu_bench_reduce_gpu.txt
lsblk > results/storage_bench_reduce_gpu.txt
lspci > results/pci_bench_reduce_gpu.txt

#echo "Initiating julia..."
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
#julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"

echo "Benchmarking Reduction 32 Threads, *GPU*"
# Report the failure as before, but also propagate it so the calling job does
# not finish with a success status after a failed benchmark.
julia --project -O3 --threads=32 examples/qed_bench_reduction_steps_gpu.jl >> "$LOG_FILE" 2>&1 || {
    echo "-- Something went wrong, check logs --"
    exit 1
}