experiments (#1)
Co-authored-by: Anton Reinhard <anton.reinhard@proton.me> Reviewed-on: #1
This commit is contained in:
63
experiments/CUDA_container.def
Normal file
63
experiments/CUDA_container.def
Normal file
@ -0,0 +1,63 @@
|
||||
Bootstrap: docker
From: nvidia/cuda:12.3.1-devel-ubuntu20.04

%labels
    Requires CUDA driver 470.57+.

%environment
    export LANG=C

%runscript
    # Default action of the image: show the visible GPUs, then start run.sh
    # from the current working directory.
    nvidia-smi
    ./run.sh

%post
    # Source the 10-docker* env script(s) so the variables they define are
    # available for the rest of the build.
    . /.singularity.d/env/10-docker*.sh

    # Base tooling. DEBIAN_FRONTEND keeps apt from prompting during the build;
    # the Dpkg options keep existing config files instead of asking.
    apt-get update
    DEBIAN_FRONTEND='noninteractive' apt-get install -y pciutils
    DEBIAN_FRONTEND='noninteractive' apt-get -y -o Dpkg::Options::='--force-confdef' -o Dpkg::Options::='--force-confold' install build-essential cuda-compat-12-3 libibverbs-dev ibverbs-utils gcc wget git libcap2-bin
    apt-get -y autoremove
    apt-get -y clean

    cd /tmp

    # install slurm
    # Only contribs/pmi2 is built and installed; the source tree and archive
    # are removed afterwards. SLURM_VERSION may be overridden from the build
    # environment.
    : "${SLURM_VERSION:=17-02-11-1}"
    wget "https://github.com/SchedMD/slurm/archive/slurm-${SLURM_VERSION}.tar.gz"
    tar -xf "slurm-${SLURM_VERSION}.tar.gz"
    cd "slurm-slurm-${SLURM_VERSION}"
    ./configure --prefix=/usr/ --sysconfdir=/etc/slurm --localstatedir=/var --disable-debug
    make -C contribs/pmi2 -j"$(nproc)" install
    cd ..
    rm -rf slurm-*

    # install julia
    cd ~
    wget https://julialang-s3.julialang.org/bin/linux/x64/1.9/julia-1.9.4-linux-x86_64.tar.gz
    tar -xzf julia-1.9.4-linux-x86_64.tar.gz
    # The archive is not needed once extracted; drop it so it does not end up
    # in the final image.
    rm -f julia-1.9.4-linux-x86_64.tar.gz
    mv julia-1.9.4/ /opt/julia-1.9.4
    #mkdir /usr/local/bin
    ln -s /opt/julia-1.9.4/bin/julia /usr/local/bin/julia

    # NOTE(review): the exports below are appended to /environment. Newer
    # Singularity/Apptainer releases document $SINGULARITY_ENVIRONMENT for
    # this purpose - confirm /environment is still honored by the builder in use.

    # Add nvidia driver paths to the environment variables.
    # printf is used instead of echo so the "\n" escapes do not depend on
    # which shell provides /bin/sh.
    printf '\n #Nvidia driver paths \n\n' >> /environment
    echo 'export PATH="/nvbin:$PATH"' >> /environment
    echo 'export LD_LIBRARY_PATH="/nvlib:$LD_LIBRARY_PATH"' >> /environment

    # Add CUDA paths
    printf '\n #Cuda paths \n\n' >> /environment
    echo 'export CPATH="/usr/local/cuda/include:$CPATH"' >> /environment
    echo 'export PATH="/usr/local/cuda/bin:$PATH"' >> /environment
    echo 'export LD_LIBRARY_PATH="/usr/local/cuda/lib64:$LD_LIBRARY_PATH"' >> /environment
    echo 'export CUDA_HOME="/usr/local/cuda"' >> /environment

    # install likwid
    VERSION=5.3.0
    wget "http://ftp.fau.de/pub/likwid/likwid-$VERSION.tar.gz"
    tar -xaf "likwid-$VERSION.tar.gz"
    cd "likwid-$VERSION"
    # accessdaemon doesn't work because of permissions
    sed -i 's/ACCESSMODE = accessdaemon/ACCESSMODE = perf_event/g' config.mk
    make -j4
    make -j4 install
    # Remove the build tree and archive after installation to keep the image small.
    cd ..
    rm -rf "likwid-$VERSION" "likwid-$VERSION.tar.gz"
    echo 'export LD_LIBRARY_PATH="/usr/local/lib:$LD_LIBRARY_PATH"' >> /environment
|
22
experiments/cluster/diagram_bench_hemera.sh
Executable file
22
experiments/cluster/diagram_bench_hemera.sh
Executable file
@ -0,0 +1,22 @@
|
||||
#!/bin/bash
#SBATCH --array=1-32
#SBATCH --job-name=qed_bench
#SBATCH --partition=intel
#SBATCH --time=16:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=16GB
#SBATCH --output=simulation-%A-%a.out
#SBATCH --error=simulation-%A-%a.err

# Slurm array job (tasks 1-32): each task runs experiments/run_qed_exec.sh
# inside the CUDA container and passes its array task ID as the first
# argument.

# Abort early if the checkout is missing; otherwise everything below would
# run in whatever directory the job happened to start in.
cd "$HOME/repos/metagraph_optimization" || {
    echo "cannot cd to $HOME/repos/metagraph_optimization" >&2
    exit 1
}

module load singularity
module load git

# Record the repository state the benchmark was run from.
# NOTE(review): all array tasks write this same file; consider a per-task
# name if the tasks may start concurrently.
mkdir -p results
{
    printf "Current git commit hash: "
    git rev-parse HEAD
    git status
} > results/git.txt

singularity exec experiments/CUDA_container.sif ./experiments/run_qed_exec.sh "$SLURM_ARRAY_TASK_ID"
|
24
experiments/cluster/diagram_bench_hemera_a100.sh
Executable file
24
experiments/cluster/diagram_bench_hemera_a100.sh
Executable file
@ -0,0 +1,24 @@
|
||||
#!/bin/bash
#SBATCH --job-name=qed_bench
#SBATCH --partition=casus_a100
#SBATCH --account=casus
#SBATCH --time=8:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:1
#SBATCH --mem=256GB
#SBATCH --output=simulation-%A-%a.out
#SBATCH --error=simulation-%A-%a.err

# Slurm job requesting one GPU: runs experiments/run_qed_exec.sh with the
# argument 32 inside the CUDA container, with --nv passed to singularity.
# NOTE(review): this job has no --array, so the %a field in the log file
# names has no array index to expand to; %j may be what is intended -
# confirm against the sbatch filename-pattern documentation.

# Abort early if the checkout is missing; otherwise everything below would
# run in whatever directory the job happened to start in.
cd "$HOME/repos/metagraph_optimization" || {
    echo "cannot cd to $HOME/repos/metagraph_optimization" >&2
    exit 1
}

module load singularity
module load git
module load cuda/12.1

# Record the repository state the benchmark was run from.
mkdir -p results
{
    printf "Current git commit hash: "
    git rev-parse HEAD
    git status
} > results/git.txt

singularity exec --nv experiments/CUDA_container.sif ./experiments/run_qed_exec.sh 32
|
24
experiments/cluster/full_node_hemera.sh
Executable file
24
experiments/cluster/full_node_hemera.sh
Executable file
@ -0,0 +1,24 @@
|
||||
#!/bin/bash
#SBATCH --job-name=qed_bench
#SBATCH --partition=casus_a100
#SBATCH --account=casus
#SBATCH --time=8:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=128
#SBATCH --gres=gpu:4
#SBATCH --mem=2048GB
#SBATCH --output=simulation-%A-%a.out
#SBATCH --error=simulation-%A-%a.err

# Slurm job requesting 128 CPUs and 4 GPUs on one node: runs
# experiments/full_node.sh inside the CUDA container, with --nv passed to
# singularity.
# NOTE(review): this job has no --array, so the %a field in the log file
# names has no array index to expand to; %j may be what is intended -
# confirm against the sbatch filename-pattern documentation.

# Abort early if the checkout is missing; otherwise everything below would
# run in whatever directory the job happened to start in.
cd "$HOME/repos/metagraph_optimization" || {
    echo "cannot cd to $HOME/repos/metagraph_optimization" >&2
    exit 1
}

module load singularity
module load git
module load cuda/12.1

# Record the repository state the benchmark was run from. This uses its own
# file name (matching the *_full_node.txt files written by full_node.sh) so
# it no longer overwrites the provenance file of the GPU reduce benchmark.
mkdir -p results
{
    printf "Current git commit hash: "
    git rev-parse HEAD
    git status
} > results/git_full_node.txt

singularity exec --nv experiments/CUDA_container.sif ./experiments/full_node.sh
|
22
experiments/cluster/gen_diagram_hemera.sh
Executable file
22
experiments/cluster/gen_diagram_hemera.sh
Executable file
@ -0,0 +1,22 @@
|
||||
#!/bin/bash
#SBATCH --array=1-8
#SBATCH --job-name=qed_diag_gen
#SBATCH --partition=intel
#SBATCH --time=4:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64GB
#SBATCH --output=simulation-%A-%a.out
#SBATCH --error=simulation-%A-%a.err

# Slurm array job (tasks 1-8): each task runs experiments/run_gen_diagram.sh
# inside the CUDA container and passes its array task ID as the first
# argument.

# Abort early if the checkout is missing; otherwise everything below would
# run in whatever directory the job happened to start in.
cd "$HOME/repos/metagraph_optimization" || {
    echo "cannot cd to $HOME/repos/metagraph_optimization" >&2
    exit 1
}

module load singularity
module load git

# Record the repository state the benchmark was run from.
# NOTE(review): all array tasks write this same file; consider a per-task
# name if the tasks may start concurrently.
mkdir -p results
{
    printf "Current git commit hash: "
    git rev-parse HEAD
    git status
} > results/git.txt

singularity exec experiments/CUDA_container.sif ./experiments/run_gen_diagram.sh "$SLURM_ARRAY_TASK_ID"
|
21
experiments/cluster/reduce_bench_hemera.sh
Executable file
21
experiments/cluster/reduce_bench_hemera.sh
Executable file
@ -0,0 +1,21 @@
|
||||
#!/bin/bash
#SBATCH --job-name=qed_bench
#SBATCH --partition=intel
#SBATCH --time=48:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --mem=24GB
#SBATCH --output=simulation-%A-%a.out
#SBATCH --error=simulation-%A-%a.err

# Slurm job: runs experiments/run_reduce_bench.sh inside the CUDA container.
# NOTE(review): this job has no --array, so the %a field in the log file
# names has no array index to expand to; %j may be what is intended -
# confirm against the sbatch filename-pattern documentation.

# Abort early if the checkout is missing; otherwise everything below would
# run in whatever directory the job happened to start in.
cd "$HOME/repos/metagraph_optimization" || {
    echo "cannot cd to $HOME/repos/metagraph_optimization" >&2
    exit 1
}

module load singularity
module load git

# Record the repository state the benchmark was run from.
mkdir -p results
{
    printf "Current git commit hash: "
    git rev-parse HEAD
    git status
} > results/git_reduce_bench.txt

singularity exec experiments/CUDA_container.sif ./experiments/run_reduce_bench.sh
|
24
experiments/cluster/reduce_bench_hemera_gpu.sh
Executable file
24
experiments/cluster/reduce_bench_hemera_gpu.sh
Executable file
@ -0,0 +1,24 @@
|
||||
#!/bin/bash
#SBATCH --job-name=qed_bench
#SBATCH --partition=casus_a100
#SBATCH --account=casus
#SBATCH --time=16:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --gres=gpu:1
#SBATCH --mem=256GB
#SBATCH --output=simulation-%A-%a.out
#SBATCH --error=simulation-%A-%a.err

# Slurm job requesting one GPU: runs experiments/run_reduce_bench_gpu.sh
# inside the CUDA container, with --nv passed to singularity.
# NOTE(review): this job has no --array, so the %a field in the log file
# names has no array index to expand to; %j may be what is intended -
# confirm against the sbatch filename-pattern documentation.

# Abort early if the checkout is missing; otherwise everything below would
# run in whatever directory the job happened to start in.
cd "$HOME/repos/metagraph_optimization" || {
    echo "cannot cd to $HOME/repos/metagraph_optimization" >&2
    exit 1
}

module load singularity
module load git
module load cuda/12.1

# Record the repository state the benchmark was run from.
mkdir -p results
{
    printf "Current git commit hash: "
    git rev-parse HEAD
    git status
} > results/git_reduce_bench_gpu.txt

singularity exec --nv experiments/CUDA_container.sif ./experiments/run_reduce_bench_gpu.sh
|
25
experiments/full_node.sh
Executable file
25
experiments/full_node.sh
Executable file
@ -0,0 +1,25 @@
|
||||
#!/bin/bash

# Runs examples/full_node_bench.jl with 128 Julia threads after writing a
# snapshot of the system information into results/*_full_node.txt.
# All Julia output is appended to julia_full_node.log in the parent
# directory of this script's directory.

# Directory containing this script, resolved independently of the caller's cwd.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
LOG_FILE="$SCRIPT_DIR/../julia_full_node.log"

# All relative paths below (results/, examples/) are relative to the parent
# directory, so stop if it cannot be entered.
cd "$SCRIPT_DIR/.." || {
    echo "cannot cd to $SCRIPT_DIR/.." >&2
    exit 1
}

echo "Writing system info..."

# collect some information of the used node and system
mkdir -p results
uname -a > results/system_full_node.txt
julia --version > results/julia_full_node.txt
lscpu > results/cpu_full_node.txt
nvidia-smi > results/cuda_gpu_full_node.txt
lsblk > results/storage_full_node.txt
lspci > results/pci_full_node.txt

# One-time Julia environment setup, currently disabled:
#echo "Initiating julia..."
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> "$LOG_FILE" 2>&1 || exit 1 # need current dev version of QEDprocesses
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> "$LOG_FILE" 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> "$LOG_FILE" 2>&1 || exit 1 # add requirements for the bench script
#julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> "$LOG_FILE" 2>&1 || echo "Failed to set CUDA version number"

echo "Benchmarking Full Node 128 Threads + *GPUs*"
# A failing benchmark only prints a hint; the details are in the log file.
julia --project -O3 --threads=128 examples/full_node_bench.jl >> "$LOG_FILE" 2>&1 || echo "-- Something went wrong, check logs --"
|
27
experiments/run_gen_diagram.sh
Executable file
27
experiments/run_gen_diagram.sh
Executable file
@ -0,0 +1,27 @@
|
||||
#!/bin/bash

# Sets up the Julia environment and runs examples/qed_gen_bench.jl.
#
# Arguments: $1 - number of Julia threads for the benchmark run
# Outputs:   results/{system,julia,cpu,storage,pci}.txt and julia.log in the
#            parent directory of this script's directory

# first arg = number of threads; fail with a usage hint instead of starting
# julia with an empty --threads= value.
i=${1:?usage: run_gen_diagram.sh <threads>}

# Directory containing this script, resolved independently of the caller's cwd.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
LOG_FILE="$SCRIPT_DIR/../julia.log"

# All relative paths below (results/, examples/) are relative to the parent
# directory, so stop if it cannot be entered.
cd "$SCRIPT_DIR/.." || {
    echo "cannot cd to $SCRIPT_DIR/.." >&2
    exit 1
}

echo "Writing system info..."

# collect some information of the used node and system
mkdir -p results
uname -a > results/system.txt
julia --version > results/julia.txt
lscpu > results/cpu.txt
lsblk > results/storage.txt
lspci > results/pci.txt

echo "Initiating julia..."
# Any failure during package setup aborts the script with status 1.
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> "$LOG_FILE" 2>&1 || exit 1 # need current dev version of QEDprocesses
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> "$LOG_FILE" 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
# NOTE(review): this call has no --project flag, unlike the two above -
# presumably the bench-script requirements are meant to go into Julia's
# default environment; confirm.
julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("BenchmarkTools"); Pkg.add("StatsBase")' >> "$LOG_FILE" 2>&1 || exit 1 # add requirements for the bench script

echo "Benchmarking with $i threads..."

# A failing benchmark only prints a hint; the details are in the log file.
julia --project -O3 --threads="$i" examples/qed_gen_bench.jl >> "$LOG_FILE" 2>&1 || echo "-- Something went wrong, check logs --"
|
31
experiments/run_qed_exec.sh
Executable file
31
experiments/run_qed_exec.sh
Executable file
@ -0,0 +1,31 @@
|
||||
#!/bin/bash

# Runs examples/qed_bench.jl and examples/qed_bench_tape.jl with a given
# number of Julia threads.
#
# Arguments: $1 - number of Julia threads; also used as suffix for the
#                 result and log file names
# Outputs:   results/*_<threads>.txt and julia_<threads>.log in the parent
#            directory of this script's directory

# first arg = number of threads; fail with a usage hint instead of starting
# julia with an empty --threads= value.
i=${1:?usage: run_qed_exec.sh <threads>}

# Directory containing this script, resolved independently of the caller's cwd.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
LOG_FILE="$SCRIPT_DIR/../julia_$i.log"

# All relative paths below (results/, examples/) are relative to the parent
# directory, so stop if it cannot be entered.
cd "$SCRIPT_DIR/.." || {
    echo "cannot cd to $SCRIPT_DIR/.." >&2
    exit 1
}

echo "Writing system info..."

# collect some information of the used node and system
mkdir -p results
uname -a > "results/system_$i.txt"
julia --version > "results/julia_$i.txt"
lscpu > "results/cpu_$i.txt"
nvidia-smi > "results/cuda_gpu_$i.txt"
lsblk > "results/storage_$i.txt"
lspci > "results/pci_$i.txt"

echo "Initiating julia..."
# One-time Julia environment setup, currently disabled:
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> "$LOG_FILE" 2>&1 || exit 1 # need current dev version of QEDprocesses
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> "$LOG_FILE" 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> "$LOG_FILE" 2>&1 || exit 1 # add requirements for the bench script
#julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> "$LOG_FILE" 2>&1 || echo "Failed to set CUDA version number"

# Both benchmarks are best-effort: a failure prints a hint and the script
# carries on; the details are in the log file.
echo "Benchmarking $i Threads"
julia --project -O3 --threads="$i" examples/qed_bench.jl >> "$LOG_FILE" 2>&1 || echo "-- Something went wrong, check logs --"

echo "Benchmarking Tape variant $i Threads"
julia --project -O3 --threads="$i" examples/qed_bench_tape.jl >> "$LOG_FILE" 2>&1 || echo "-- Something went wrong, check logs --"
|
24
experiments/run_reduce_bench.sh
Executable file
24
experiments/run_reduce_bench.sh
Executable file
@ -0,0 +1,24 @@
|
||||
#!/bin/bash

# Runs examples/qed_bench_reduction_steps.jl with 32 Julia threads after
# writing a snapshot of the system information into
# results/*_bench_reduce.txt. All Julia output is appended to
# julia_bench_reduce.log in the parent directory of this script's directory.

# Directory containing this script, resolved independently of the caller's cwd.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
LOG_FILE="$SCRIPT_DIR/../julia_bench_reduce.log"

# All relative paths below (results/, examples/) are relative to the parent
# directory, so stop if it cannot be entered.
cd "$SCRIPT_DIR/.." || {
    echo "cannot cd to $SCRIPT_DIR/.." >&2
    exit 1
}

echo "Writing system info..."

# collect some information of the used node and system
mkdir -p results
uname -a > results/system_bench_reduce.txt
julia --version > results/julia_bench_reduce.txt
lscpu > results/cpu_bench_reduce.txt
# If nvidia-smi is not available, this leaves an empty file and an error on
# stderr; the script continues.
nvidia-smi > results/cuda_gpu_bench_reduce.txt
lsblk > results/storage_bench_reduce.txt
lspci > results/pci_bench_reduce.txt

# One-time Julia environment setup, currently disabled:
#echo "Initiating julia..."
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> "$LOG_FILE" 2>&1 || exit 1 # need current dev version of QEDprocesses
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> "$LOG_FILE" 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> "$LOG_FILE" 2>&1 || exit 1 # add requirements for the bench script

echo "Benchmarking Reduction 32 Threads"
# A failing benchmark only prints a hint; the details are in the log file.
julia --project -O3 --threads=32 examples/qed_bench_reduction_steps.jl >> "$LOG_FILE" 2>&1 || echo "-- Something went wrong, check logs --"
|
25
experiments/run_reduce_bench_gpu.sh
Executable file
25
experiments/run_reduce_bench_gpu.sh
Executable file
@ -0,0 +1,25 @@
|
||||
#!/bin/bash

# Runs examples/qed_bench_reduction_steps_gpu.jl with 32 Julia threads after
# writing a snapshot of the system information into
# results/*_bench_reduce_gpu.txt. All Julia output is appended to
# julia_bench_reduce_gpu.log in the parent directory of this script's
# directory.

# Directory containing this script, resolved independently of the caller's cwd.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
LOG_FILE="$SCRIPT_DIR/../julia_bench_reduce_gpu.log"

# All relative paths below (results/, examples/) are relative to the parent
# directory, so stop if it cannot be entered.
cd "$SCRIPT_DIR/.." || {
    echo "cannot cd to $SCRIPT_DIR/.." >&2
    exit 1
}

echo "Writing system info..."

# collect some information of the used node and system
mkdir -p results
uname -a > results/system_bench_reduce_gpu.txt
julia --version > results/julia_bench_reduce_gpu.txt
lscpu > results/cpu_bench_reduce_gpu.txt
nvidia-smi > results/cuda_gpu_bench_reduce_gpu.txt
lsblk > results/storage_bench_reduce_gpu.txt
lspci > results/pci_bench_reduce_gpu.txt

# One-time Julia environment setup, currently disabled:
#echo "Initiating julia..."
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> "$LOG_FILE" 2>&1 || exit 1 # need current dev version of QEDprocesses
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> "$LOG_FILE" 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> "$LOG_FILE" 2>&1 || exit 1 # add requirements for the bench script
#julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> "$LOG_FILE" 2>&1 || echo "Failed to set CUDA version number"

echo "Benchmarking Reduction 32 Threads, *GPU*"
# A failing benchmark only prints a hint; the details are in the log file.
julia --project -O3 --threads=32 examples/qed_bench_reduction_steps_gpu.jl >> "$LOG_FILE" 2>&1 || echo "-- Something went wrong, check logs --"
|
Reference in New Issue
Block a user