experiments #1

Merged
rubydragon merged 39 commits from experiments into main 2024-05-08 12:03:28 +02:00
5 changed files with 1878 additions and 2383 deletions
Showing only changes of commit ae99be7207

View File

@@ -18,12 +18,14 @@ df = DataFrame(
     graph_ce = Float64[],
     graph_dt = Float64[],
     graph_ci = Float64[],
-    gen_func_t = Float64[],
-    cpu_compile_t = Float64[],
     cpu_st_t = Float64[],
+    cpu_st_s = Float64[],
     cpu_mt_t = Float64[],
-    gpu_compile_t = Float64[],
+    cpu_mt_s = Float64[],
+    cpu_mem = Float64[],
     gpu_t = Float64[],
+    gpu_s = Float64[],
     gpu_mem = Float64[],
 )
 # if they exist, read existing results and append new ones
@@ -36,26 +38,53 @@ function log(x...)
     return flush(stdout)
 end

-function bench(func, inputs)
-    # todo: use gpu kernel instead of broadcasting
-    gpu_compile_time = @elapsed func.(inputs[1:2])
+function bench(func, kernel!, inputs)
+    # gpu part
+    n = length(inputs)
+    cu_inputs = CuVector(inputs)
+    cu_outputs = CuVector{ComplexF64}()
+    resize!(cu_outputs, n)
+    ts = 32
+    bs = Int(n / ts)
+    bench = @benchmark begin
+        @cuda threads = $ts blocks = $bs always_inline = true $kernel!($cu_inputs, $cu_outputs, $n)
+        CUDA.device_synchronize()
+    end gcsample = true samples = 20 evals = 1
+    gpu_time = median(bench.times) / 1e9
+    gpu_std = std(bench.times) / 1e9
+    gpu_mem = bench.memory
+
+    # cpu part
+    single_thread = @benchmark $func.($inputs)
+    multi_threaded = @benchmark Threads.@threads for i in eachindex($inputs)
+        $func($inputs[i])
+    end
+
+    cpu_st_time = median(single_thread.times) / 1e9
+    cpu_st_std = std(single_thread.times) / 1e9
+    cpu_mt_time = median(multi_threaded.times) / 1e9
+    cpu_mt_std = std(multi_threaded.times) / 1e9
+    cpu_mem = single_thread.memory

-    gpu_time = @benchmark $func.($inputs)
     return (
-        cpu_compile_time = 0.0,
-        gpu_compile_time = gpu_compile_time,
-        cpu_single_thread_time = 0.0,
-        cpu_multi_thread_time = 0.0,
-        gpu_time = mean(gpu_time.times) / 1e9,
+        cpu_single_thread_time = cpu_st_time,
+        cpu_single_thread_std = cpu_st_std,
+        cpu_multi_thread_time = cpu_mt_time,
+        cpu_multi_thread_std = cpu_mt_std,
+        cpu_mem = cpu_mem,
+        gpu_time = gpu_time,
+        gpu_std = gpu_std,
+        gpu_mem = gpu_mem,
     )
 end

 log("Available CUDA devices:")
 for dev in CUDA.devices()
-    log("CUDA device: $(dev)")
+    display(dev)
 end

 # preparation of machine
 machine = Machine(
     [
@@ -72,9 +101,9 @@ machine = Machine(
 # bench and produce data
-n_inputs = 50_000
+n_inputs = 2^16
 optimizer = ReductionOptimizer()

-processes = [("ke->kke", 50), ("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1)]
+processes = [("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1), ("ke->kkkke", 5)]

 for (process_str, STEPSIZE) in processes
     n = 0
@@ -82,13 +111,14 @@ for (process_str, STEPSIZE) in processes
     process = parse_process(process_str, QEDModel())
     graph = gen_graph(process)
-    inputs = CuVector([gen_process_input(process) for _ in 1:n_inputs])
+    inputs = Vector([gen_process_input(process) for _ in 1:n_inputs])
     get_compute_function(graph, process, machine)

     while true
-        func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
-        res = bench(func, inputs)
+        func = get_compute_function(graph, process, machine)
+        kernel! = get_cuda_kernel(graph, process, machine)
+        res = bench(func, kernel!, inputs)

         graph_properties = get_properties(graph)
         push!(
@@ -103,12 +133,14 @@ for (process_str, STEPSIZE) in processes
             graph_properties.computeEffort,
             graph_properties.data,
             graph_properties.computeIntensity,
-            func_gen_time,
-            res.cpu_compile_time,
             res.cpu_single_thread_time,
+            res.cpu_single_thread_std,
             res.cpu_multi_thread_time,
-            res.gpu_compile_time,
+            res.cpu_multi_thread_std,
+            res.cpu_mem,
             res.gpu_time,
+            res.gpu_std,
+            res.gpu_mem,
         ),
     )
     CSV.write(results_filename, df)
@@ -130,13 +162,14 @@ for (process_str, STEPSIZE) in [("AB->AB", 1), ("AB->ABBB", 1), ("AB->ABBBBB", 1
     process = parse_process(process_str, ABCModel())
     graph = parse_dag("input/$process_str.txt", ABCModel())
-    inputs = CuVector([gen_process_input(process) for _ in 1:n_inputs])
+    inputs = Vector([gen_process_input(process) for _ in 1:n_inputs])
     get_compute_function(graph, process, machine)

     while true
-        func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
-        res = bench(func, inputs)
+        func = get_compute_function(graph, process, machine)
+        kernel! = get_cuda_kernel(graph, process, machine)
+        res = bench(func, kernel!, inputs)

         graph_properties = get_properties(graph)
         push!(
@@ -151,12 +184,14 @@ for (process_str, STEPSIZE) in [("AB->AB", 1), ("AB->ABBB", 1), ("AB->ABBBBB", 1
             graph_properties.computeEffort,
             graph_properties.data,
             graph_properties.computeIntensity,
-            func_gen_time,
-            res.cpu_compile_time,
             res.cpu_single_thread_time,
+            res.cpu_single_thread_std,
             res.cpu_multi_thread_time,
-            res.gpu_compile_time,
+            res.cpu_multi_thread_std,
+            res.cpu_mem,
             res.gpu_time,
+            res.gpu_std,
+            res.gpu_mem,
         ),
     )
     CSV.write(results_filename, df)
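
A note on the new benchmarking approach in this file: bench(func, kernel!, inputs) now times a real CUDA kernel launch instead of broadcasting func over a CuVector, and synchronizes the device inside the timed region. A minimal self-contained sketch of the same pattern follows; my_kernel! and the Float64 data are illustrative placeholders, the real kernel comes from get_cuda_kernel:

using CUDA, BenchmarkTools, Statistics

# placeholder kernel: one thread per element, like the generated kernels
function my_kernel!(inputs, outputs, n)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= n
        @inbounds outputs[i] = 2.0 * inputs[i]
    end
    return nothing
end

n = 2^16                        # divisible by the thread count, unlike the old 50_000
cu_in = CUDA.rand(Float64, n)
cu_out = CuVector{Float64}(undef, n)
ts = 32                         # threads per block
bs = n ÷ ts                     # number of blocks
trial = @benchmark begin
    @cuda threads = $ts blocks = $bs my_kernel!($cu_in, $cu_out, $n)
    CUDA.device_synchronize()   # @cuda returns immediately; sync before the timer stops
end gcsample = true samples = 20 evals = 1
println(median(trial.times) / 1e9, " s")

The device_synchronize() inside the benchmark body is what makes the numbers meaningful; without it the trial would mostly measure launch overhead. The switch from n_inputs = 50_000 to 2^16 fits the same launch configuration: bs = Int(n / ts) requires n to be divisible by ts = 32, which 50_000 is not.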

View File

@@ -2,12 +2,12 @@
 #SBATCH --job-name=qed_bench
 #SBATCH --partition=casus_a100
 #SBATCH --account=casus
-#SBATCH --time=48:00:00
+#SBATCH --time=16:00:00
 #SBATCH --nodes=1
 #SBATCH --ntasks=1
 #SBATCH --cpus-per-task=32
 #SBATCH --gres=gpu:1
-#SBATCH --mem=24GB
+#SBATCH --mem=256GB
 #SBATCH --output=simulation-%A-%a.out
 #SBATCH --error=simulation-%A-%a.err
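
Two resource changes here: walltime drops from 48 to 16 hours while memory grows from 24 GB to 256 GB, presumably reflecting the larger input batches (2^16 inputs per process) and the faster GPU benchmarking path. Note that --cpus-per-task=32 has to stay in sync with the hard-coded --threads=32 in the run scripts; a small consistency check inside Julia (a sketch, not part of this PR) could read the allocation from the environment SLURM provides:

# SLURM exports the per-task CPU allocation to the job environment
ncpus = parse(Int, get(ENV, "SLURM_CPUS_PER_TASK", "1"))
# the thread count is fixed at startup, so this can only warn, not fix it
if Threads.nthreads() != ncpus
    @warn "Julia threads do not match SLURM allocation" Threads.nthreads() ncpus
end

Alternatively the scripts could launch with julia --threads=$SLURM_CPUS_PER_TASK to remove the duplication.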

View File

@@ -15,10 +15,10 @@ nvidia-smi > results/cuda_gpu_bench_reduce.txt
 lsblk > results/storage_bench_reduce.txt
 lspci > results/pci_bench_reduce.txt
-echo "Initiating julia..."
-julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
-julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
-julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
+#echo "Initiating julia..."
+#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
+#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
+#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
 echo "Benchmarking Reduction 32 Threads"
 julia --project -O3 --threads=32 examples/qed_bench_reduction_steps.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
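
With the bootstrap lines commented out, the script assumes the project environment already carries the dev QEDprocesses, the patched QEDbase branch, and the benchmark packages. If the environment ever has to be rebuilt from scratch, a one-time Pkg session along these lines should reproduce the commented commands (a sketch; note that Pkg selects a branch via the rev keyword, not a .../tree/... URL):

using Pkg
Pkg.activate(".")
# dev version of QEDprocesses, plus the fix_bs_multiplication branch of QEDbase
Pkg.add(url = "https://github.com/QEDjl-project/QEDprocesses.jl/")
Pkg.add(url = "https://github.com/AntonReinhard/QEDbase.jl/", rev = "fix_bs_multiplication")
# packages the bench scripts use directly
Pkg.add(["CSV", "DataFrames", "LIKWID", "CUDA", "Random", "BenchmarkTools", "Dates"])
Pkg.instantiate()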

View File

@@ -15,11 +15,11 @@ nvidia-smi > results/cuda_gpu_bench_reduce_gpu.txt
 lsblk > results/storage_bench_reduce_gpu.txt
 lspci > results/pci_bench_reduce_gpu.txt
-echo "Initiating julia..."
-julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
-julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
-julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
-julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
+#echo "Initiating julia..."
+#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
+#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
+#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
+#julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
 echo "Benchmarking Reduction 32 Threads, *GPU*"
 julia --project --threads=32 examples/qed_bench_reduction_steps_gpu.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
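
The now-disabled set_runtime_version! line is the one-time setup that pins which CUDA runtime CUDA.jl uses; it writes a preference for the active project and only takes effect in the next Julia session, which is why it can stay commented out once the preference exists. A minimal sketch of inspecting and pinning the runtime (version number as in the script):

using CUDA

# show what the driver supports and what CUDA.jl currently uses
println("driver:  ", CUDA.driver_version())
println("runtime: ", CUDA.runtime_version())

# pin the runtime for this project; restart Julia for it to apply
if CUDA.runtime_version() != v"12.1"
    CUDA.set_runtime_version!(v"12.1")
end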

File diff suppressed because one or more lines are too long