Improve qed_bench_reduction_steps_gpu

parent 371467c2bc
commit ae99be7207
@@ -18,12 +18,14 @@ df = DataFrame(
     graph_ce = Float64[],
     graph_dt = Float64[],
     graph_ci = Float64[],
-    gen_func_t = Float64[],
-    cpu_compile_t = Float64[],
     cpu_st_t = Float64[],
+    cpu_st_s = Float64[],
     cpu_mt_t = Float64[],
-    gpu_compile_t = Float64[],
+    cpu_mt_s = Float64[],
+    cpu_mem = Float64[],
     gpu_t = Float64[],
+    gpu_s = Float64[],
+    gpu_mem = Float64[],
 )
 
 # if they exist, read existing results and append new ones
@@ -36,26 +38,53 @@ function log(x...)
     return flush(stdout)
 end
 
-function bench(func, inputs)
-    # todo: use gpu kernel instead of broadcasting
-    gpu_compile_time = @elapsed func.(inputs[1:2])
+function bench(func, kernel!, inputs)
+    # gpu part
+    n = length(inputs)
+    cu_inputs = CuVector(inputs)
+    cu_outputs = CuVector{ComplexF64}()
+    resize!(cu_outputs, n)
+    ts = 32
+    bs = Int(n / ts)
+    bench = @benchmark begin
+        @cuda threads = $ts blocks = $bs always_inline = true $kernel!($cu_inputs, $cu_outputs, $n)
+        CUDA.device_synchronize()
+    end gcsample = true samples = 20 evals = 1
+
+    gpu_time = median(bench.times) / 1e9
+    gpu_std = std(bench.times) / 1e9
+    gpu_mem = bench.memory
+
+    # cpu part
+    single_thread = @benchmark $func.($inputs)
+    multi_threaded = @benchmark Threads.@threads for i in eachindex($inputs)
+        $func($inputs[i])
+    end
+
+    cpu_st_time = median(single_thread.times) / 1e9
+    cpu_st_std = std(single_thread.times) / 1e9
+    cpu_mt_time = median(multi_threaded.times) / 1e9
+    cpu_mt_std = std(multi_threaded.times) / 1e9
+    cpu_mem = single_thread.memory
 
-    gpu_time = @benchmark $func.($inputs)
 
     return (
-        cpu_compile_time = 0.0,
-        gpu_compile_time = gpu_compile_time,
-        cpu_single_thread_time = 0.0,
-        cpu_multi_thread_time = 0.0,
-        gpu_time = mean(gpu_time.times) / 1e9,
+        cpu_single_thread_time = cpu_st_time,
+        cpu_single_thread_std = cpu_st_std,
+        cpu_multi_thread_time = cpu_mt_time,
+        cpu_multi_thread_std = cpu_mt_std,
+        cpu_mem = cpu_mem,
+        gpu_time = gpu_time,
+        gpu_std = gpu_std,
+        gpu_mem = gpu_mem,
     )
 end
 
 log("Available CUDA devices:")
 for dev in CUDA.devices()
     log("CUDA device: $(dev)")
     display(dev)
 end
 
 
 # preparation of machine
 machine = Machine(
     [
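Note on the new GPU path: BenchmarkTools reports `bench.times` in nanoseconds, hence the `/ 1e9` conversions, and the `@cuda threads = $ts blocks = $bs` launch covers exactly n = ts * bs inputs, one thread each. The real kernel comes from `get_cuda_kernel`; the stand-in below is only a sketch of the launch-compatible signature and indexing that launch shape assumes, with a placeholder body for numeric inputs rather than the generated code.

using CUDA

# Hypothetical stand-in for the kernel returned by get_cuda_kernel.
# Only the signature and the one-thread-per-input indexing match the
# launch above; the body is a placeholder assuming numeric inputs.
function example_kernel!(inputs, outputs, n)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x  # 1-based global thread index
    if i <= n
        @inbounds outputs[i] = ComplexF64(2 * inputs[i])   # placeholder computation
    end
    return nothing
end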
@@ -72,9 +101,9 @@ machine = Machine(
 
 
 # bench and produce data
-n_inputs = 50_000
+n_inputs = 2^16
 optimizer = ReductionOptimizer()
-processes = [("ke->kke", 50), ("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1)]
+processes = [("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1), ("ke->kkkke", 5)]
 
 for (process_str, STEPSIZE) in processes
     n = 0
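The move from 50_000 to 2^16 inputs is tied to the kernel launch in bench: `bs = Int(n / ts)` only succeeds when the input count divides evenly by the 32-thread block size. A quick check of the arithmetic:

n, ts = 2^16, 32
bs = Int(n / ts)    # exactly 2048 blocks
# Int(50_000 / ts)  # the old count would throw InexactError: 1562.5 is not an integer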
@@ -82,13 +111,14 @@ for (process_str, STEPSIZE) in processes
 
     process = parse_process(process_str, QEDModel())
     graph = gen_graph(process)
-    inputs = CuVector([gen_process_input(process) for _ in 1:n_inputs])
+    inputs = Vector([gen_process_input(process) for _ in 1:n_inputs])
 
     get_compute_function(graph, process, machine)
 
     while true
-        func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
-        res = bench(func, inputs)
+        func = get_compute_function(graph, process, machine)
+        kernel! = get_cuda_kernel(graph, process, machine)
+        res = bench(func, kernel!, inputs)
 
         graph_properties = get_properties(graph)
         push!(
@@ -103,12 +133,14 @@ for (process_str, STEPSIZE) in processes
             graph_properties.computeEffort,
             graph_properties.data,
             graph_properties.computeIntensity,
-            func_gen_time,
-            res.cpu_compile_time,
             res.cpu_single_thread_time,
+            res.cpu_single_thread_std,
             res.cpu_multi_thread_time,
-            res.gpu_compile_time,
+            res.cpu_multi_thread_std,
+            res.cpu_mem,
             res.gpu_time,
+            res.gpu_std,
+            res.gpu_mem,
         ),
     )
     CSV.write(results_filename, df)
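`push!` with a positional tuple fills DataFrame columns strictly left to right, which is why the tuple above must mirror the column order declared in the DataFrame at the top of this script. A minimal illustration of the pattern, using a hypothetical three-column frame rather than the script's full schema:

using DataFrames

df = DataFrame(gpu_t = Float64[], gpu_s = Float64[], gpu_mem = Float64[])
push!(df, (1.23e-3, 4.5e-5, 1024.0))  # tuple order must match: gpu_t, gpu_s, gpu_mem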
@@ -130,13 +162,14 @@ for (process_str, STEPSIZE) in [("AB->AB", 1), ("AB->ABBB", 1), ("AB->ABBBBB", 1
 
     process = parse_process(process_str, ABCModel())
     graph = parse_dag("input/$process_str.txt", ABCModel())
-    inputs = CuVector([gen_process_input(process) for _ in 1:n_inputs])
+    inputs = Vector([gen_process_input(process) for _ in 1:n_inputs])
 
     get_compute_function(graph, process, machine)
 
     while true
-        func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
-        res = bench(func, inputs)
+        func = get_compute_function(graph, process, machine)
+        kernel! = get_cuda_kernel(graph, process, machine)
+        res = bench(func, kernel!, inputs)
 
         graph_properties = get_properties(graph)
         push!(
@@ -151,12 +184,14 @@ for (process_str, STEPSIZE) in [("AB->AB", 1), ("AB->ABBB", 1), ("AB->ABBBBB", 1
             graph_properties.computeEffort,
             graph_properties.data,
             graph_properties.computeIntensity,
-            func_gen_time,
-            res.cpu_compile_time,
             res.cpu_single_thread_time,
+            res.cpu_single_thread_std,
             res.cpu_multi_thread_time,
-            res.gpu_compile_time,
+            res.cpu_multi_thread_std,
+            res.cpu_mem,
             res.gpu_time,
+            res.gpu_std,
+            res.gpu_mem,
         ),
     )
     CSV.write(results_filename, df)
@@ -2,12 +2,12 @@
 #SBATCH --job-name=qed_bench
 #SBATCH --partition=casus_a100
 #SBATCH --account=casus
-#SBATCH --time=48:00:00
+#SBATCH --time=16:00:00
 #SBATCH --nodes=1
 #SBATCH --ntasks=1
 #SBATCH --cpus-per-task=32
 #SBATCH --gres=gpu:1
-#SBATCH --mem=24GB
+#SBATCH --mem=256GB
 #SBATCH --output=simulation-%A-%a.out
 #SBATCH --error=simulation-%A-%a.err
 
@@ -15,10 +15,10 @@ nvidia-smi > results/cuda_gpu_bench_reduce.txt
 lsblk > results/storage_bench_reduce.txt
 lspci > results/pci_bench_reduce.txt
 
-echo "Initiating julia..."
-julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
-julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
-julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
+#echo "Initiating julia..."
+#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
+#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
+#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
 
 echo "Benchmarking Reduction 32 Threads"
 julia --project -O3 --threads=32 examples/qed_bench_reduction_steps.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
@@ -15,11 +15,11 @@ nvidia-smi > results/cuda_gpu_bench_reduce_gpu.txt
 lsblk > results/storage_bench_reduce_gpu.txt
 lspci > results/pci_bench_reduce_gpu.txt
 
-echo "Initiating julia..."
-julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
-julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
-julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
-julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
+#echo "Initiating julia..."
+#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
+#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
+#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
+#julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
 
 echo "Benchmarking Reduction 32 Threads, *GPU*"
 julia --project --threads=32 examples/qed_bench_reduction_steps_gpu.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
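The job script pins 32 CPUs and one GPU on the casus_a100 partition; a quick sanity check from inside the benchmarked Julia session that the allocation actually reached the process (hypothetical snippet, not part of the commit):

using CUDA

println("Julia threads: ", Threads.nthreads())        # expect 32 with --threads=32
println("CUDA device:   ", CUDA.name(CUDA.device()))  # expect an NVIDIA A100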
File diff suppressed because one or more lines are too long