Improve qed_bench_reduction_steps_gpu

2024-03-07 22:28:50 +01:00
parent 371467c2bc
commit ae99be7207
5 changed files with 1878 additions and 2383 deletions


@@ -18,12 +18,14 @@ df = DataFrame(
graph_ce = Float64[],
graph_dt = Float64[],
graph_ci = Float64[],
gen_func_t = Float64[],
cpu_compile_t = Float64[],
cpu_st_t = Float64[],
cpu_st_s = Float64[],
cpu_mt_t = Float64[],
gpu_compile_t = Float64[],
cpu_mt_s = Float64[],
cpu_mem = Float64[],
gpu_t = Float64[],
gpu_s = Float64[],
gpu_mem = Float64[],
)
# if they exist, read existing results and append new ones
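The comment above refers to logic outside this hunk; a minimal sketch of such an append step, assuming the script defines a results_filename variable elsewhere, could look like this:

if isfile(results_filename)
    # keep rows from previous runs so new results are appended to the same table
    df = CSV.read(results_filename, DataFrame)
end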
@@ -36,26 +38,53 @@ function log(x...)
return flush(stdout)
end
function bench(func, inputs)
# todo: use gpu kernel instead of broadcasting
gpu_compile_time = @elapsed func.(inputs[1:2])
function bench(func, kernel!, inputs)
# gpu part
n = length(inputs)
cu_inputs = CuVector(inputs)
cu_outputs = CuVector{ComplexF64}()
resize!(cu_outputs, n)
ts = 32
bs = Int(n / ts)
bench = @benchmark begin
@cuda threads = $ts blocks = $bs always_inline = true $kernel!($cu_inputs, $cu_outputs, $n)
CUDA.device_synchronize()
end gcsample = true samples = 20 evals = 1
gpu_time = median(bench.times) / 1e9
gpu_std = std(bench.times) / 1e9
gpu_mem = bench.memory
# cpu part
single_thread = @benchmark $func.($inputs)
multi_threaded = @benchmark Threads.@threads for i in eachindex($inputs)
$func($inputs[i])
end
cpu_st_time = median(single_thread.times) / 1e9
cpu_st_std = std(single_thread.times) / 1e9
cpu_mt_time = median(multi_threaded.times) / 1e9
cpu_mt_std = std(multi_threaded.times) / 1e9
cpu_mem = single_thread.memory
gpu_time = @benchmark $func.($inputs)
return (
cpu_compile_time = 0.0,
gpu_compile_time = gpu_compile_time,
cpu_single_thread_time = 0.0,
cpu_multi_thread_time = 0.0,
gpu_time = mean(gpu_time.times) / 1e9,
cpu_single_thread_time = cpu_st_time,
cpu_single_thread_std = cpu_st_std,
cpu_multi_thread_time = cpu_mt_time,
cpu_multi_thread_std = cpu_mt_std,
cpu_mem = cpu_mem,
gpu_time = gpu_time,
gpu_std = gpu_std,
gpu_mem = gpu_mem,
)
end
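The launch configuration inside bench assumes the input length is a multiple of ts = 32, since Int(n / ts) throws an InexactError otherwise; presumably this is why n_inputs is set to 2^16 further down. A more forgiving sketch of the same CUDA.jl launch, using a ceiling division, would be:

ts = 32
bs = cld(n, ts)  # round the block count up so any n is covered
# the kernel is then expected to bounds-check its thread index against n
@cuda threads = ts blocks = bs always_inline = true kernel!(cu_inputs, cu_outputs, n)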
log("Available CUDA devices:")
for dev in CUDA.devices()
log("CUDA device: $(dev)")
display(dev)
end
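The loop above only logs the visible devices; a small guard for machines without a usable GPU (a sketch, not part of this commit) could bail out before the benchmarks start:

if !CUDA.functional()
    log("No functional CUDA device available, skipping GPU benchmarks")
    exit(0)
end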
# preparation of machine
machine = Machine(
[
@@ -72,9 +101,9 @@ machine = Machine(
# bench and produce data
n_inputs = 50_000
n_inputs = 2^16
optimizer = ReductionOptimizer()
processes = [("ke->kke", 50), ("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1)]
processes = [("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1), ("ke->kkkke", 5)]
for (process_str, STEPSIZE) in processes
n = 0
@@ -82,13 +111,14 @@ for (process_str, STEPSIZE) in processes
process = parse_process(process_str, QEDModel())
graph = gen_graph(process)
inputs = CuVector([gen_process_input(process) for _ in 1:n_inputs])
inputs = Vector([gen_process_input(process) for _ in 1:n_inputs])
get_compute_function(graph, process, machine)
while true
func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
res = bench(func, inputs)
func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
res = bench(func, kernel!, inputs)
graph_properties = get_properties(graph)
push!(
@@ -103,12 +133,14 @@ for (process_str, STEPSIZE) in processes
graph_properties.computeEffort,
graph_properties.data,
graph_properties.computeIntensity,
func_gen_time,
res.cpu_compile_time,
res.cpu_single_thread_time,
res.cpu_single_thread_std,
res.cpu_multi_thread_time,
res.gpu_compile_time,
res.cpu_multi_thread_std,
res.cpu_mem,
res.gpu_time,
res.gpu_std,
res.gpu_mem,
),
)
CSV.write(results_filename, df)
@@ -130,13 +162,14 @@ for (process_str, STEPSIZE) in [("AB->AB", 1), ("AB->ABBB", 1), ("AB->ABBBBB", 1
process = parse_process(process_str, ABCModel())
graph = parse_dag("input/$process_str.txt", ABCModel())
inputs = CuVector([gen_process_input(process) for _ in 1:n_inputs])
inputs = Vector([gen_process_input(process) for _ in 1:n_inputs])
get_compute_function(graph, process, machine)
while true
func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
res = bench(func, inputs)
func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
res = bench(func, kernel!, inputs)
graph_properties = get_properties(graph)
push!(
@@ -151,12 +184,14 @@ for (process_str, STEPSIZE) in [("AB->AB", 1), ("AB->ABBB", 1), ("AB->ABBBBB", 1
graph_properties.computeEffort,
graph_properties.data,
graph_properties.computeIntensity,
func_gen_time,
res.cpu_compile_time,
res.cpu_single_thread_time,
res.cpu_single_thread_std,
res.cpu_multi_thread_time,
res.gpu_compile_time,
res.cpu_multi_thread_std,
res.cpu_mem,
res.gpu_time,
res.gpu_std,
res.gpu_mem,
),
)
CSV.write(results_filename, df)
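Since bench compares single- and multi-threaded CPU timings, the cpu_mt_* columns are only meaningful when Julia is started with several threads (for example with julia -t auto); a small sanity check along these lines, an assumption rather than part of the commit, could be placed near the top of the script:

if Threads.nthreads() == 1
    log("Warning: only one Julia thread is active, multi-threaded timings will match the single-threaded ones")
end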