Use CUDA kernels in bench scripts

This commit is contained in:
2024-03-07 00:08:04 +01:00
parent 2e16b0dca7
commit d036f21862
11 changed files with 236 additions and 124 deletions

View File

@ -44,7 +44,7 @@ if isfile(results_filename)
df = CSV.read(results_filename, DataFrame)
end
nInputs = 10_000_000
nInputs = 2^24
function cpu_bench(compute_function, inputs)
bench = @benchmark begin
@ -60,9 +60,15 @@ function cpu_bench(compute_function, inputs)
return (time, rate, s)
end
function gpu_bench(compute_function, inputs)
function gpu_bench(kernel!, inputs)
n = length(inputs)
outputs = CuVector{ComplexF64}()
resize!(outputs, n)
ts = 32
bs = Int(n / ts)
bench = @benchmark begin
CUDA.@sync $compute_function.($inputs)
@cuda threads = ts blocks = bs always_inline = true kernel!.($inputs, $outputs, $n)
CUDA.device_synchronize()
end gcsample = true seconds = 300
time = median(bench.times) / 1e9
@ -77,6 +83,7 @@ function bench_process(
process_name::String,
graph::DAG,
func,
kernel!,
gen_time::Float64,
opt_time::Float64,
func_time::Float64;
@ -131,7 +138,7 @@ function bench_process(
log("Benchmarking GPU...")
gpu_name = "$(name(first(CUDA.devices())))"
cuInputs = CuArray(inputs)
(time_gpu, rate_gpu, std_gpu) = gpu_bench(func, cuInputs)
(time_gpu, rate_gpu, std_gpu) = gpu_bench(kernel!, cuInputs)
flops_gpu = (rate_gpu * NFLOPs) / 10^9
else
log("Skipping GPU...")
@ -211,7 +218,8 @@ process = parse_process("ke->kke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
opt_time = @elapsed optimize!(optimizer, graph, 200)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "warmup", graph, compute_func, gen_time, opt_time, func_gen_time)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
optimizer = ReductionOptimizer()
@ -220,104 +228,45 @@ process = parse_process("AB->ABBB", ABCModel())
gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "warmup", graph, compute_func, gen_time, opt_time, func_gen_time)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
## -- WARMUP END
optimizer = ReductionOptimizer()
# compton
process = parse_process("ke->ke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke", "ke->kkkkkke"]
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
for process_str in processes
# compton
process = parse_process(process_str, QEDModel())
gen_time = @elapsed graph = gen_graph(process)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
CSV.write(results_filename, df)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
# 2-photon compton
process = parse_process("ke->kke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
CSV.write(results_filename, df)
end
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
processes = ["AB->AB", "AB->ABBB", "AB->ABBBBB"]
CSV.write(results_filename, df)
for process_str in processes
# AB->AB
process = parse_process(process_str, ABCModel())
gen_time = @elapsed graph = parse_dag("input/$(process_str).txt", ABCModel())
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
# 3-photon compton
process = parse_process("ke->kkke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
CSV.write(results_filename, df)
# 4-photon compton
process = parse_process("ke->kkkke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time, use_gpu = false)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
CSV.write(results_filename, df)
# 5-photon compton
process = parse_process("ke->kkkkke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
CSV.write(results_filename, df)
# AB->AB
process = parse_process("AB->AB", ABCModel())
gen_time = @elapsed graph = parse_dag("input/AB->AB.txt", ABCModel())
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
CSV.write(results_filename, df)
# AB->AB^3
process = parse_process("AB->ABBB", ABCModel())
gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
CSV.write(results_filename, df)
# AB->AB^5
process = parse_process("AB->ABBBBB", ABCModel())
gen_time = @elapsed graph = parse_dag("input/AB->ABBBBB.txt", ABCModel())
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
CSV.write(results_filename, df)
CSV.write(results_filename, df)
end