Use CUDA kernels in bench scripts
This commit is contained in:
@ -80,10 +80,13 @@ function cpu_worker(compute_func, inputs, chunk_size)
|
||||
end
|
||||
|
||||
# Must be called with the target CUDA device already selected, e.g. from inside a `device!(dev) do ... end` block.
|
||||
function gpu_worker(compute_func, inputs, chunk_size)
|
||||
function gpu_worker(kernel!, inputs, chunk_size)
|
||||
global progress
|
||||
global gpu_chunks
|
||||
global lck
|
||||
cuOutputs = CuVector{ComplexF64}()
|
||||
resize!(cuOutputs, chunk_size)
|
||||
|
||||
quit = false
|
||||
work_start = 0
|
||||
work_end = 0
|
||||
@ -104,7 +107,9 @@ function gpu_worker(compute_func, inputs, chunk_size)
|
||||
end
|
||||
|
||||
cuInputs = CuVector(inputs[work_start:work_end])
|
||||
compute_func.(cuInputs)
|
||||
ts = 32
|
||||
bs = Int(chunk_size / 32)
|
||||
CUDA.@sync threads = ts blocks = bs always_inline = true kernel!(cuInputs, cuOutputs, chunk_size)
|
||||
end
|
||||
|
||||
#log("GPU Worker on Device $(CUDA.device()) finished!")
|
||||
@ -114,7 +119,7 @@ end
|
||||
|
||||
cpu_gpu_ratio = Vector{Tuple{Int, Int}}()
|
||||
|
||||
function full_compute(compute_func, inputs, chunk_size)
|
||||
function full_compute(compute_func, kernel!, inputs, chunk_size)
|
||||
global progress
|
||||
progress = 1
|
||||
global cpu_chunks
|
||||
@ -126,7 +131,7 @@ function full_compute(compute_func, inputs, chunk_size)
|
||||
|
||||
for dev in CUDA.devices()
|
||||
t = Threads.@spawn device!(dev) do
|
||||
gpu_worker(compute_func, inputs, chunk_size)
|
||||
gpu_worker(kernel!, inputs, chunk_size)
|
||||
return nothing
|
||||
end
|
||||
push!(tasks, t)
|
||||
@ -145,12 +150,12 @@ function full_compute(compute_func, inputs, chunk_size)
|
||||
return nothing
|
||||
end
|
||||
|
||||
function bench(compute_function, inputs, chunk_size)
|
||||
function bench(compute_function, kernel!, inputs, chunk_size)
|
||||
global cpu_gpu_ratio
|
||||
empty!(cpu_gpu_ratio)
|
||||
|
||||
bench = @benchmark begin
|
||||
full_compute($compute_function, $inputs, $chunk_size)
|
||||
full_compute($compute_function, $kernel!, $inputs, $chunk_size)
|
||||
end gcsample = true seconds = 30
|
||||
|
||||
time = median(bench.times) / 1e9
|
||||
@ -165,7 +170,7 @@ function bench(compute_function, inputs, chunk_size)
|
||||
return (time, rate, s, med_cpu_chunks, med_gpu_chunks)
|
||||
end
|
||||
|
||||
function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, chunk_size, inputs)
|
||||
function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, kernel!, chunk_size, inputs)
|
||||
process_name = string(process)
|
||||
log("\n--- Benchmarking $(process_name) on $(nInputs) with chunk size $(chunk_size) ---")
|
||||
|
||||
@ -173,7 +178,7 @@ function full_node_bench(process::MetagraphOptimization.AbstractProcessDescripti
|
||||
display.(CUDA.devices())
|
||||
|
||||
log("Benchmarking full node...")
|
||||
(time, rate, s, med_cpu_chunks, med_gpu_chunks) = bench(func, inputs, chunk_size)
|
||||
(time, rate, s, med_cpu_chunks, med_gpu_chunks) = bench(func, kernel!, inputs, chunk_size)
|
||||
log(
|
||||
"Benchmarking complete with median time $(time), $(med_cpu_chunks) cpu chunks, and $(med_gpu_chunks) gpu chunks.",
|
||||
)
|
||||
@ -212,14 +217,14 @@ machine = Machine(
|
||||
)
|
||||
|
||||
optimizer = ReductionOptimizer()
|
||||
processes = [#="ke->ke", "ke->kke", "ke->kkke", =#"ke->kkkke", "ke->kkkkke"]
|
||||
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]
|
||||
|
||||
for proc in processes
|
||||
process = parse_process(proc, QEDModel())
|
||||
graph = gen_graph(process)
|
||||
optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
|
||||
compute_func = get_compute_function(graph, process, machine)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
|
||||
log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
|
||||
inputs = Vector{typeof(gen_process_input(process))}()
|
||||
@ -234,7 +239,7 @@ for proc in processes
|
||||
end
|
||||
|
||||
for chunk_size in chunkSizes
|
||||
full_node_bench(process, compute_func, chunk_size, inputs)
|
||||
full_node_bench(process, compute_func, kernel!, chunk_size, inputs)
|
||||
CSV.write(results_filename, df)
|
||||
end
|
||||
end;
|
||||
|
@ -44,7 +44,7 @@ if isfile(results_filename)
|
||||
df = CSV.read(results_filename, DataFrame)
|
||||
end
|
||||
|
||||
nInputs = 10_000_000
|
||||
nInputs = 2^24
|
||||
|
||||
function cpu_bench(compute_function, inputs)
|
||||
bench = @benchmark begin
|
||||
@ -60,9 +60,15 @@ function cpu_bench(compute_function, inputs)
|
||||
return (time, rate, s)
|
||||
end
|
||||
|
||||
function gpu_bench(compute_function, inputs)
|
||||
function gpu_bench(kernel!, inputs)
|
||||
n = length(inputs)
|
||||
outputs = CuVector{ComplexF64}()
|
||||
resize!(outputs, n)
|
||||
ts = 32
|
||||
bs = Int(n / ts)
|
||||
bench = @benchmark begin
|
||||
CUDA.@sync $compute_function.($inputs)
|
||||
@cuda threads = ts blocks = bs always_inline = true kernel!.($inputs, $outputs, $n)
|
||||
CUDA.device_synchronize()
|
||||
end gcsample = true seconds = 300
|
||||
|
||||
time = median(bench.times) / 1e9
|
||||
@ -77,6 +83,7 @@ function bench_process(
|
||||
process_name::String,
|
||||
graph::DAG,
|
||||
func,
|
||||
kernel!,
|
||||
gen_time::Float64,
|
||||
opt_time::Float64,
|
||||
func_time::Float64;
|
||||
@ -131,7 +138,7 @@ function bench_process(
|
||||
log("Benchmarking GPU...")
|
||||
gpu_name = "$(name(first(CUDA.devices())))"
|
||||
cuInputs = CuArray(inputs)
|
||||
(time_gpu, rate_gpu, std_gpu) = gpu_bench(func, cuInputs)
|
||||
(time_gpu, rate_gpu, std_gpu) = gpu_bench(kernel!, cuInputs)
|
||||
flops_gpu = (rate_gpu * NFLOPs) / 10^9
|
||||
else
|
||||
log("Skipping GPU...")
|
||||
@ -211,7 +218,8 @@ process = parse_process("ke->kke", QEDModel())
|
||||
gen_time = @elapsed graph = gen_graph(process)
|
||||
opt_time = @elapsed optimize!(optimizer, graph, 200)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "warmup", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
|
||||
|
||||
optimizer = ReductionOptimizer()
|
||||
|
||||
@ -220,104 +228,45 @@ process = parse_process("AB->ABBB", ABCModel())
|
||||
gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "warmup", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
|
||||
|
||||
## -- WARMUP END
|
||||
|
||||
optimizer = ReductionOptimizer()
|
||||
|
||||
# compton
|
||||
process = parse_process("ke->ke", QEDModel())
|
||||
gen_time = @elapsed graph = gen_graph(process)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
|
||||
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke", "ke->kkkkkke"]
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
for process_str in processes
|
||||
# compton
|
||||
process = parse_process(process_str, QEDModel())
|
||||
gen_time = @elapsed graph = gen_graph(process)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
|
||||
|
||||
# 2-photon compton
|
||||
process = parse_process("ke->kke", QEDModel())
|
||||
gen_time = @elapsed graph = gen_graph(process)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
|
||||
CSV.write(results_filename, df)
|
||||
end
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
processes = ["AB->AB", "AB->ABBB", "AB->ABBBBB"]
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
for process_str in processes
|
||||
# AB->AB
|
||||
process = parse_process(process_str, ABCModel())
|
||||
gen_time = @elapsed graph = parse_dag("input/$(process_str).txt", ABCModel())
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
|
||||
|
||||
# 3-photon compton
|
||||
process = parse_process("ke->kkke", QEDModel())
|
||||
gen_time = @elapsed graph = gen_graph(process)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
|
||||
# 4-photon compton
|
||||
process = parse_process("ke->kkkke", QEDModel())
|
||||
gen_time = @elapsed graph = gen_graph(process)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time, use_gpu = false)
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
|
||||
# 5-photon compton
|
||||
process = parse_process("ke->kkkkke", QEDModel())
|
||||
gen_time = @elapsed graph = gen_graph(process)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
|
||||
# AB->AB
|
||||
process = parse_process("AB->AB", ABCModel())
|
||||
gen_time = @elapsed graph = parse_dag("input/AB->AB.txt", ABCModel())
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
|
||||
# AB->AB^3
|
||||
process = parse_process("AB->ABBB", ABCModel())
|
||||
gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
|
||||
# AB->AB^5
|
||||
process = parse_process("AB->ABBBBB", ABCModel())
|
||||
gen_time = @elapsed graph = parse_dag("input/AB->ABBBBB.txt", ABCModel())
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
CSV.write(results_filename, df)
|
||||
end
|
||||
|
@ -37,6 +37,7 @@ function log(x...)
|
||||
end
|
||||
|
||||
function bench(func, inputs)
|
||||
# todo: use gpu kernel instead of broadcasting
|
||||
gpu_compile_time = @elapsed func.(inputs[1:2])
|
||||
|
||||
gpu_time = @benchmark $func.($inputs)
|
||||
|
Reference in New Issue
Block a user