experiments (#1)
All checks were successful
MetagraphOptimization_CI / docs (push) Successful in 10m41s
MetagraphOptimization_CI / test (push) Successful in 30m40s

Co-authored-by: Anton Reinhard <anton.reinhard@proton.me>
Reviewed-on: #1
2024-05-08 12:03:27 +02:00
parent 82ed774b7e
commit 87dbaf2c32
155 changed files with 5372 additions and 1029 deletions


@@ -5,5 +5,6 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
MetagraphOptimization = "3e869610-d48d-4942-ba70-c1b702a33ca4"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
QEDbase = "10e22c08-3ccb-4172-bfcf-7d7aa3d04d93"
+QEDprocesses = "46de9c38-1bb3-4547-a1ec-da24d767fdad"
StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
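
Note: QEDprocesses is the dependency added by this hunk, so the examples environment has to be re-instantiated before running the scripts below. A minimal sketch, assuming this Project.toml lives in the examples/ directory (the path is an assumption):

using Pkg
Pkg.activate("examples")  # activate wherever this Project.toml actually lives
Pkg.instantiate()         # fetches QEDprocesses and the other listed dependencies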

examples/full_node_bench.jl (new file, 249 lines)

@@ -0,0 +1,249 @@
using MetagraphOptimization
using CUDA
using UUIDs
using DataFrames
using CSV
using Random
using BenchmarkTools
using Dates
using Base.Threads
using Statistics # median and std used below come from the Statistics stdlib
function log(x...)
println(now(), " ", join(x, " ")...)
flush(stdout)
return nothing
end
results_filename = "full_node_bench.csv"
df = DataFrame(
process_name = String[],
cpu_threads = Int[],
gpu_devices = Int[],
n_inputs = Int[],
chunk_size = Int[],
time = Float64[],
std = Float64[],
rate = Float64[],
cpu_chunks = Float64[],
gpu_chunks = Float64[],
memory_est = Float64[],
)
# if they exist, read existing results and append new ones
if isfile(results_filename)
df = CSV.read(results_filename, DataFrame)
end
nInputs = 2^26
lck = ReentrantLock()
progress = 1
cpu_chunks = 0
gpu_chunks = 0
chunkSizes = [1024, 4096, 16384, 65536, 262144, 1048576] # 2^10 to 2^20
function cpu_worker(compute_func, inputs, chunk_size)
global progress
global cpu_chunks
global lck
quit = false
work_start = 0
work_end = 0
while true
lock(lck) do
if progress >= nInputs
quit = true
else
work_start = progress
progress = progress + chunk_size
work_end = min(progress - 1, nInputs)
cpu_chunks = cpu_chunks + 1
#log("CPU Worker $(Threads.threadid()) computing $(cpu_chunks)th cpu chunk ($work_start, $work_end)")
end
end
if quit
break
end
for i in work_start:work_end
compute_func(inputs[i])
end
end
#log("CPU Worker on $(Threads.threadid()) finished!")
return nothing
end
# gpu_worker must be called with its target CUDA device already selected (full_compute wraps the call in device!(dev))
function gpu_worker(kernel!, inputs, chunk_size)
global progress
global gpu_chunks
global lck
cuOutputs = CuVector{ComplexF64}()
resize!(cuOutputs, chunk_size)
quit = false
work_start = 0
work_end = 0
while true
lock(lck) do
if progress >= nInputs
quit = true
else
work_start = progress
progress = progress + chunk_size
work_end = min(progress - 1, nInputs)
gpu_chunks = gpu_chunks + 1
#log("GPU Worker $(CUDA.device()) computing $(gpu_chunks)th gpu chunk ($work_start, $work_end)")
end
end
if quit
break
end
cuInputs = CuVector(inputs[work_start:work_end])
ts = 32
bs = Int(chunk_size / 32)
@cuda threads = ts blocks = bs always_inline = true kernel!(cuInputs, cuOutputs, chunk_size)
CUDA.device_synchronize()
end
#log("GPU Worker on Device $(CUDA.device()) finished!")
return nothing
end
cpu_gpu_ratio = Vector{Tuple{Int, Int}}()
function full_compute(compute_func, kernel!, inputs, chunk_size)
global progress
progress = 1
global cpu_chunks
cpu_chunks = 0
global gpu_chunks
gpu_chunks = 0
tasks = Vector()
for dev in CUDA.devices()
t = Threads.@spawn device!(dev) do
gpu_worker(kernel!, inputs, chunk_size)
return nothing
end
push!(tasks, t)
end
for i in 1:(Threads.nthreads() - length(CUDA.devices()))
t = Threads.@spawn cpu_worker(compute_func, inputs, chunk_size)
push!(tasks, t)
end
for t in tasks
wait(t)
end
push!(cpu_gpu_ratio, (cpu_chunks, gpu_chunks))
return nothing
end
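
Note: cpu_worker, gpu_worker, and full_compute above implement dynamic chunk scheduling: a shared progress counter behind a ReentrantLock hands out index ranges to whichever worker asks next, so faster devices naturally claim more chunks. A minimal CPU-only sketch of the same idea (the demo_* names are illustrative, not part of this script):

using Base.Threads

function demo_dynamic_chunks(n::Int, chunk_size::Int)
    lck = ReentrantLock()
    progress = Ref(1)                      # next unclaimed input index, shared by all workers
    partial = zeros(Float64, nthreads())   # one accumulator slot per task, so no races
    @sync for t in 1:nthreads()
        Threads.@spawn while true
            local work_start, work_end
            lock(lck) do                   # claim the next chunk atomically
                work_start = progress[]
                progress[] += chunk_size
                work_end = min(progress[] - 1, n)
            end
            work_start > n && break
            for i in work_start:work_end
                partial[t] += sin(i)       # stand-in for compute_func(inputs[i])
            end
        end
    end
    return sum(partial)
end

demo_dynamic_chunks(10_000, 256)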
function bench(compute_function, kernel!, inputs, chunk_size)
global cpu_gpu_ratio
empty!(cpu_gpu_ratio)
bench = @benchmark begin
full_compute($compute_function, $kernel!, $inputs, $chunk_size)
end gcsample = true seconds = 60
time = median(bench.times) / 1e9
s = std(bench.times) / 1e9
rate = length(inputs) / time
med_cpu_chunks = median(getindex.(cpu_gpu_ratio, 1))
med_gpu_chunks = median(getindex.(cpu_gpu_ratio, 2))
mem_estimate = bench.memory
log("CPU/GPU ratios: $(cpu_gpu_ratio)")
return (time, rate, s, med_cpu_chunks, med_gpu_chunks, mem_estimate)
end
function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, kernel!, chunk_size, inputs)
process_name = string(process)
log("\n--- Benchmarking $(process_name) on $(nInputs) inputs with chunk size $(chunk_size) ---")
log("Available CUDA devices:")
display.(CUDA.devices())
log("Benchmarking full node...")
(time, rate, s, med_cpu_chunks, med_gpu_chunks, mem_estimate) = bench(func, kernel!, inputs, chunk_size)
log(
"Benchmarking complete with median time $(time), $(med_cpu_chunks) cpu chunks, and $(med_gpu_chunks) gpu chunks.",
)
push!(
df,
Dict(
:process_name => process_name,
:cpu_threads => Threads.nthreads() - length(CUDA.devices()),
:gpu_devices => length(CUDA.devices()),
:n_inputs => nInputs,
:chunk_size => chunk_size,
:time => time,
:std => s,
:rate => rate,
:cpu_chunks => med_cpu_chunks,
:gpu_chunks => med_gpu_chunks,
:memory_est => mem_estimate,
),
)
return nothing
end
# use "mock" machine that only uses cpu for compilation
machine = Machine(
[
MetagraphOptimization.NumaNode(
0,
1,
MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
-1.0,
UUIDs.uuid1(),
),
],
[-1.0;;],
)
optimizer = ReductionOptimizer()
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]
for proc in processes
process = parse_process(proc, QEDModel())
graph = gen_graph(process)
optimize_to_fixpoint!(optimizer, graph)
compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
inputs = Vector{typeof(gen_process_input(process))}()
resize!(inputs, nInputs)
procs = Vector{typeof(process)}()
for i in 1:Threads.nthreads()
push!(procs, copy(process))
end
@inbounds Threads.@threads for i in eachindex(inputs)
inputs[i] = gen_process_input(procs[Threads.threadid()]) # each thread uses its own process copy
end
for chunk_size in chunkSizes
full_node_bench(process, compute_func, kernel!, chunk_size, inputs)
CSV.write(results_filename, df)
end
end;


@@ -34,9 +34,10 @@ function import_bench()
bench_txt("AB->ABBB.txt")
bench_txt("AB->ABBBBB.txt")
bench_txt("AB->ABBBBBBB.txt")
-#bench_txt("AB->ABBBBBBBBB.txt")
+bench_txt("AB->ABBBBBBBBB.txt")
bench_txt("ABAB->ABAB.txt")
-return bench_txt("ABAB->ABC.txt")
+bench_txt("ABAB->ABC.txt")
+return nothing
end
import_bench()


@@ -2,44 +2,117 @@ using MetagraphOptimization
using LIKWID
using CUDA
using UUIDs
+using DataFrames
+using CSV
+using Random
+using BenchmarkTools
+using Dates
-function cpu_bench(compute_function, inputs)
-compute_function.(inputs[begin:10]) # make sure it's compiled
-time = @elapsed Threads.@threads for i in eachindex(inputs)
-@invokelatest compute_function(inputs[i])
-end
-rate = length(inputs) / time
-return (time, rate)
-end
-function gpu_bench(compute_function, inputs)
-CUDA.@sync compute_function.(inputs[begin:10]) # make sure it's compiled
-time = @elapsed CUDA.@sync compute_function.(inputs)
-rate = length(inputs) / time
-return (time, rate)
-end
+DISABLE_GPU = false
+function log(x...)
+println(now(), " ", join(x, " ")...)
+return flush(stdout)
+end
+results_filename = "bench_results_$(Threads.nthreads()).csv"
+df = DataFrame(
+process_name = String[],
+graph_gen_time = Float64[],
+optimization_time = Float64[],
+function_generation_time = Float64[],
+graph_nodes = Int[],
+graph_edges = Int[],
+graph_mem = Float64[],
+cpu_threads = Int[],
+n_inputs = Int[],
+nflops_likwid = Int[],
+cpu_time = Float64[],
+cpu_std = Float64[],
+cpu_rate = Float64[],
+cpu_gflops = Float64[],
+gpu_name = String[],
+gpu_time = Float64[],
+gpu_std = Float64[],
+gpu_rate = Float64[],
+gpu_gflops = Float64[],
+)
+# if they exist, read existing results and append new ones
+if isfile(results_filename)
+df = CSV.read(results_filename, DataFrame)
+end
+nInputs = 2^20
+function cpu_bench(compute_function, inputs)
+bench = @benchmark begin
+@inbounds Threads.@threads for i in eachindex($inputs)
+@invokelatest $compute_function($inputs[i])
+end
+end gcsample = true samples = 20 evals = 1
+time = median(bench.times) / 1e9
+s = std(bench.times) / 1e9
+rate = length(inputs) / time
+return (time, rate, s)
+end
+function gpu_bench(kernel!, inputs)
+n = length(inputs)
+outputs = CuVector{ComplexF64}()
+resize!(outputs, n)
+ts = 32
+bs = Int(n / ts)
+bench = @benchmark begin
+@cuda threads = $ts blocks = $bs always_inline = true $kernel!($inputs, $outputs, $n)
+CUDA.device_synchronize()
+end gcsample = true samples = 20 evals = 1
+time = median(bench.times) / 1e9
+s = std(bench.times) / 1e9
+rate = length(inputs) / time
+return (time, rate, s)
+end
function bench_process(
process::MetagraphOptimization.AbstractProcessDescription,
+process_name::String,
+graph::DAG,
func,
-io::IO = stdout;
-use_likwid = true,
+kernel!,
+gen_time::Float64,
+opt_time::Float64,
+func_time::Float64;
+use_likwid = false,
+use_gpu = true,
)
-println(io, "\n--- Benchmarking $(process) ---")
+log("\n--- Benchmarking $(process_name) ---")
+if DISABLE_GPU
+use_gpu = false
+end
-NFLOPs = GraphProperties(graph).computeEffort
+graph_props = GraphProperties(graph)
+NFLOPs = graph_props.computeEffort
+nflops_likwid = 0
if use_likwid
input = gen_process_input(process)
func(input) # compile first
# get rid of annoying output to console
oldstd = stdout
redirect_stdout(devnull)
_, events = @perfmon "FLOPS_DP" func(input)
redirect_stdout(oldstd) # recover original stdout
NFLOPs = first(events["FLOPS_DP"])["RETIRED_SSE_AVX_FLOPS_ALL"]
+nflops_likwid = NFLOPs
end
-nInputs = 10000000 # ten million
-println(io, "Generating $nInputs inputs with $(Threads.nthreads()) threads...")
+log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
inputs = Vector{typeof(gen_process_input(process))}()
resize!(inputs, nInputs)
@@ -48,35 +121,76 @@ function bench_process(
push!(processes, copy(process))
end
-Threads.@threads for i in eachindex(inputs)
+@inbounds Threads.@threads for i in eachindex(inputs)
inputs[i] = gen_process_input(processes[Threads.threadid()])
end
-println(io, "Benchmarking CPU with $(Threads.nthreads()) threads...")
-(time_cpu, rate_cpu) = cpu_bench(func, inputs)
-flops_cpu = (rate_cpu * NFLOPs) / 1024^3
+log("Benchmarking CPU with $(Threads.nthreads()) threads...")
+(time_cpu, rate_cpu, std_cpu) = cpu_bench(func, inputs)
+flops_cpu = (rate_cpu * NFLOPs) / 10^9
-println(io, "Benchmarking GPU...")
-cuInputs = CuArray(inputs)
-(time_gpu, rate_gpu) = gpu_bench(func, cuInputs)
-flops_gpu = (rate_gpu * NFLOPs) / 1024^3
+time_gpu = 0.0
+std_gpu = 0.0
+rate_gpu = 0.0
+flops_gpu = 0.0
+gpu_name = "none"
+if use_gpu
+log("Benchmarking GPU...")
+gpu_name = "$(name(first(CUDA.devices())))"
+cuInputs = CuArray(inputs)
+(time_gpu, rate_gpu, std_gpu) = gpu_bench(kernel!, cuInputs)
+flops_gpu = (rate_gpu * NFLOPs) / 10^9
+else
+log("Skipping GPU...")
+end
-println(io, "\nBenchmark Summary for $(process):")
+log("\nBenchmark Summary for $(process):")
if use_likwid
-println(io, "Measured FLOPS by LIKWID: $NFLOPs")
+log("Measured FLOPS by LIKWID: $NFLOPs")
else
-println(io, "Total graph compute effort: $NFLOPs")
+log("Total graph compute effort: $NFLOPs")
end
-println(io, "Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
-println(io, "CPU, $(Threads.nthreads()) threads")
-println(io, " Time: $time_cpu")
-println(io, " Rate: $rate_cpu")
-println(io, " GFLOPS: $flops_cpu")
-println(io, "GPU, $(name(first(CUDA.devices())))")
-println(io, " Time: $time_gpu")
-println(io, " Rate: $rate_gpu")
-return println(io, " GFLOPS: $flops_gpu")
+log("Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
+log("CPU, $(Threads.nthreads()) threads")
+log(" Time: $time_cpu")
+log(" Rate: $rate_cpu")
+log(" GFLOPS: $flops_cpu")
+if use_gpu
+log("GPU, $gpu_name")
+log(" Time: $time_gpu")
+log(" Rate: $rate_gpu")
+log(" GFLOPS: $flops_gpu")
+end
+if (process_name != "warmup")
+push!(
+df,
+Dict(
+:process_name => process_name,
+:graph_gen_time => gen_time,
+:optimization_time => opt_time,
+:function_generation_time => func_time,
+:graph_nodes => graph_props.noNodes,
+:graph_edges => graph_props.noEdges,
+:graph_mem => MetagraphOptimization.mem(graph),
+:cpu_threads => Threads.nthreads(),
+:n_inputs => nInputs,
+:nflops_likwid => nflops_likwid,
+:cpu_time => time_cpu,
+:cpu_std => std_cpu,
+:cpu_rate => rate_cpu,
+:cpu_gflops => flops_cpu,
+:gpu_name => gpu_name,
+:gpu_time => time_gpu,
+:gpu_std => std_gpu,
+:gpu_rate => rate_gpu,
+:gpu_gflops => flops_gpu,
+),
+)
+end
+return nothing
end
# use "mock" machine that only uses cpu
@@ -92,57 +206,67 @@ machine = Machine(
],
[-1.0;;],
)
-optimizer = ReductionOptimizer()
+# sadly, these cannot be moved into functions: the world age must increase after a generated function is created, and that only happens in global scope
-# compton
-process = parse_process("ke->ke", QEDModel())
-graph = gen_graph(process)
-optimize_to_fixpoint!(optimizer, graph)
-compute_func = get_compute_function(graph, process, machine)
-bench_process(process, compute_func)
+## -- WARMUP TO COMPILE FUNCTIONS first
+#=
+optimizer = RandomWalkOptimizer(MersenneTwister(0))
# 2-photon compton
process = parse_process("ke->kke", QEDModel())
-graph = gen_graph(process)
-optimize_to_fixpoint!(optimizer, graph)
-compute_func = get_compute_function(graph, process, machine)
-bench_process(process, compute_func)
+gen_time = @elapsed graph = gen_graph(process)
+opt_time = @elapsed optimize!(optimizer, graph, 200)
+func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+kernel! = get_cuda_kernel(graph, process, machine)
+bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
-# 3-photon compton
-process = parse_process("ke->kkke", QEDModel())
-graph = gen_graph(process)
-optimize_to_fixpoint!(optimizer, graph)
-compute_func = get_compute_function(graph, process, machine)
-bench_process(process, compute_func)
-# AB->AB
-process = parse_process("AB->AB", ABCModel())
-graph = parse_dag("input/AB->AB.txt", ABCModel())
-optimize_to_fixpoint!(optimizer, graph)
-compute_func = get_compute_function(graph, process, machine)
-bench_process(process, compute_func)
+optimizer = ReductionOptimizer()
# AB->AB^3
process = parse_process("AB->ABBB", ABCModel())
-graph = parse_dag("input/AB->ABBB.txt", ABCModel())
-optimize_to_fixpoint!(optimizer, graph)
-compute_func = get_compute_function(graph, process, machine)
-bench_process(process, compute_func)
+gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
+opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
+func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+kernel! = get_cuda_kernel(graph, process, machine)
+bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
+=#
+## -- WARMUP END
+exit(0)
+optimizer = ReductionOptimizer()
-# 4-photon compton
-process = parse_process("ke->kkkke", QEDModel())
-graph = gen_graph(process)
-optimize_to_fixpoint!(optimizer, graph)
-compute_func = get_compute_function(graph, process, machine)
-bench_process(process, compute_func)
+processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]
-# AB->AB^5
-process = parse_process("AB->ABBBBB", ABCModel())
-graph = parse_dag("input/AB->ABBBBB.txt", ABCModel())
-optimize_to_fixpoint!(optimizer, graph)
-compute_func = get_compute_function(graph, process, machine)
-bench_process(process, compute_func)
+for process_str in processes
+# compton
+process = parse_process(process_str, QEDModel())
+gen_time = @elapsed graph = gen_graph(process)
+func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+kernel! = get_cuda_kernel(graph, process, machine)
+bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
+opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
+func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+kernel! = get_cuda_kernel(graph, process, machine)
+bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
+CSV.write(results_filename, df)
+end
+processes = ["AB->AB", "AB->ABBB", "AB->ABBBBB", "AB->ABBBBBBB"]
+for process_str in processes
+# AB->AB
+process = parse_process(process_str, ABCModel())
+gen_time = @elapsed graph = parse_dag("input/$(process_str).txt", ABCModel())
+func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+kernel! = get_cuda_kernel(graph, process, machine)
+bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
+opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
+func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+kernel! = get_cuda_kernel(graph, process, machine)
+bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
+CSV.write(results_filename, df)
+end
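
Note on the world-age comment above: a function generated at runtime (as get_compute_function does) is only callable from code compiled in a newer world, which is why this driver lives in global scope. A minimal sketch of the effect and the usual escape hatch, Base.invokelatest (generic names, not this package's API):

function call_generated()
    f = eval(:(x -> x + 1))          # stand-in for a freshly generated compute function
    # f(1) here would raise a MethodError: call_generated was compiled in an older world
    return Base.invokelatest(f, 1)   # dispatches in the latest world instead
end

call_generated()  # == 2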


@@ -0,0 +1,163 @@
using MetagraphOptimization
using CUDA
using UUIDs
using BenchmarkTools
using DataFrames
using CSV
using Statistics # mean used in bench() comes from the Statistics stdlib
results_filename = "bench_results_reduction_steps.csv"
df = DataFrame(
threads = Int[],
process = String[],
operations = Int[],
cumulative_optimization_time = Float64[],
graph_nodes = Int[],
graph_edges = Int[],
graph_ce = Float64[],
graph_dt = Float64[],
graph_ci = Float64[],
gen_func_t = Float64[],
cpu_compile_t = Float64[],
cpu_st_t = Float64[],
cpu_mt_t = Float64[],
gpu_compile_t = Float64[],
gpu_t = Float64[],
)
# if they exist, read existing results and append new ones
if isfile(results_filename)
df = CSV.read(results_filename, DataFrame)
end
function bench(func, inputs)
compile_time = @elapsed func(inputs[1])
single_thread = @benchmark $func.($inputs)
multi_threaded = @benchmark Threads.@threads for i in eachindex($inputs)
$func($inputs[i])
end
return (
cpu_compile_time = compile_time,
gpu_compile_time = 0.0,
cpu_single_thread_time = mean(single_thread.times) / 1e9,
cpu_multi_thread_time = mean(multi_threaded.times) / 1e9,
gpu_time = 0.0,
)
end
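
Note: the $ interpolation in the @benchmark calls above matters; without it, func and inputs would be read as non-constant globals on every sample and the dynamic-dispatch overhead would pollute the timings. An illustrative comparison, following BenchmarkTools' documented behavior:

using BenchmarkTools

xs = rand(1_000)
@benchmark sum(xs)    # times include the non-constant global lookup
@benchmark sum($xs)   # value is interpolated; only sum() itself is measured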
# preparation of machine
machine = Machine(
[
MetagraphOptimization.NumaNode(
0,
1,
MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
-1.0,
UUIDs.uuid1(),
),
],
[-1.0;;],
)
# bench and produce data
n_inputs = 50_000
optimizer = ReductionOptimizer()
processes = [("ke->kke", 5), ("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1), ("ke->kkkke", 1), ("ke->kkkkke", 1)]
for (process_str, STEPSIZE) in processes
n = 0
opt_time_cum = 0
process = parse_process(process_str, QEDModel())
graph = gen_graph(process)
inputs = [gen_process_input(process) for _ in 1:n_inputs]
get_compute_function(graph, process, machine)
while true
func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
res = bench(func, inputs)
graph_properties = get_properties(graph)
push!(
df,
(
Threads.nthreads(),
process_str,
n,
opt_time_cum,
graph_properties.noNodes,
graph_properties.noEdges,
graph_properties.computeEffort,
graph_properties.data,
graph_properties.computeIntensity,
func_gen_time,
res.cpu_compile_time,
res.cpu_single_thread_time,
res.cpu_multi_thread_time,
res.gpu_compile_time,
res.gpu_time,
),
)
CSV.write(results_filename, df)
if fixpoint_reached(optimizer, graph)
break
end
opt_time_cum += @elapsed optimize!(optimizer, graph, STEPSIZE)
n += STEPSIZE
end
end
CSV.write(results_filename, df)
for (process_str, STEPSIZE) in [("AB->AB", 1), ("AB->ABBB", 1), ("AB->ABBBBB", 1)]
n = 0
opt_time_cum = 0
process = parse_process(process_str, ABCModel())
graph = parse_dag("input/$process_str.txt", ABCModel())
inputs = [gen_process_input(process) for _ in 1:n_inputs]
get_compute_function(graph, process, machine)
while true
func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
res = bench(func, inputs)
graph_properties = get_properties(graph)
push!(
df,
(
Threads.nthreads(),
process_str,
n,
opt_time_cum,
graph_properties.noNodes,
graph_properties.noEdges,
graph_properties.computeEffort,
graph_properties.data,
graph_properties.computeIntensity,
func_gen_time,
res.cpu_compile_time,
res.cpu_single_thread_time,
res.cpu_multi_thread_time,
res.gpu_compile_time,
res.gpu_time,
),
)
CSV.write(results_filename, df)
if fixpoint_reached(optimizer, graph)
break
end
opt_time_cum += @elapsed optimize!(optimizer, graph, STEPSIZE)
n += STEPSIZE
end
end
CSV.write(results_filename, df)


@@ -0,0 +1,208 @@
using MetagraphOptimization
using CUDA
using UUIDs
using BenchmarkTools
using DataFrames
using CSV
using Dates
using Statistics # median and std used in bench() come from the Statistics stdlib
results_filename = "bench_results_reduction_steps_gpu.csv"
df = DataFrame(
threads = Int[],
process = String[],
operations = Int[],
cumulative_optimization_time = Float64[],
graph_nodes = Int[],
graph_edges = Int[],
graph_ce = Float64[],
graph_dt = Float64[],
graph_ci = Float64[],
cpu_st_t = Float64[],
cpu_st_s = Float64[],
cpu_mt_t = Float64[],
cpu_mt_s = Float64[],
cpu_mem = Float64[],
gpu_t = Float64[],
gpu_s = Float64[],
gpu_mem = Float64[],
)
# if they exist, read existing results and append new ones
if isfile(results_filename)
df = CSV.read(results_filename, DataFrame)
end
function log(x...)
println(now(), " ", join(x, " ")...)
return flush(stdout)
end
function bench(func, kernel!, inputs)
# gpu part
n = length(inputs)
cu_inputs = CuVector(inputs)
cu_outputs = CuVector{ComplexF64}()
resize!(cu_outputs, n)
ts = 32
bs = Int(n / ts)
bench = @benchmark begin
@cuda threads = $ts blocks = $bs always_inline = true $kernel!($cu_inputs, $cu_outputs, $n)
CUDA.device_synchronize()
end gcsample = true samples = 20 evals = 1
gpu_time = median(bench.times) / 1e9
gpu_std = std(bench.times) / 1e9
gpu_mem = bench.memory
# cpu part
single_thread = @benchmark $func.($inputs)
multi_threaded = @benchmark Threads.@threads for i in eachindex($inputs)
$func($inputs[i])
end
cpu_st_time = median(single_thread.times) / 1e9
cpu_st_std = std(single_thread.times) / 1e9
cpu_mt_time = median(multi_threaded.times) / 1e9
cpu_mt_std = std(multi_threaded.times) / 1e9
cpu_mem = single_thread.memory # memory estimate from the single-threaded run, not a time statistic
return (
cpu_single_thread_time = cpu_st_time,
cpu_single_thread_std = cpu_st_std,
cpu_multi_thread_time = cpu_mt_time,
cpu_multi_thread_std = cpu_mt_std,
cpu_mem = cpu_mem,
gpu_time = gpu_time,
gpu_std = gpu_std,
gpu_mem = gpu_mem,
)
end
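
Note: the launch configuration above (ts = 32, bs = Int(n / ts)) assumes n is an exact multiple of 32, which holds for the power-of-two n_inputs used here. For arbitrary n, the usual CUDA.jl pattern rounds the block count up and bounds-checks inside the kernel; a sketch with a stand-in kernel body (guarded_kernel! is illustrative, not the generated kernel):

using CUDA

function guarded_kernel!(inputs, outputs, n)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= n                       # guard the tail threads of the last block
        @inbounds outputs[i] = 2 * inputs[i]
    end
    return nothing
end

# launch with a rounded-up grid: @cuda threads = 32 blocks = cld(n, 32) guarded_kernel!(...)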
log("Available CUDA devices:")
for dev in CUDA.devices()
display(dev)
end
# preparation of machine
machine = Machine(
[
MetagraphOptimization.NumaNode(
0,
1,
MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
-1.0,
UUIDs.uuid1(),
),
],
[-1.0;;],
)
# bench and produce data
n_inputs = 2^16
optimizer = ReductionOptimizer()
processes = [("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1), ("ke->kkkke", 5)]
for (process_str, STEPSIZE) in processes
n = 0
opt_time_cum = 0
process = parse_process(process_str, QEDModel())
graph = gen_graph(process)
inputs = Vector([gen_process_input(process) for _ in 1:n_inputs])
get_compute_function(graph, process, machine)
while true
func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
res = bench(func, kernel!, inputs)
graph_properties = get_properties(graph)
push!(
df,
(
Threads.nthreads(),
process_str,
n,
opt_time_cum,
graph_properties.noNodes,
graph_properties.noEdges,
graph_properties.computeEffort,
graph_properties.data,
graph_properties.computeIntensity,
res.cpu_single_thread_time,
res.cpu_single_thread_std,
res.cpu_multi_thread_time,
res.cpu_multi_thread_std,
res.cpu_mem,
res.gpu_time,
res.gpu_std,
res.gpu_mem,
),
)
CSV.write(results_filename, df)
if fixpoint_reached(optimizer, graph)
break
end
opt_time_cum += @elapsed optimize!(optimizer, graph, STEPSIZE)
n += STEPSIZE
end
end
CSV.write(results_filename, df)
for (process_str, STEPSIZE) in [("AB->AB", 1), ("AB->ABBB", 1), ("AB->ABBBBB", 1)]
n = 0
opt_time_cum = 0
process = parse_process(process_str, ABCModel())
graph = parse_dag("input/$process_str.txt", ABCModel())
inputs = Vector([gen_process_input(process) for _ in 1:n_inputs])
get_compute_function(graph, process, machine)
while true
func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
res = bench(func, kernel!, inputs)
graph_properties = get_properties(graph)
push!(
df,
(
Threads.nthreads(),
process_str,
n,
opt_time_cum,
graph_properties.noNodes,
graph_properties.noEdges,
graph_properties.computeEffort,
graph_properties.data,
graph_properties.computeIntensity,
res.cpu_single_thread_time,
res.cpu_single_thread_std,
res.cpu_multi_thread_time,
res.cpu_multi_thread_std,
res.cpu_mem,
res.gpu_time,
res.gpu_std,
res.gpu_mem,
),
)
CSV.write(results_filename, df)
if fixpoint_reached(optimizer, graph)
break
end
opt_time_cum += @elapsed optimize!(optimizer, graph, STEPSIZE)
n += STEPSIZE
end
end
CSV.write(results_filename, df)

examples/qed_bench_tape.jl (new file, 232 lines)

@@ -0,0 +1,232 @@
using MetagraphOptimization
using LIKWID
using UUIDs
using DataFrames
using CSV
using Random
using BenchmarkTools
using Dates
using Statistics # mean and std used in cpu_bench come from the Statistics stdlib
function log(x...)
println(now(), " ", join(x, " ")...)
return flush(stdout)
end
results_filename = "bench_results_tape_$(Threads.nthreads()).csv"
df = DataFrame(
process_name = String[],
graph_gen_time = Float64[],
optimization_time = Float64[],
function_generation_time = Float64[],
graph_nodes = Int[],
graph_edges = Int[],
graph_mem = Float64[],
cpu_threads = Int[],
n_inputs = Int[],
nflops_likwid = Int[],
cpu_time = Float64[],
cpu_rate = Float64[],
cpu_gflops = Float64[],
cpu_std = Float64[],
gpu_name = String[],
gpu_time = Float64[],
gpu_std = Float64[],
gpu_rate = Float64[],
gpu_gflops = Float64[],
)
# if they exist, read existing results and append new ones
if isfile(results_filename)
df = CSV.read(results_filename, DataFrame)
end
nInputs = 1_000_000
# use "mock" machine that only uses cpu
machine = Machine(
[
MetagraphOptimization.NumaNode(
0,
1,
MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
-1.0,
UUIDs.uuid1(),
),
],
[-1.0;;],
)
function cpu_bench(tape, inputs)
bench = @benchmark begin
@inbounds Threads.@threads for i in eachindex($inputs)
execute_tape($tape, $inputs[i])
end
end gcsample = true seconds = 300
time = mean(bench.times) / 1e9
s = std(bench.times) / 1e9
rate = length(inputs) / time
return (time, rate, s)
end
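
Note: unlike qed_bench.jl above, this script benchmarks the tape path: gen_tape lowers the DAG into a precomputed instruction schedule once, and execute_tape then interprets that schedule per input instead of compiling one large generated function. Minimal usage in this script's own terms (graph, process, and machine as constructed elsewhere in the file):

tape = gen_tape(graph, process, machine)                  # build the schedule once
result = execute_tape(tape, gen_process_input(process))   # interpret it per input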
function bench_process(
process::MetagraphOptimization.AbstractProcessDescription,
process_name::String,
graph::DAG,
gen_time::Float64,
opt_time::Float64,
io::IO = stdout;
use_likwid = false,
)
log("\n--- Benchmarking $(process_name) ---")
func_time = @elapsed tape = gen_tape(graph, process, machine)
graph_props = GraphProperties(graph)
NFLOPs = graph_props.computeEffort
nflops_likwid = 0
if use_likwid
input = gen_process_input(process)
# get rid of annoying output to console
oldstd = stdout
redirect_stdout(devnull)
_, events = @perfmon "FLOPS_DP" execute_tape(tape, input)
redirect_stdout(oldstd) # recover original stdout
NFLOPs = first(events["FLOPS_DP"])["RETIRED_SSE_AVX_FLOPS_ALL"]
nflops_likwid = NFLOPs
end
log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
inputs = Vector{typeof(gen_process_input(process))}()
resize!(inputs, nInputs)
processes = Vector{typeof(process)}()
for i in 1:Threads.nthreads()
push!(processes, copy(process))
end
@inbounds Threads.@threads for i in eachindex(inputs)
inputs[i] = gen_process_input(processes[Threads.threadid()]) # each thread uses its own process copy
end
log("Benchmarking CPU with $(Threads.nthreads()) threads...")
(time_cpu, rate_cpu, std_cpu) = cpu_bench(tape, inputs)
flops_cpu = (rate_cpu * NFLOPs) / 10^9
log("\nBenchmark Summary for $(process):")
if use_likwid
log("Measured FLOPS by LIKWID: $NFLOPs")
else
log("Total graph compute effort: $NFLOPs")
end
log("Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
log("CPU, $(Threads.nthreads()) threads")
log(" Time: $time_cpu")
log(" Rate: $rate_cpu")
log(" GFLOPS: $flops_cpu")
if (process_name != "warmup")
push!(
df,
Dict(
:process_name => process_name,
:graph_gen_time => gen_time,
:optimization_time => opt_time,
:function_generation_time => func_time,
:graph_nodes => graph_props.noNodes,
:graph_edges => graph_props.noEdges,
:graph_mem => MetagraphOptimization.mem(graph),
:cpu_threads => Threads.nthreads(),
:n_inputs => nInputs,
:nflops_likwid => nflops_likwid,
:cpu_time => time_cpu,
:cpu_std => std_cpu,
:cpu_rate => rate_cpu,
:cpu_gflops => flops_cpu,
:gpu_name => "none",
:gpu_time => 0.0,
:gpu_std => 0.0,
:gpu_rate => 0.0,
:gpu_gflops => 0.0,
),
)
end
return nothing
end
function bench_qed(process_string::String, skip_unoptimized = false)
optimizer = ReductionOptimizer()
process = parse_process(process_string, QEDModel())
gen_time = @elapsed graph = gen_graph(process)
opt_time = 0.0
if !skip_unoptimized
bench_process(process, "$process not optimized tape", graph, gen_time, opt_time)
end
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
bench_process(process, "$process reduced tape", graph, gen_time, opt_time)
return nothing
end
function bench_abc(process_string::String)
optimizer = ReductionOptimizer()
process = parse_process(process_string, ABCModel())
gen_time = @elapsed graph = parse_dag("input/$process_string.txt", ABCModel())
bench_process(process, "$process not optimized tape", graph, gen_time, 0.0)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
bench_process(process, "$process reduced tape", graph, gen_time, opt_time)
return nothing
end
# sadly, these cannot be moved into functions: the world age must increase after a generated function is created, and that only happens in global scope
## -- WARMUP TO COMPILE FUNCTIONS first
optimizer = ReductionOptimizer()
process = parse_process("ke->kke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
bench_process(process, "warmup", graph, gen_time, opt_time)
# AB->AB^3
process = parse_process("AB->ABBB", ABCModel())
gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
bench_process(process, "warmup", graph, gen_time, opt_time)
## -- WARMUP END
# compton
bench_qed("ke->ke")
CSV.write(results_filename, df)
bench_qed("ke->kke")
CSV.write(results_filename, df)
bench_qed("ke->kkke")
CSV.write(results_filename, df)
bench_qed("ke->kkkke")
CSV.write(results_filename, df)
bench_qed("ke->kkkkke")
CSV.write(results_filename, df)
bench_qed("ke->kkkkkke")
CSV.write(results_filename, df)
bench_qed("ke->kkkkkkke")
CSV.write(results_filename, df)
bench_abc("AB->AB")
CSV.write(results_filename, df)
bench_abc("AB->ABBB")
CSV.write(results_filename, df)
bench_abc("AB->ABBBBB")
CSV.write(results_filename, df)

examples/qed_gen_bench.jl (new file, 144 lines)

@@ -0,0 +1,144 @@
using MetagraphOptimization
using DataFrames
using CSV
using BenchmarkTools
using StatsBase
results_filename = "qed_gen_results_$(Threads.nthreads()).csv"
df = DataFrame(
process_name = String[],
cpu_threads = Int[],
graph_gen_samples = Int[],
graph_gen_mean = Float64[],
graph_gen_std = Float64[],
graph_gen_median = Float64[],
graph_nodes = Int[],
graph_data_nodes = Int[],
graph_u_nodes = Int[],
graph_v_nodes = Int[],
graph_s1_nodes = Int[],
graph_s2_nodes = Int[],
graph_edges = Int[],
graph_nodes_reduced = Int[],
graph_data_nodes_reduced = Int[],
graph_u_nodes_reduced = Int[],
graph_v_nodes_reduced = Int[],
graph_s1_nodes_reduced = Int[],
graph_s2_nodes_reduced = Int[],
graph_edges_reduced = Int[],
graph_mem = Float64[],
graph_mem_reduced = Float64[],
graph_elapsed_reduce = Float64[],
)
function bench_process(process::AbstractString; warmup = false, optimize = true)
println("Benchmarking $process...")
model = QEDModel()
proc = parse_process(process, model)
gen_bench = @benchmark gen_graph($proc) gcsample = true seconds = 5
graph = gen_graph(proc)
props = GraphProperties(graph)
node_dict = countmap(typeof.(graph.nodes))
graph_size = Base.summarysize(graph)
reduce_elapsed = -1.0
node_dict_reduced = Dict()
graph_size_reduced = -1.0
props_reduced = GraphProperties()
if optimize
reduce_elapsed = @elapsed optimize_to_fixpoint!(ReductionOptimizer(), graph)
props_reduced = GraphProperties(graph)
node_dict_reduced = countmap(typeof.(graph.nodes))
graph_size_reduced = Base.summarysize(graph)
end
if warmup
return nothing
end
push!(
df,
Dict(
:process_name => process,
:cpu_threads => Threads.nthreads(),
:graph_gen_samples => length(gen_bench.times),
:graph_gen_mean => mean(gen_bench.times),
:graph_gen_std => std(gen_bench.times),
:graph_gen_median => median(gen_bench.times),
:graph_nodes => props.noNodes,
:graph_data_nodes => get(node_dict, DataTaskNode{DataTask}, 0),
:graph_u_nodes => get(node_dict, ComputeTaskNode{ComputeTaskQED_U}, 0),
:graph_v_nodes => get(node_dict, ComputeTaskNode{ComputeTaskQED_V}, 0),
:graph_s1_nodes => get(node_dict, ComputeTaskNode{ComputeTaskQED_S1}, 0),
:graph_s2_nodes => get(node_dict, ComputeTaskNode{ComputeTaskQED_S2}, 0),
:graph_edges => props.noEdges,
:graph_nodes_reduced => props_reduced.noNodes,
:graph_data_nodes_reduced => get(node_dict_reduced, DataTaskNode{DataTask}, 0),
:graph_u_nodes_reduced => get(node_dict_reduced, ComputeTaskNode{ComputeTaskQED_U}, 0),
:graph_v_nodes_reduced => get(node_dict_reduced, ComputeTaskNode{ComputeTaskQED_V}, 0),
:graph_s1_nodes_reduced => get(node_dict_reduced, ComputeTaskNode{ComputeTaskQED_S1}, 0),
:graph_s2_nodes_reduced => get(node_dict_reduced, ComputeTaskNode{ComputeTaskQED_S2}, 0),
:graph_edges_reduced => props_reduced.noEdges,
:graph_mem => graph_size,
:graph_mem_reduced => graph_size_reduced,
:graph_elapsed_reduce => reduce_elapsed,
),
)
return nothing
end
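
Note: node_dict above tallies the graph's nodes by concrete type using countmap from StatsBase; a quick illustration of what it returns:

using StatsBase

countmap(typeof.(Any[1, 2.0, "x"]))
# Dict{DataType, Int64} with 3 entries: String => 1, Float64 => 1, Int64 => 1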
processes = [
("ke->ke", true),
("ke->kke", true),
("ke->kkke", true),
("ke->kkkke", true),
("ke->kkkkke", true),
("ke->kkkkkke", true),
("ke->kkkkkkke", true),
#("ke->kkkkkkkke", false),
#("ke->kkkkkkkkke", false),
]
df = DataFrame(
process_name = String[],
cpu_threads = Int[],
graph_gen_samples = Int[],
graph_gen_mean = Float64[],
graph_gen_std = Float64[],
graph_gen_median = Float64[],
graph_nodes = Int[],
graph_data_nodes = Int[],
graph_u_nodes = Int[],
graph_v_nodes = Int[],
graph_s1_nodes = Int[],
graph_s2_nodes = Int[],
graph_edges = Int[],
graph_nodes_reduced = Int[],
graph_data_nodes_reduced = Int[],
graph_u_nodes_reduced = Int[],
graph_v_nodes_reduced = Int[],
graph_s1_nodes_reduced = Int[],
graph_s2_nodes_reduced = Int[],
graph_edges_reduced = Int[],
graph_mem = Float64[],
graph_mem_reduced = Float64[],
graph_elapsed_reduce = Float64[],
)
# if they exist, read existing results and append new ones
if isfile(results_filename)
df = CSV.read(results_filename, DataFrame)
end
bench_process("ke->kke", warmup = true)
for (process, opt) in processes
bench_process(process, optimize = opt)
CSV.write(results_filename, df)
end

File diff suppressed because one or more lines are too long