experiments (#1)
Co-authored-by: Anton Reinhard <anton.reinhard@proton.me> Reviewed-on: #1
This commit is contained in:
@@ -5,5 +5,6 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
|
||||
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
|
||||
MetagraphOptimization = "3e869610-d48d-4942-ba70-c1b702a33ca4"
|
||||
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
|
||||
QEDbase = "10e22c08-3ccb-4172-bfcf-7d7aa3d04d93"
|
||||
QEDprocesses = "46de9c38-1bb3-4547-a1ec-da24d767fdad"
|
||||
StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
|
||||
|
249
examples/full_node_bench.jl
Normal file
249
examples/full_node_bench.jl
Normal file
@@ -0,0 +1,249 @@
|
||||
using MetagraphOptimization
|
||||
using CUDA
|
||||
using UUIDs
|
||||
using DataFrames
|
||||
using CSV
|
||||
using Random
|
||||
using BenchmarkTools
|
||||
using Dates
|
||||
|
||||
using Base.Threads
|
||||
|
||||
|
||||
"""
    log(x...)

Print the current timestamp followed by the arguments `x`, separated by single
spaces, to `stdout`, then flush `stdout` so the line appears immediately.

Returns `nothing`. This definition shadows `Base.log` inside this script.
"""
function log(x...)
    # `join` already yields the final string; splatting it into single
    # characters (as before) printed the same text but did needless work.
    println(now(), " ", join(x, " "))
    flush(stdout)
    return nothing
end
|
||||
|
||||
results_filename = "full_node_bench.csv"

# Continue from previously written results when the CSV already exists,
# otherwise start with an empty table that has the expected columns.
df = if isfile(results_filename)
    CSV.read(results_filename, DataFrame)
else
    DataFrame(;
        process_name = String[],
        cpu_threads = Int[],
        gpu_devices = Int[],
        n_inputs = Int[],
        chunk_size = Int[],
        time = Float64[],
        std = Float64[],
        rate = Float64[],
        cpu_chunks = Float64[],
        gpu_chunks = Float64[],
        memory_est = Float64[],
    )
end

# total number of inputs evaluated per benchmark run
nInputs = 2^26

# shared work-distribution state; `lck` guards the three counters below
lck = ReentrantLock()
progress = 1
cpu_chunks = 0
gpu_chunks = 0

# 2^10 to 2^20 in steps of a factor 4
chunkSizes = [2^k for k in 10:2:20]
|
||||
|
||||
"""
    cpu_worker(compute_func, inputs, chunk_size)

Repeatedly claim the next `chunk_size` input indices from the shared global
`progress` counter (under the global lock `lck`) and call `compute_func` on
each claimed element of `inputs`. The results of `compute_func` are discarded.
Every claimed chunk increments the global `cpu_chunks` counter.

The worker returns `nothing` once all indices up to the global `nInputs` have
been handed out.
"""
function cpu_worker(compute_func, inputs, chunk_size)
    global progress
    global cpu_chunks
    global lck
    quit = false
    work_start = 0
    work_end = 0
    while true
        lock(lck) do
            # `progress` is the next unclaimed 1-based index, so work remains
            # as long as `progress <= nInputs`. The previous `>=` test dropped
            # the last input whenever `progress == nInputs`.
            if progress > nInputs
                quit = true
            else
                work_start = progress
                progress = progress + chunk_size
                # the last chunk may be shorter than `chunk_size`
                work_end = min(progress - 1, nInputs)
                cpu_chunks = cpu_chunks + 1
            end
        end
        quit && break

        for i in work_start:work_end
            compute_func(inputs[i])
        end
    end

    return nothing
end
|
||||
|
||||
# called with a specific device selected
|
||||
"""
    gpu_worker(kernel!, inputs, chunk_size)

Repeatedly claim the next `chunk_size` input indices from the shared global
`progress` counter (under the global lock `lck`), copy the claimed slice of
`inputs` to the GPU and launch `kernel!` on it, synchronizing the device after
every launch. Every claimed chunk increments the global `gpu_chunks` counter.
The kernel outputs are written to a device buffer that is never read back.

The worker returns `nothing` once all indices up to the global `nInputs` have
been handed out.
"""
function gpu_worker(kernel!, inputs, chunk_size)
    global progress
    global gpu_chunks
    global lck
    # output buffer reused for every chunk; a chunk never exceeds `chunk_size`
    cuOutputs = CuVector{ComplexF64}(undef, chunk_size)
    ts = 32 # threads per block

    quit = false
    work_start = 0
    work_end = 0
    while true
        lock(lck) do
            # `progress` is the next unclaimed 1-based index; `>` (instead of
            # the previous `>=`) keeps the last input from being dropped when
            # `progress == nInputs`.
            if progress > nInputs
                quit = true
            else
                work_start = progress
                progress = progress + chunk_size
                work_end = min(progress - 1, nInputs)
                gpu_chunks = gpu_chunks + 1
            end
        end
        quit && break

        cuInputs = CuVector(inputs[work_start:work_end])
        # Launch for the number of elements actually uploaded. The last chunk
        # can be shorter than `chunk_size`; launching it with `chunk_size`
        # would address elements `cuInputs` does not have.
        # NOTE(review): assumes the kernel ignores thread ids above its third
        # argument when `n` is not a multiple of `ts` — confirm against the
        # kernel generator.
        n = work_end - work_start + 1
        bs = cld(n, ts)
        @cuda threads = ts blocks = bs always_inline = true kernel!(cuInputs, cuOutputs, n)
        CUDA.device_synchronize()
    end

    return nothing
end
|
||||
|
||||
# one `(cpu_chunks, gpu_chunks)` entry per completed `full_compute` call
cpu_gpu_ratio = Tuple{Int, Int}[]
|
||||
|
||||
"""
    full_compute(compute_func, kernel!, inputs, chunk_size)

Process all `inputs` once using every CUDA device and the remaining CPU threads.

Resets the global `progress`, `cpu_chunks` and `gpu_chunks` counters, spawns
one `gpu_worker` task per CUDA device (each with that device selected) and
`Threads.nthreads() - length(CUDA.devices())` `cpu_worker` tasks, and waits
for all of them. Afterwards the pair `(cpu_chunks, gpu_chunks)` is appended to
the global `cpu_gpu_ratio`. Returns `nothing`.
"""
function full_compute(compute_func, kernel!, inputs, chunk_size)
    global progress
    progress = 1
    global cpu_chunks
    cpu_chunks = 0
    global gpu_chunks
    gpu_chunks = 0

    # concretely typed instead of the previous `Vector()` (a `Vector{Any}`)
    tasks = Task[]

    for dev in CUDA.devices()
        t = Threads.@spawn device!(dev) do
            gpu_worker(kernel!, inputs, chunk_size)
            return nothing
        end
        push!(tasks, t)
    end

    # each GPU worker occupies one thread; the rest are used as CPU workers
    for _ in 1:(Threads.nthreads() - length(CUDA.devices()))
        t = Threads.@spawn cpu_worker(compute_func, inputs, chunk_size)
        push!(tasks, t)
    end

    foreach(wait, tasks)

    push!(cpu_gpu_ratio, (cpu_chunks, gpu_chunks))
    return nothing
end
|
||||
|
||||
"""
    bench(compute_function, kernel!, inputs, chunk_size)

Benchmark `full_compute` with `@benchmark` (`gcsample = true`, `seconds = 60`)
and return the tuple
`(time, rate, s, med_cpu_chunks, med_gpu_chunks, mem_estimate)`:
median time in seconds, inputs per second, standard deviation of the sample
times in seconds, median CPU and GPU chunk counts over all recorded
`full_compute` calls, and the trial's memory estimate.

The global `cpu_gpu_ratio` is emptied before the benchmark and logged after it.
"""
function bench(compute_function, kernel!, inputs, chunk_size)
    global cpu_gpu_ratio
    empty!(cpu_gpu_ratio)

    trial = @benchmark begin
        full_compute($compute_function, $kernel!, $inputs, $chunk_size)
    end gcsample = true seconds = 60

    # BenchmarkTools reports times in nanoseconds
    median_seconds = median(trial.times) / 1e9
    std_seconds = std(trial.times) / 1e9
    inputs_per_second = length(inputs) / median_seconds

    cpu_counts = getindex.(cpu_gpu_ratio, 1)
    gpu_counts = getindex.(cpu_gpu_ratio, 2)
    med_cpu_chunks = median(cpu_counts)
    med_gpu_chunks = median(gpu_counts)

    log("CPU/GPU ratios: $(cpu_gpu_ratio)")

    return (
        median_seconds,
        inputs_per_second,
        std_seconds,
        med_cpu_chunks,
        med_gpu_chunks,
        trial.memory,
    )
end
|
||||
|
||||
"""
    full_node_bench(process, func, kernel!, chunk_size, inputs)

Benchmark one `process` with the given `chunk_size` on the full node via
`bench`, log the outcome and append one result row to the global `df`.
Returns `nothing`.
"""
function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, kernel!, chunk_size, inputs)
    process_name = string(process)
    log("\n--- Benchmarking $(process_name) on $(nInputs) with chunk size $(chunk_size) ---")

    log("Available Cuda Devices:")
    foreach(display, CUDA.devices())

    log("Benchmarking full node...")
    (time, rate, s, med_cpu_chunks, med_gpu_chunks, mem_estimate) = bench(func, kernel!, inputs, chunk_size)
    log(
        "Benchmarking complete with median time $(time), $(med_cpu_chunks) cpu chunks, and $(med_gpu_chunks) gpu chunks.",
    )

    n_gpus = length(CUDA.devices())
    # one thread per GPU is reserved for driving that device
    n_cpu_workers = Threads.nthreads() - n_gpus

    row = Dict(
        :process_name => process_name,
        :cpu_threads => n_cpu_workers,
        :gpu_devices => n_gpus,
        :n_inputs => nInputs,
        :chunk_size => chunk_size,
        :time => time,
        :std => s,
        :rate => rate,
        :cpu_chunks => med_cpu_chunks,
        :gpu_chunks => med_gpu_chunks,
        :memory_est => mem_estimate,
    )
    push!(df, row)

    return nothing
end
|
||||
|
||||
# use "mock" machine that only uses cpu for compilation
|
||||
machine = Machine(
    [
        MetagraphOptimization.NumaNode(
            0,
            1,
            MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
            -1.0,
            UUIDs.uuid1(),
        ),
    ],
    [-1.0;;],
)

optimizer = ReductionOptimizer()
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]

# For every process: build and fully reduce the graph, generate the CPU
# function and the CUDA kernel, create the inputs, then benchmark each chunk
# size and persist the results after every measurement.
for proc in processes
    process = parse_process(proc, QEDModel())
    graph = gen_graph(process)
    optimize_to_fixpoint!(optimizer, graph)
    compute_func = get_compute_function(graph, process, machine)
    kernel! = get_cuda_kernel(graph, process, machine)

    log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
    inputs = Vector{typeof(gen_process_input(process))}(undef, nInputs)

    # Split the index range into one contiguous part per thread and give every
    # task its own copy of the process. The previous code indexed the copies
    # with `Threads.nthreads()` (the thread *count*), so all iterations shared
    # the same last copy and the other copies were never used.
    per_task = cld(nInputs, Threads.nthreads())
    @sync for idxs in Iterators.partition(eachindex(inputs), per_task)
        task_process = copy(process)
        Threads.@spawn for i in idxs
            inputs[i] = gen_process_input(task_process)
        end
    end

    for chunk_size in chunkSizes
        full_node_bench(process, compute_func, kernel!, chunk_size, inputs)
        CSV.write(results_filename, df)
    end
end;
|
@@ -34,9 +34,10 @@ function import_bench()
|
||||
bench_txt("AB->ABBB.txt")
|
||||
bench_txt("AB->ABBBBB.txt")
|
||||
bench_txt("AB->ABBBBBBB.txt")
|
||||
#bench_txt("AB->ABBBBBBBBB.txt")
|
||||
bench_txt("AB->ABBBBBBBBB.txt")
|
||||
bench_txt("ABAB->ABAB.txt")
|
||||
return bench_txt("ABAB->ABC.txt")
|
||||
bench_txt("ABAB->ABC.txt")
|
||||
return nothing
|
||||
end
|
||||
|
||||
import_bench()
|
||||
|
@@ -2,44 +2,117 @@ using MetagraphOptimization
|
||||
using LIKWID
|
||||
using CUDA
|
||||
using UUIDs
|
||||
using DataFrames
|
||||
using CSV
|
||||
using Random
|
||||
using BenchmarkTools
|
||||
using Dates
|
||||
|
||||
function cpu_bench(compute_function, inputs)
|
||||
compute_function.(inputs[begin:10]) # make sure it's compiled
|
||||
DISABLE_GPU = false
|
||||
|
||||
time = @elapsed Threads.@threads for i in eachindex(inputs)
|
||||
@invokelatest compute_function(inputs[i])
|
||||
end
|
||||
rate = length(inputs) / time
|
||||
return (time, rate)
|
||||
function log(x...)
|
||||
println(now(), " ", join(x, " ")...)
|
||||
return flush(stdout)
|
||||
end
|
||||
|
||||
function gpu_bench(compute_function, inputs)
|
||||
CUDA.@sync compute_function.(inputs[begin:10]) # make sure it's compiled
|
||||
results_filename = "bench_results_$(Threads.nthreads()).csv"
|
||||
|
||||
time = @elapsed CUDA.@sync compute_function.(inputs)
|
||||
df = DataFrame(
|
||||
process_name = String[],
|
||||
graph_gen_time = Float64[],
|
||||
optimization_time = Float64[],
|
||||
function_generation_time = Float64[],
|
||||
graph_nodes = Int[],
|
||||
graph_edges = Int[],
|
||||
graph_mem = Float64[],
|
||||
cpu_threads = Int[],
|
||||
n_inputs = Int[],
|
||||
nflops_likwid = Int[],
|
||||
cpu_time = Float64[],
|
||||
cpu_std = Float64[],
|
||||
cpu_rate = Float64[],
|
||||
cpu_gflops = Float64[],
|
||||
gpu_name = String[],
|
||||
gpu_time = Float64[],
|
||||
gpu_std = Float64[],
|
||||
gpu_rate = Float64[],
|
||||
gpu_gflops = Float64[],
|
||||
)
|
||||
|
||||
# if they exist, read existing results and append new ones
|
||||
if isfile(results_filename)
|
||||
df = CSV.read(results_filename, DataFrame)
|
||||
end
|
||||
|
||||
nInputs = 2^20
|
||||
|
||||
function cpu_bench(compute_function, inputs)
|
||||
bench = @benchmark begin
|
||||
@inbounds Threads.@threads for i in eachindex($inputs)
|
||||
@invokelatest $compute_function($inputs[i])
|
||||
end
|
||||
end gcsample = true samples = 20 evals = 1
|
||||
|
||||
time = median(bench.times) / 1e9
|
||||
s = std(bench.times) / 1e9
|
||||
rate = length(inputs) / time
|
||||
|
||||
return (time, rate)
|
||||
return (time, rate, s)
|
||||
end
|
||||
|
||||
"""
    gpu_bench(kernel!, inputs)

Benchmark one launch of `kernel!` over all `inputs` (followed by a device
synchronization) with 20 samples of one evaluation each. Returns
`(time, rate, s)`: median time in seconds, inputs per second, and the standard
deviation of the sample times in seconds.

`Int(count / threads_per_block)` throws an `InexactError` unless the number of
inputs is a multiple of 32.
"""
function gpu_bench(kernel!, inputs)
    count = length(inputs)
    results = CuVector{ComplexF64}(undef, count)

    threads_per_block = 32
    n_blocks = Int(count / threads_per_block)

    trial = @benchmark begin
        @cuda threads = $threads_per_block blocks = $n_blocks always_inline = true $kernel!($inputs, $results, $count)
        CUDA.device_synchronize()
    end gcsample = true samples = 20 evals = 1

    # BenchmarkTools reports times in nanoseconds
    median_seconds = median(trial.times) / 1e9
    std_seconds = std(trial.times) / 1e9
    inputs_per_second = length(inputs) / median_seconds

    return (median_seconds, inputs_per_second, std_seconds)
end
|
||||
|
||||
function bench_process(
|
||||
process::MetagraphOptimization.AbstractProcessDescription,
|
||||
process_name::String,
|
||||
graph::DAG,
|
||||
func,
|
||||
io::IO = stdout;
|
||||
use_likwid = true,
|
||||
kernel!,
|
||||
gen_time::Float64,
|
||||
opt_time::Float64,
|
||||
func_time::Float64;
|
||||
use_likwid = false,
|
||||
use_gpu = true,
|
||||
)
|
||||
println(io, "\n--- Benchmarking $(process) ---")
|
||||
log("\n--- Benchmarking $(process_name) ---")
|
||||
if DISABLE_GPU
|
||||
use_gpu = false
|
||||
end
|
||||
|
||||
NFLOPs = GraphProperties(graph).computeEffort
|
||||
graph_props = GraphProperties(graph)
|
||||
NFLOPs = graph_props.computeEffort
|
||||
nflops_likwid = 0
|
||||
if use_likwid
|
||||
input = gen_process_input(process)
|
||||
func(input) # compile first
|
||||
|
||||
# get rid of annoying output to console
|
||||
oldstd = stdout
|
||||
redirect_stdout(devnull)
|
||||
_, events = @perfmon "FLOPS_DP" func(input)
|
||||
redirect_stdout(oldstd) # recover original stdout
|
||||
|
||||
NFLOPs = first(events["FLOPS_DP"])["RETIRED_SSE_AVX_FLOPS_ALL"]
|
||||
nflops_likwid = NFLOPs
|
||||
end
|
||||
|
||||
nInputs = 10000000 # ten million
|
||||
println(io, "Generating $nInputs inputs with $(Threads.nthreads()) threads...")
|
||||
log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
|
||||
|
||||
inputs = Vector{typeof(gen_process_input(process))}()
|
||||
resize!(inputs, nInputs)
|
||||
@@ -48,35 +121,76 @@ function bench_process(
|
||||
push!(processes, copy(process))
|
||||
end
|
||||
|
||||
Threads.@threads for i in eachindex(inputs)
|
||||
@inbounds Threads.@threads for i in eachindex(inputs)
|
||||
inputs[i] = gen_process_input(processes[Threads.nthreads()])
|
||||
end
|
||||
|
||||
println(io, "Benchmarking CPU with $(Threads.nthreads()) threads...")
|
||||
(time_cpu, rate_cpu) = cpu_bench(func, inputs)
|
||||
flops_cpu = (rate_cpu * NFLOPs) / 1024^3
|
||||
log("Benchmarking CPU with $(Threads.nthreads()) threads...")
|
||||
(time_cpu, rate_cpu, std_cpu) = cpu_bench(func, inputs)
|
||||
flops_cpu = (rate_cpu * NFLOPs) / 10^9
|
||||
|
||||
println(io, "Benchmarking GPU...")
|
||||
cuInputs = CuArray(inputs)
|
||||
(time_gpu, rate_gpu) = gpu_bench(func, cuInputs)
|
||||
flops_gpu = (rate_gpu * NFLOPs) / 1024^3
|
||||
time_gpu = 0.0
|
||||
std_gpu = 0.0
|
||||
rate_gpu = 0.0
|
||||
flops_gpu = 0.0
|
||||
gpu_name = "none"
|
||||
if use_gpu
|
||||
log("Benchmarking GPU...")
|
||||
gpu_name = "$(name(first(CUDA.devices())))"
|
||||
cuInputs = CuArray(inputs)
|
||||
(time_gpu, rate_gpu, std_gpu) = gpu_bench(kernel!, cuInputs)
|
||||
flops_gpu = (rate_gpu * NFLOPs) / 10^9
|
||||
else
|
||||
log("Skipping GPU...")
|
||||
end
|
||||
|
||||
println(io, "\nBenchmark Summary for $(process):")
|
||||
log("\nBenchmark Summary for $(process):")
|
||||
|
||||
if use_likwid
|
||||
println(io, "Measured FLOPS by LIKWID: $NFLOPs")
|
||||
log("Measured FLOPS by LIKWID: $NFLOPs")
|
||||
else
|
||||
println(io, "Total graph compute effort: $NFLOPs")
|
||||
log("Total graph compute effort: $NFLOPs")
|
||||
end
|
||||
println(io, "Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
|
||||
println(io, "CPU, $(Threads.nthreads()) threads")
|
||||
println(io, " Time: $time_cpu")
|
||||
println(io, " Rate: $rate_cpu")
|
||||
println(io, " GFLOPS: $flops_cpu")
|
||||
println(io, "GPU, $(name(first(CUDA.devices())))")
|
||||
println(io, " Time: $time_gpu")
|
||||
println(io, " Rate: $rate_gpu")
|
||||
return println(io, " GFLOPS: $flops_gpu")
|
||||
log("Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
|
||||
log("CPU, $(Threads.nthreads()) threads")
|
||||
log(" Time: $time_cpu")
|
||||
log(" Rate: $rate_cpu")
|
||||
log(" GFLOPS: $flops_cpu")
|
||||
if use_gpu
|
||||
log("GPU, $gpu_name")
|
||||
log(" Time: $time_gpu")
|
||||
log(" Rate: $rate_gpu")
|
||||
log(" GFLOPS: $flops_gpu")
|
||||
end
|
||||
|
||||
if (process_name != "warmup")
|
||||
push!(
|
||||
df,
|
||||
Dict(
|
||||
:process_name => process_name,
|
||||
:graph_gen_time => gen_time,
|
||||
:optimization_time => opt_time,
|
||||
:function_generation_time => func_time,
|
||||
:graph_nodes => graph_props.noNodes,
|
||||
:graph_edges => graph_props.noEdges,
|
||||
:graph_mem => MetagraphOptimization.mem(graph),
|
||||
:cpu_threads => Threads.nthreads(),
|
||||
:n_inputs => nInputs,
|
||||
:nflops_likwid => nflops_likwid,
|
||||
:cpu_time => time_cpu,
|
||||
:cpu_std => std_cpu,
|
||||
:cpu_rate => rate_cpu,
|
||||
:cpu_gflops => flops_cpu,
|
||||
:gpu_name => gpu_name,
|
||||
:gpu_time => time_gpu,
|
||||
:gpu_std => std_gpu,
|
||||
:gpu_rate => rate_gpu,
|
||||
:gpu_gflops => flops_gpu,
|
||||
),
|
||||
)
|
||||
end
|
||||
|
||||
return nothing
|
||||
end
|
||||
|
||||
# use "mock" machine that only uses cpu
|
||||
@@ -92,57 +206,67 @@ machine = Machine(
|
||||
],
|
||||
[-1.0;;],
|
||||
)
|
||||
optimizer = ReductionOptimizer()
|
||||
|
||||
# sadly cannot put these in functions because the world age must increase after the function is created which happens only in the global scope
|
||||
|
||||
# compton
|
||||
process = parse_process("ke->ke", QEDModel())
|
||||
graph = gen_graph(process)
|
||||
optimize_to_fixpoint!(optimizer, graph)
|
||||
compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, compute_func)
|
||||
## -- WARMUP TO COMPILE FUNCTIONS first
|
||||
#=
|
||||
optimizer = RandomWalkOptimizer(MersenneTwister(0))
|
||||
|
||||
# 2-photon compton
|
||||
process = parse_process("ke->kke", QEDModel())
|
||||
graph = gen_graph(process)
|
||||
optimize_to_fixpoint!(optimizer, graph)
|
||||
compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, compute_func)
|
||||
gen_time = @elapsed graph = gen_graph(process)
|
||||
opt_time = @elapsed optimize!(optimizer, graph, 200)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
|
||||
|
||||
# 3-photon compton
|
||||
process = parse_process("ke->kkke", QEDModel())
|
||||
graph = gen_graph(process)
|
||||
optimize_to_fixpoint!(optimizer, graph)
|
||||
compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, compute_func)
|
||||
|
||||
# AB->AB
|
||||
process = parse_process("AB->AB", ABCModel())
|
||||
graph = parse_dag("input/AB->AB.txt", ABCModel())
|
||||
optimize_to_fixpoint!(optimizer, graph)
|
||||
compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, compute_func)
|
||||
optimizer = ReductionOptimizer()
|
||||
|
||||
# AB->AB^3
|
||||
process = parse_process("AB->ABBB", ABCModel())
|
||||
graph = parse_dag("input/AB->ABBB.txt", ABCModel())
|
||||
optimize_to_fixpoint!(optimizer, graph)
|
||||
compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, compute_func)
|
||||
gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
|
||||
=#
|
||||
## -- WARMUP END
|
||||
|
||||
exit(0)
|
||||
optimizer = ReductionOptimizer()
|
||||
|
||||
# 4-photon compton
|
||||
process = parse_process("ke->kkkke", QEDModel())
|
||||
graph = gen_graph(process)
|
||||
optimize_to_fixpoint!(optimizer, graph)
|
||||
compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, compute_func)
|
||||
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]
|
||||
|
||||
# AB->AB^5
|
||||
process = parse_process("AB->ABBBBB", ABCModel())
|
||||
graph = parse_dag("input/AB->ABBBBB.txt", ABCModel())
|
||||
optimize_to_fixpoint!(optimizer, graph)
|
||||
compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, compute_func)
|
||||
for process_str in processes
|
||||
# compton
|
||||
process = parse_process(process_str, QEDModel())
|
||||
gen_time = @elapsed graph = gen_graph(process)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
end
|
||||
|
||||
processes = ["AB->AB", "AB->ABBB", "AB->ABBBBB", "AB->ABBBBBBB"]
|
||||
|
||||
for process_str in processes
|
||||
# AB->AB
|
||||
process = parse_process(process_str, ABCModel())
|
||||
gen_time = @elapsed graph = parse_dag("input/$(process_str).txt", ABCModel())
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
end
|
||||
|
163
examples/qed_bench_reduction_steps.jl
Normal file
163
examples/qed_bench_reduction_steps.jl
Normal file
@@ -0,0 +1,163 @@
|
||||
using MetagraphOptimization
|
||||
using CUDA
|
||||
using UUIDs
|
||||
using BenchmarkTools
|
||||
using DataFrames
|
||||
using CSV
|
||||
|
||||
results_filename = "bench_results_reduction_steps.csv"

# Continue from previously written results when the CSV already exists,
# otherwise start with an empty table that has the expected columns.
df = if isfile(results_filename)
    CSV.read(results_filename, DataFrame)
else
    DataFrame(;
        threads = Int[],
        process = String[],
        operations = Int[],
        cumulative_optimization_time = Float64[],
        graph_nodes = Int[],
        graph_edges = Int[],
        graph_ce = Float64[],
        graph_dt = Float64[],
        graph_ci = Float64[],
        gen_func_t = Float64[],
        cpu_compile_t = Float64[],
        cpu_st_t = Float64[],
        cpu_mt_t = Float64[],
        gpu_compile_t = Float64[],
        gpu_t = Float64[],
    )
end
|
||||
|
||||
"""
    bench(func, inputs)

Time the first call `func(inputs[1])`, then benchmark `func` over all `inputs`
both as a broadcast on one thread and in a `Threads.@threads` loop.

Returns a named tuple with `cpu_compile_time`, `gpu_compile_time`,
`cpu_single_thread_time`, `cpu_multi_thread_time` and `gpu_time` in seconds.
The two GPU fields are always `0.0`; the CPU times are sample means.
"""
function bench(func, inputs)
    first_call_seconds = @elapsed func(inputs[1])

    serial_trial = @benchmark $func.($inputs)
    threaded_trial = @benchmark begin
        Threads.@threads for i in eachindex($inputs)
            $func($inputs[i])
        end
    end

    # BenchmarkTools reports times in nanoseconds
    serial_seconds = mean(serial_trial.times) / 1e9
    threaded_seconds = mean(threaded_trial.times) / 1e9

    return (
        cpu_compile_time = first_call_seconds,
        gpu_compile_time = 0.0,
        cpu_single_thread_time = serial_seconds,
        cpu_multi_thread_time = threaded_seconds,
        gpu_time = 0.0,
    )
end
|
||||
|
||||
# preparation of machine
|
||||
# machine description consisting of a single NUMA node
machine = let MO = MetagraphOptimization
    node = MO.NumaNode(0, 1, MO.default_strategy(MO.NumaNode), -1.0, UUIDs.uuid1())
    Machine([node], [-1.0;;])
end
|
||||
|
||||
# bench and produce data
|
||||
n_inputs = 50_000
optimizer = ReductionOptimizer()
# (process string, number of optimization steps applied between measurements)
processes = [("ke->kke", 5), ("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1), ("ke->kkkke", 1), ("ke->kkkkke", 1)]

for (process_str, STEPSIZE) in processes
    process = parse_process(process_str, QEDModel())
    graph = gen_graph(process)
    inputs = [gen_process_input(process) for _ in 1:n_inputs]

    # result discarded; presumably a warm-up so the timed generation below
    # does not include first-call compilation — TODO confirm
    get_compute_function(graph, process, machine)

    applied_steps = 0
    total_opt_time = 0

    # measure, record, then optimize further until the optimizer reports a fixpoint
    while true
        func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
        res = bench(func, inputs)
        props = get_properties(graph)

        row = (
            Threads.nthreads(),
            process_str,
            applied_steps,
            total_opt_time,
            props.noNodes,
            props.noEdges,
            props.computeEffort,
            props.data,
            props.computeIntensity,
            func_gen_time,
            res.cpu_compile_time,
            res.cpu_single_thread_time,
            res.cpu_multi_thread_time,
            res.gpu_compile_time,
            res.gpu_time,
        )
        push!(df, row)
        # persist after every measurement so partial runs are not lost
        CSV.write(results_filename, df)

        fixpoint_reached(optimizer, graph) && break

        total_opt_time += @elapsed optimize!(optimizer, graph, STEPSIZE)
        applied_steps += STEPSIZE
    end
end

CSV.write(results_filename, df)
|
||||
|
||||
# same measurement loop for the ABC model; graphs are parsed from `input/`
for (process_str, STEPSIZE) in [("AB->AB", 1), ("AB->ABBB", 1), ("AB->ABBBBB", 1)]
    process = parse_process(process_str, ABCModel())
    graph = parse_dag("input/$process_str.txt", ABCModel())
    inputs = [gen_process_input(process) for _ in 1:n_inputs]

    # result discarded; presumably a warm-up so the timed generation below
    # does not include first-call compilation — TODO confirm
    get_compute_function(graph, process, machine)

    applied_steps = 0
    total_opt_time = 0

    # measure, record, then optimize further until the optimizer reports a fixpoint
    while true
        func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
        res = bench(func, inputs)
        props = get_properties(graph)

        row = (
            Threads.nthreads(),
            process_str,
            applied_steps,
            total_opt_time,
            props.noNodes,
            props.noEdges,
            props.computeEffort,
            props.data,
            props.computeIntensity,
            func_gen_time,
            res.cpu_compile_time,
            res.cpu_single_thread_time,
            res.cpu_multi_thread_time,
            res.gpu_compile_time,
            res.gpu_time,
        )
        push!(df, row)
        # persist after every measurement so partial runs are not lost
        CSV.write(results_filename, df)

        fixpoint_reached(optimizer, graph) && break

        total_opt_time += @elapsed optimize!(optimizer, graph, STEPSIZE)
        applied_steps += STEPSIZE
    end
end

CSV.write(results_filename, df)
|
208
examples/qed_bench_reduction_steps_gpu.jl
Normal file
208
examples/qed_bench_reduction_steps_gpu.jl
Normal file
@@ -0,0 +1,208 @@
|
||||
using MetagraphOptimization
|
||||
using CUDA
|
||||
using UUIDs
|
||||
using BenchmarkTools
|
||||
using DataFrames
|
||||
using CSV
|
||||
using Dates
|
||||
|
||||
results_filename = "bench_results_reduction_steps_gpu.csv"

# Continue from previously written results when the CSV already exists,
# otherwise start with an empty table that has the expected columns.
df = if isfile(results_filename)
    CSV.read(results_filename, DataFrame)
else
    DataFrame(;
        threads = Int[],
        process = String[],
        operations = Int[],
        cumulative_optimization_time = Float64[],
        graph_nodes = Int[],
        graph_edges = Int[],
        graph_ce = Float64[],
        graph_dt = Float64[],
        graph_ci = Float64[],
        cpu_st_t = Float64[],
        cpu_st_s = Float64[],
        cpu_mt_t = Float64[],
        cpu_mt_s = Float64[],
        cpu_mem = Float64[],
        gpu_t = Float64[],
        gpu_s = Float64[],
        gpu_mem = Float64[],
    )
end
|
||||
|
||||
"""
    log(x...)

Print the current timestamp followed by the arguments `x`, separated by single
spaces, to `stdout`, then flush `stdout` so the line appears immediately.

Returns `nothing`. This definition shadows `Base.log` inside this script.
"""
function log(x...)
    # `join` already yields the final string; splatting it into single
    # characters (as before) printed the same text but did needless work.
    println(now(), " ", join(x, " "))
    return flush(stdout)
end
|
||||
|
||||
"""
    bench(func, kernel!, inputs)

Benchmark `kernel!` on the GPU and `func` on the CPU over the same `inputs`.

The GPU part uploads `inputs` once and benchmarks a single kernel launch plus
device synchronization (20 samples, one evaluation each). The CPU part
benchmarks `func` as a broadcast on one thread and in a `Threads.@threads`
loop.

Returns a named tuple with `cpu_single_thread_time`, `cpu_single_thread_std`,
`cpu_multi_thread_time`, `cpu_multi_thread_std`, `cpu_mem`, `gpu_time`,
`gpu_std` and `gpu_mem`. Times are medians and standard deviations in
seconds; the `*_mem` fields are the memory estimates of the respective trials
(`cpu_mem` is taken from the single-threaded trial).
"""
function bench(func, kernel!, inputs)
    # gpu part
    n = length(inputs)
    cu_inputs = CuVector(inputs)
    cu_outputs = CuVector{ComplexF64}(undef, n)
    ts = 32
    # throws an InexactError unless `n` is a multiple of `ts`
    bs = Int(n / ts)
    gpu_trial = @benchmark begin
        @cuda threads = $ts blocks = $bs always_inline = true $kernel!($cu_inputs, $cu_outputs, $n)
        CUDA.device_synchronize()
    end gcsample = true samples = 20 evals = 1

    # BenchmarkTools reports times in nanoseconds
    gpu_time = median(gpu_trial.times) / 1e9
    gpu_std = std(gpu_trial.times) / 1e9
    gpu_mem = gpu_trial.memory

    # cpu part
    single_thread = @benchmark $func.($inputs)
    multi_threaded = @benchmark Threads.@threads for i in eachindex($inputs)
        $func($inputs[i])
    end

    cpu_st_time = median(single_thread.times) / 1e9
    cpu_st_std = std(single_thread.times) / 1e9
    cpu_mt_time = median(multi_threaded.times) / 1e9
    cpu_mt_std = std(multi_threaded.times) / 1e9
    # Previously `std(single_thread.times)`, i.e. a time standard deviation in
    # nanoseconds stored as memory; use the trial's memory estimate like
    # `gpu_mem` does.
    cpu_mem = single_thread.memory

    return (
        cpu_single_thread_time = cpu_st_time,
        cpu_single_thread_std = cpu_st_std,
        cpu_multi_thread_time = cpu_mt_time,
        cpu_multi_thread_std = cpu_mt_std,
        cpu_mem = cpu_mem,
        gpu_time = gpu_time,
        gpu_std = gpu_std,
        gpu_mem = gpu_mem,
    )
end
|
||||
|
||||
# list every CUDA device visible to this process
log("Available CUDA devices:")
foreach(display, CUDA.devices())
|
||||
|
||||
# preparation of machine
|
||||
# machine description consisting of a single NUMA node
machine = let MO = MetagraphOptimization
    node = MO.NumaNode(0, 1, MO.default_strategy(MO.NumaNode), -1.0, UUIDs.uuid1())
    Machine([node], [-1.0;;])
end
|
||||
|
||||
|
||||
# bench and produce data
|
||||
n_inputs = 2^16
optimizer = ReductionOptimizer()
# (process string, number of optimization steps applied between measurements)
processes = [("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1), ("ke->kkkke", 5)]

for (process_str, STEPSIZE) in processes
    process = parse_process(process_str, QEDModel())
    graph = gen_graph(process)
    inputs = [gen_process_input(process) for _ in 1:n_inputs]

    # result discarded; presumably a warm-up call — TODO confirm
    get_compute_function(graph, process, machine)

    applied_steps = 0
    total_opt_time = 0

    # measure, record, then optimize further until the optimizer reports a fixpoint
    while true
        func = get_compute_function(graph, process, machine)
        kernel! = get_cuda_kernel(graph, process, machine)
        res = bench(func, kernel!, inputs)
        props = get_properties(graph)

        row = (
            Threads.nthreads(),
            process_str,
            applied_steps,
            total_opt_time,
            props.noNodes,
            props.noEdges,
            props.computeEffort,
            props.data,
            props.computeIntensity,
            res.cpu_single_thread_time,
            res.cpu_single_thread_std,
            res.cpu_multi_thread_time,
            res.cpu_multi_thread_std,
            res.cpu_mem,
            res.gpu_time,
            res.gpu_std,
            res.gpu_mem,
        )
        push!(df, row)
        # persist after every measurement so partial runs are not lost
        CSV.write(results_filename, df)

        fixpoint_reached(optimizer, graph) && break

        total_opt_time += @elapsed optimize!(optimizer, graph, STEPSIZE)
        applied_steps += STEPSIZE
    end
end

CSV.write(results_filename, df)
|
||||
|
||||
# same measurement loop for the ABC model; graphs are parsed from `input/`
for (process_str, STEPSIZE) in [("AB->AB", 1), ("AB->ABBB", 1), ("AB->ABBBBB", 1)]
    process = parse_process(process_str, ABCModel())
    graph = parse_dag("input/$process_str.txt", ABCModel())
    inputs = [gen_process_input(process) for _ in 1:n_inputs]

    # result discarded; presumably a warm-up call — TODO confirm
    get_compute_function(graph, process, machine)

    applied_steps = 0
    total_opt_time = 0

    # measure, record, then optimize further until the optimizer reports a fixpoint
    while true
        func = get_compute_function(graph, process, machine)
        kernel! = get_cuda_kernel(graph, process, machine)
        res = bench(func, kernel!, inputs)
        props = get_properties(graph)

        row = (
            Threads.nthreads(),
            process_str,
            applied_steps,
            total_opt_time,
            props.noNodes,
            props.noEdges,
            props.computeEffort,
            props.data,
            props.computeIntensity,
            res.cpu_single_thread_time,
            res.cpu_single_thread_std,
            res.cpu_multi_thread_time,
            res.cpu_multi_thread_std,
            res.cpu_mem,
            res.gpu_time,
            res.gpu_std,
            res.gpu_mem,
        )
        push!(df, row)
        # persist after every measurement so partial runs are not lost
        CSV.write(results_filename, df)

        fixpoint_reached(optimizer, graph) && break

        total_opt_time += @elapsed optimize!(optimizer, graph, STEPSIZE)
        applied_steps += STEPSIZE
    end
end

CSV.write(results_filename, df)
|
232
examples/qed_bench_tape.jl
Normal file
232
examples/qed_bench_tape.jl
Normal file
@@ -0,0 +1,232 @@
|
||||
using MetagraphOptimization
|
||||
using LIKWID
|
||||
using UUIDs
|
||||
using DataFrames
|
||||
using CSV
|
||||
using Random
|
||||
using BenchmarkTools
|
||||
using Dates
|
||||
|
||||
"""
    log(x...)

Print the current time followed by the arguments `x`, separated by single
spaces, as one line on `stdout`, then flush `stdout`.

Returns `nothing`.
"""
function log(x...)
    # `join` already produces the complete message string; it is passed as one
    # argument instead of being splatted into one argument per character.
    println(now(), " ", join(x, " "))
    flush(stdout)
    return nothing
end
|
||||
|
||||
# The number of threads of this run is part of the result file name.
results_filename = "bench_results_tape_$(Threads.nthreads()).csv"

# Continue with the results of a previous run if the file already exists;
# otherwise start with an empty table that has the result schema.
df = if isfile(results_filename)
    CSV.read(results_filename, DataFrame)
else
    DataFrame(
        # process and graph
        process_name = String[],
        graph_gen_time = Float64[],
        optimization_time = Float64[],
        function_generation_time = Float64[],
        graph_nodes = Int[],
        graph_edges = Int[],
        graph_mem = Float64[],
        # CPU measurement
        cpu_threads = Int[],
        n_inputs = Int[],
        nflops_likwid = Int[],
        cpu_time = Float64[],
        cpu_rate = Float64[],
        cpu_gflops = Float64[],
        cpu_std = Float64[],
        # GPU measurement
        gpu_name = String[],
        gpu_time = Float64[],
        gpu_std = Float64[],
        gpu_rate = Float64[],
        gpu_gflops = Float64[],
    )
end

# number of inputs generated and evaluated per benchmark
nInputs = 1_000_000

# use "mock" machine that only uses cpu
mock_numa_node = MetagraphOptimization.NumaNode(
    0,
    1,
    MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
    -1.0,
    UUIDs.uuid1(),
)
machine = Machine([mock_numa_node], fill(-1.0, 1, 1))
|
||||
|
||||
|
||||
"""
    cpu_bench(tape, inputs)

Benchmark `execute_tape(tape, input)` for every element of `inputs`, with the
loop over the inputs distributed by `Threads.@threads`.

Returns the tuple `(time, rate, std)`: mean and standard deviation of the
sampled times divided by `1e9`, and `length(inputs)` divided by the mean time.
"""
function cpu_bench(tape, inputs)
    trial = @benchmark begin
        @inbounds Threads.@threads for i in eachindex($inputs)
            execute_tape($tape, $inputs[i])
        end
    end gcsample = true seconds = 300

    # BenchmarkTools records sample times in nanoseconds; convert to seconds
    mean_seconds = mean(trial.times) / 1e9
    std_seconds = std(trial.times) / 1e9

    # processed inputs per second
    inputs_per_second = length(inputs) / mean_seconds

    return (mean_seconds, inputs_per_second, std_seconds)
end
|
||||
|
||||
"""
    bench_process(process, process_name, graph, gen_time, opt_time, io = stdout; use_likwid = false)

Generate a tape for `graph`, benchmark it with `cpu_bench` on `nInputs`
generated inputs, log a summary and append one result row to the global `df`.

# Arguments
- `process`: process description used for tape and input generation.
- `process_name`: label stored in the `process_name` column. Runs labelled
  `"warmup"` are benchmarked but not recorded.
- `graph`: the graph the tape is generated from.
- `gen_time`, `opt_time`: previously measured times, stored unchanged in the
  `graph_gen_time` and `optimization_time` columns.
- `io`: currently unused; kept so existing positional calls stay valid.

# Keywords
- `use_likwid`: when `true`, one tape execution is measured with LIKWID's
  `FLOPS_DP` group and the measured count replaces the graph's
  `computeEffort` as the FLOP count per input.

Returns `nothing`. Reads the globals `machine`, `nInputs` and `df`.
"""
function bench_process(
    process::MetagraphOptimization.AbstractProcessDescription,
    process_name::String,
    graph::DAG,
    gen_time::Float64,
    opt_time::Float64,
    io::IO = stdout;
    use_likwid = false,
)
    log("\n--- Benchmarking $(process_name) ---")

    func_time = @elapsed tape = gen_tape(graph, process, machine)

    graph_props = GraphProperties(graph)
    NFLOPs = graph_props.computeEffort
    nflops_likwid = 0
    if use_likwid
        input = gen_process_input(process)

        # get rid of annoying output to console
        oldstd = stdout
        redirect_stdout(devnull)
        local events
        try
            _, events = @perfmon "FLOPS_DP" execute_tape(tape, input)
        finally
            # recover the original stdout even when the measurement throws,
            # otherwise all later log output would silently disappear
            redirect_stdout(oldstd)
        end

        NFLOPs = first(events["FLOPS_DP"])["RETIRED_SSE_AVX_FLOPS_ALL"]
        nflops_likwid = NFLOPs
    end

    log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")

    inputs = Vector{typeof(gen_process_input(process))}(undef, nInputs)

    # Split the indices into one contiguous chunk per thread and give every
    # task its own copy of the process. The previous code indexed the copies
    # with `Threads.nthreads()`, so all iterations shared the last copy.
    # Task-local copies also avoid indexing by `Threads.threadid()`, which is
    # not guaranteed to lie in `1:Threads.nthreads()`.
    chunk_len = max(1, cld(length(inputs), Threads.nthreads()))
    @sync for chunk in Iterators.partition(eachindex(inputs), chunk_len)
        Threads.@spawn begin
            local_process = copy(process)
            for i in chunk
                inputs[i] = gen_process_input(local_process)
            end
        end
    end

    log("Benchmarking CPU with $(Threads.nthreads()) threads...")
    (time_cpu, rate_cpu, std_cpu) = cpu_bench(tape, inputs)
    flops_cpu = (rate_cpu * NFLOPs) / 10^9

    log("\nBenchmark Summary for $(process):")

    if use_likwid
        log("Measured FLOPS by LIKWID: $NFLOPs")
    else
        log("Total graph compute effort: $NFLOPs")
    end
    log("Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
    log("CPU, $(Threads.nthreads()) threads")
    log("  Time: $time_cpu")
    log("  Rate: $rate_cpu")
    log("  GFLOPS: $flops_cpu")

    # warm-up runs only exist to trigger compilation and are not recorded
    if (process_name != "warmup")
        push!(
            df,
            Dict(
                :process_name => process_name,
                :graph_gen_time => gen_time,
                :optimization_time => opt_time,
                :function_generation_time => func_time,
                :graph_nodes => graph_props.noNodes,
                :graph_edges => graph_props.noEdges,
                :graph_mem => MetagraphOptimization.mem(graph),
                :cpu_threads => Threads.nthreads(),
                :n_inputs => nInputs,
                :nflops_likwid => nflops_likwid,
                :cpu_time => time_cpu,
                :cpu_std => std_cpu,
                :cpu_rate => rate_cpu,
                :cpu_gflops => flops_cpu,
                # this script does not run on a GPU; the columns are filled
                # with placeholders
                :gpu_name => "none",
                :gpu_time => 0.0,
                :gpu_std => 0.0,
                :gpu_rate => 0.0,
                :gpu_gflops => 0.0,
            ),
        )
    end

    return nothing
end
|
||||
|
||||
"""
    bench_qed(process_string::String, skip_unoptimized = false)

Benchmark the QED process described by `process_string`: first on the freshly
generated graph (unless `skip_unoptimized` is `true`), then on the same graph
after `optimize_to_fixpoint!` with a `ReductionOptimizer`.

Returns `nothing`; the measurements are recorded by `bench_process`.
"""
function bench_qed(process_string::String, skip_unoptimized = false)
    process = parse_process(process_string, QEDModel())
    gen_time = @elapsed graph = gen_graph(process)

    # The unoptimized run has to come first: the optimizer below modifies
    # `graph` in place.
    skip_unoptimized ||
        bench_process(process, "$process not optimized tape", graph, gen_time, 0.0)

    reduction = ReductionOptimizer()
    opt_time = @elapsed optimize_to_fixpoint!(reduction, graph)
    bench_process(process, "$process reduced tape", graph, gen_time, opt_time)

    return nothing
end
|
||||
|
||||
"""
    bench_abc(process_string::String)

Benchmark the ABC-model process described by `process_string`. The graph is
read from `input/<process_string>.txt` and benchmarked twice: as parsed, and
after `optimize_to_fixpoint!` with a `ReductionOptimizer`.

Returns `nothing`; the measurements are recorded by `bench_process`.
"""
function bench_abc(process_string::String)
    model = ABCModel()
    process = parse_process(process_string, model)

    # for the ABC model the "generation" time is the time needed to parse the
    # graph from its input file
    gen_time = @elapsed graph = parse_dag("input/$process_string.txt", ABCModel())

    # unoptimized first; the optimizer below modifies `graph` in place
    bench_process(process, "$process not optimized tape", graph, gen_time, 0.0)

    reduction = ReductionOptimizer()
    opt_time = @elapsed optimize_to_fixpoint!(reduction, graph)
    bench_process(process, "$process reduced tape", graph, gen_time, opt_time)

    return nothing
end
|
||||
|
||||
# Note from the original author: the warm-up steps sadly cannot be put into
# functions, because the world age must increase after the function is created,
# which happens only in the global scope.

## -- WARMUP TO COMPILE FUNCTIONS first
optimizer = ReductionOptimizer()

# QED warm-up
process = parse_process("ke->kke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
bench_process(process, "warmup", graph, gen_time, opt_time)

# ABC warm-up: AB->AB^3
process = parse_process("AB->ABBB", ABCModel())
gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
bench_process(process, "warmup", graph, gen_time, opt_time)

## -- WARMUP END

# compton; the results are written after every process, so finished
# measurements are already on disk while later ones are still running
for qed_process in [
    "ke->ke",
    "ke->kke",
    "ke->kkke",
    "ke->kkkke",
    "ke->kkkkke",
    "ke->kkkkkke",
    "ke->kkkkkkke",
]
    bench_qed(qed_process)
    CSV.write(results_filename, df)
end

# ABC model
for abc_process in ["AB->AB", "AB->ABBB", "AB->ABBBBB"]
    bench_abc(abc_process)
    CSV.write(results_filename, df)
end
|
144
examples/qed_gen_bench.jl
Normal file
144
examples/qed_gen_bench.jl
Normal file
@@ -0,0 +1,144 @@
|
||||
using MetagraphOptimization
|
||||
using DataFrames
|
||||
using CSV
|
||||
using BenchmarkTools
|
||||
using StatsBase
|
||||
|
||||
# The number of threads of this run is part of the result file name.
results_filename = "qed_gen_results_$(Threads.nthreads()).csv"

# Empty result table; `bench_process` appends one row per benchmarked process.
df = DataFrame(
    process_name = String[],
    cpu_threads = Int[],

    # statistics over the sampled graph generation times
    graph_gen_samples = Int[],
    graph_gen_mean = Float64[],
    graph_gen_std = Float64[],
    graph_gen_median = Float64[],

    # node and edge counts of the generated graph
    graph_nodes = Int[],
    graph_data_nodes = Int[],
    graph_u_nodes = Int[],
    graph_v_nodes = Int[],
    graph_s1_nodes = Int[],
    graph_s2_nodes = Int[],
    graph_edges = Int[],

    # the same counts after the reduction
    graph_nodes_reduced = Int[],
    graph_data_nodes_reduced = Int[],
    graph_u_nodes_reduced = Int[],
    graph_v_nodes_reduced = Int[],
    graph_s1_nodes_reduced = Int[],
    graph_s2_nodes_reduced = Int[],
    graph_edges_reduced = Int[],

    # graph sizes before/after the reduction and the time the reduction took
    graph_mem = Float64[],
    graph_mem_reduced = Float64[],
    graph_elapsed_reduce = Float64[],
)
|
||||
|
||||
"""
    bench_process(process::AbstractString; warmup = false, optimize = true)

Benchmark the graph generation of the QED process `process` and append one row
with timing statistics and graph properties to the global `df`.

# Keywords
- `warmup`: when `true`, all steps are executed but no row is appended.
- `optimize`: when `true`, the graph is additionally reduced with
  `optimize_to_fixpoint!` and the properties of the reduced graph are recorded.
  Otherwise the `*_reduced` columns are filled from the placeholder values set
  below.

Returns `nothing`.
"""
function bench_process(process::AbstractString; warmup = false, optimize = true)
    println("Benchmarking $process...")

    proc = parse_process(process, QEDModel())

    gen_bench = @benchmark gen_graph($proc) gcsample = true seconds = 5

    # a separate graph instance for the property measurements
    graph = gen_graph(proc)

    props = GraphProperties(graph)
    node_dict = countmap(typeof.(graph.nodes))
    graph_size = Base.summarysize(graph)

    # placeholders, used when no reduction is performed
    reduce_elapsed = -1.0
    node_dict_reduced = Dict()
    graph_size_reduced = -1.0
    props_reduced = GraphProperties()

    if optimize
        reduce_elapsed = @elapsed optimize_to_fixpoint!(ReductionOptimizer(), graph)

        # `graph` has been reduced in place; measure it again
        props_reduced = GraphProperties(graph)
        node_dict_reduced = countmap(typeof.(graph.nodes))
        graph_size_reduced = Base.summarysize(graph)
    end

    warmup && return nothing

    # number of nodes of type `T` in a count map; 0 if the type does not occur
    count_of(counts, T) = get(counts, T, 0)

    sample_times = gen_bench.times
    row = Dict(
        :process_name => process,
        :cpu_threads => Threads.nthreads(),
        :graph_gen_samples => length(sample_times),
        :graph_gen_mean => mean(sample_times),
        :graph_gen_std => std(sample_times),
        :graph_gen_median => median(sample_times),
        :graph_nodes => props.noNodes,
        :graph_data_nodes => count_of(node_dict, DataTaskNode{DataTask}),
        :graph_u_nodes => count_of(node_dict, ComputeTaskNode{ComputeTaskQED_U}),
        :graph_v_nodes => count_of(node_dict, ComputeTaskNode{ComputeTaskQED_V}),
        :graph_s1_nodes => count_of(node_dict, ComputeTaskNode{ComputeTaskQED_S1}),
        :graph_s2_nodes => count_of(node_dict, ComputeTaskNode{ComputeTaskQED_S2}),
        :graph_edges => props.noEdges,
        :graph_nodes_reduced => props_reduced.noNodes,
        :graph_data_nodes_reduced => count_of(node_dict_reduced, DataTaskNode{DataTask}),
        :graph_u_nodes_reduced =>
            count_of(node_dict_reduced, ComputeTaskNode{ComputeTaskQED_U}),
        :graph_v_nodes_reduced =>
            count_of(node_dict_reduced, ComputeTaskNode{ComputeTaskQED_V}),
        :graph_s1_nodes_reduced =>
            count_of(node_dict_reduced, ComputeTaskNode{ComputeTaskQED_S1}),
        :graph_s2_nodes_reduced =>
            count_of(node_dict_reduced, ComputeTaskNode{ComputeTaskQED_S2}),
        :graph_edges_reduced => props_reduced.noEdges,
        :graph_mem => graph_size,
        :graph_mem_reduced => graph_size_reduced,
        :graph_elapsed_reduce => reduce_elapsed,
    )
    push!(df, row)

    return nothing
end
|
||||
|
||||
# Processes to benchmark, each paired with a flag that is passed on as the
# `optimize` keyword of `bench_process`.
processes = [
    ("ke->ke", true),
    ("ke->kke", true),
    ("ke->kkke", true),
    ("ke->kkkke", true),
    ("ke->kkkkke", true),
    ("ke->kkkkkke", true),
    ("ke->kkkkkkke", true),
    #("ke->kkkkkkkke", false),
    #("ke->kkkkkkkkke", false),
]

# The empty `df` with the result schema is already created near the top of
# this script and nothing has been appended to it up to this point, so the
# identical second schema definition that used to stand here was redundant and
# has been removed.

# if they exist, read existing results and append new ones
if isfile(results_filename)
    df = CSV.read(results_filename, DataFrame)
end

# warm-up run; no row is recorded for it
bench_process("ke->kke", warmup = true)

for (process, opt) in processes
    bench_process(process, optimize = opt)
    # write the table after every process
    CSV.write(results_filename, df)
end
|
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user