
using MetagraphOptimization
using LIKWID
using CUDA
using UUIDs
using DataFrames
using CSV
using Random
using BenchmarkTools
using Dates
DISABLE_GPU = false
# timestamped logging helper (note: shadows Base.log within this script)
function log(x...)
    println(now(), " ", join(x, " "))
    return flush(stdout)
end
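# Usage sketch (assumed invocation, not part of the original script): run with the
# desired number of Julia threads, e.g.
#   julia --project --threads=32 <this script>
# The thread count determines the CPU benchmark parallelism and the results file name below.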
results_filename = "bench_results_$(Threads.nthreads()).csv"

df = DataFrame(
    process_name = String[],
    graph_gen_time = Float64[],
    optimization_time = Float64[],
    function_generation_time = Float64[],
    graph_nodes = Int[],
    graph_edges = Int[],
    graph_mem = Float64[],
    cpu_threads = Int[],
    n_inputs = Int[],
    nflops_likwid = Int[],
    cpu_time = Float64[],
    cpu_std = Float64[],
    cpu_rate = Float64[],
    cpu_gflops = Float64[],
    gpu_name = String[],
    gpu_time = Float64[],
    gpu_std = Float64[],
    gpu_rate = Float64[],
    gpu_gflops = Float64[],
)
# if a results file already exists, read it so new results are appended to it
if isfile(results_filename)
    df = CSV.read(results_filename, DataFrame)
end

# number of process inputs generated and evaluated per benchmark
nInputs = 2^20
# benchmark the compiled compute function on the CPU, evaluating all inputs
# in parallel across the available Julia threads
function cpu_bench(compute_function, inputs)
    bench = @benchmark begin
        @inbounds Threads.@threads for i in eachindex($inputs)
            @invokelatest $compute_function($inputs[i])
        end
    end gcsample = true samples = 20 evals = 1

    # median time and standard deviation in seconds, throughput in inputs per second
    time = median(bench.times) / 1e9
    s = std(bench.times) / 1e9
    rate = length(inputs) / time
    return (time, rate, s)
end
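# Hypothetical usage of cpu_bench (names are illustrative only):
#   t, rate, sd = cpu_bench(compute_func, inputs)   # seconds, inputs/s, seconds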
# benchmark the CUDA kernel on the GPU over all inputs at once
function gpu_bench(kernel!, inputs)
    n = length(inputs)
    outputs = CuVector{ComplexF64}(undef, n)

    ts = 32           # threads per block
    bs = Int(n / ts)  # number of blocks; errors if n is not a multiple of ts
    bench = @benchmark begin
        @cuda threads = $ts blocks = $bs always_inline = true $kernel!($inputs, $outputs, $n)
        CUDA.device_synchronize()
    end gcsample = true samples = 20 evals = 1

    time = median(bench.times) / 1e9
    s = std(bench.times) / 1e9
    rate = length(inputs) / time
    return (time, rate, s)
end
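# Note: gpu_bench assumes the number of inputs is a multiple of the block size (32)
# and that the generated kernel writes one ComplexF64 result per input. A hypothetical
# call mirrors cpu_bench:
#   t, rate, sd = gpu_bench(kernel!, CuArray(inputs))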
# run the full CPU (and optionally GPU) benchmark for one process/graph and
# record the results in the global DataFrame
function bench_process(
    process::MetagraphOptimization.AbstractProcessDescription,
    process_name::String,
    graph::DAG,
    func,
    kernel!,
    gen_time::Float64,
    opt_time::Float64,
    func_time::Float64;
    use_likwid = false,
    use_gpu = true,
)
    log("\n--- Benchmarking $(process_name) ---")
    if DISABLE_GPU
        use_gpu = false
    end
    graph_props = GraphProperties(graph)
    NFLOPs = graph_props.computeEffort
    nflops_likwid = 0
    if use_likwid
        # measure the actual FLOP count with LIKWID instead of relying on the
        # graph's estimated compute effort
        input = gen_process_input(process)
        func(input) # compile first

        # suppress LIKWID's console output during the measurement
        oldstd = stdout
        redirect_stdout(devnull)
        _, events = @perfmon "FLOPS_DP" func(input)
        redirect_stdout(oldstd) # restore the original stdout

        NFLOPs = first(events["FLOPS_DP"])["RETIRED_SSE_AVX_FLOPS_ALL"]
        nflops_likwid = NFLOPs
    end
log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
inputs = Vector{typeof(gen_process_input(process))}()
resize!(inputs, nInputs)
processes = Vector{typeof(process)}()
for i in 1:Threads.nthreads()
push!(processes, copy(process))
end
@inbounds Threads.@threads for i in eachindex(inputs)
inputs[i] = gen_process_input(processes[Threads.nthreads()])
end
log("Benchmarking CPU with $(Threads.nthreads()) threads...")
(time_cpu, rate_cpu, std_cpu) = cpu_bench(func, inputs)
flops_cpu = (rate_cpu * NFLOPs) / 10^9
time_gpu = 0.0
std_gpu = 0.0
rate_gpu = 0.0
flops_gpu = 0.0
gpu_name = "none"
if use_gpu
log("Benchmarking GPU...")
gpu_name = "$(name(first(CUDA.devices())))"
cuInputs = CuArray(inputs)
(time_gpu, rate_gpu, std_gpu) = gpu_bench(kernel!, cuInputs)
flops_gpu = (rate_gpu * NFLOPs) / 10^9
else
log("Skipping GPU...")
end
log("\nBenchmark Summary for $(process):")
if use_likwid
log("Measured FLOPS by LIKWID: $NFLOPs")
else
log("Total graph compute effort: $NFLOPs")
end
log("Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
log("CPU, $(Threads.nthreads()) threads")
log(" Time: $time_cpu")
log(" Rate: $rate_cpu")
log(" GFLOPS: $flops_cpu")
if use_gpu
log("GPU, $gpu_name")
log(" Time: $time_gpu")
log(" Rate: $rate_gpu")
log(" GFLOPS: $flops_gpu")
end
    # store everything except the warmup run in the results table
    if (process_name != "warmup")
        push!(
            df,
            Dict(
                :process_name => process_name,
                :graph_gen_time => gen_time,
                :optimization_time => opt_time,
                :function_generation_time => func_time,
                :graph_nodes => graph_props.noNodes,
                :graph_edges => graph_props.noEdges,
                :graph_mem => MetagraphOptimization.mem(graph),
                :cpu_threads => Threads.nthreads(),
                :n_inputs => nInputs,
                :nflops_likwid => nflops_likwid,
                :cpu_time => time_cpu,
                :cpu_std => std_cpu,
                :cpu_rate => rate_cpu,
                :cpu_gflops => flops_cpu,
                :gpu_name => gpu_name,
                :gpu_time => time_gpu,
                :gpu_std => std_gpu,
                :gpu_rate => rate_gpu,
                :gpu_gflops => flops_gpu,
            ),
        )
    end
    return nothing
end
# use "mock" machine that only uses cpu
machine = Machine(
[
MetagraphOptimization.NumaNode(
0,
1,
MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
-1.0,
UUIDs.uuid1(),
),
],
[-1.0;;],
)
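# Note: the mock machine above only affects code generation; the GPU benchmark in
# bench_process still targets the first device reported by CUDA.devices().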
# Unfortunately, the following cannot be wrapped in functions: the world age must
# advance after each generated function is created, which only happens in global scope.
## -- WARMUP TO COMPILE FUNCTIONS first
#=
optimizer = RandomWalkOptimizer(MersenneTwister(0))
# 2-photon compton
process = parse_process("ke->kke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
opt_time = @elapsed optimize!(optimizer, graph, 200)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
optimizer = ReductionOptimizer()
# AB->AB^3
process = parse_process("AB->ABBB", ABCModel())
gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
=#
## -- WARMUP END
optimizer = ReductionOptimizer()

# Compton processes with an increasing number of photons
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]

for process_str in processes
    # benchmark the unoptimized graph, then the reduced graph
    process = parse_process(process_str, QEDModel())
    gen_time = @elapsed graph = gen_graph(process)
    func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
    kernel! = get_cuda_kernel(graph, process, machine)
    bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)

    opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
    func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
    kernel! = get_cuda_kernel(graph, process, machine)
    bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)

    CSV.write(results_filename, df)
end
processes = ["AB->AB", "AB->ABBB", "AB->ABBBBB", "AB->ABBBBBBB"]
for process_str in processes
# AB->AB
process = parse_process(process_str, ABCModel())
gen_time = @elapsed graph = parse_dag("input/$(process_str).txt", ABCModel())
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
CSV.write(results_filename, df)
end
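# Optional post-processing sketch (an assumption, not part of the benchmark itself):
# reload the accumulated CSV and rank the processes by CPU throughput.
#   results = CSV.read(results_filename, DataFrame)
#   sort(results, :cpu_gflops, rev = true)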