heterogeneity (#27)
Prepare things to work with heterogeneity, make things work on GPU Reviewed-on: Rubydragon/MetagraphOptimization.jl#27 Co-authored-by: Anton Reinhard <anton.reinhard@proton.me> Co-committed-by: Anton Reinhard <anton.reinhard@proton.me>
This commit is contained in:
@@ -1,3 +1,9 @@
|
||||
[deps]
|
||||
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
|
||||
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
|
||||
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
|
||||
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
|
||||
MetagraphOptimization = "3e869610-d48d-4942-ba70-c1b702a33ca4"
|
||||
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
|
||||
QEDprocesses = "46de9c38-1bb3-4547-a1ec-da24d767fdad"
|
||||
StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
|
||||
|
148
examples/qed_bench.jl
Normal file
148
examples/qed_bench.jl
Normal file
@@ -0,0 +1,148 @@
|
||||
using MetagraphOptimization
|
||||
using LIKWID
|
||||
using CUDA
|
||||
using UUIDs
|
||||
|
||||
function cpu_bench(compute_function, inputs)
|
||||
compute_function.(inputs[begin:10]) # make sure it's compiled
|
||||
|
||||
time = @elapsed Threads.@threads for i in eachindex(inputs)
|
||||
@invokelatest compute_function(inputs[i])
|
||||
end
|
||||
rate = length(inputs) / time
|
||||
return (time, rate)
|
||||
end
|
||||
|
||||
function gpu_bench(compute_function, inputs)
|
||||
CUDA.@sync compute_function.(inputs[begin:10]) # make sure it's compiled
|
||||
|
||||
time = @elapsed CUDA.@sync compute_function.(inputs)
|
||||
rate = length(inputs) / time
|
||||
|
||||
return (time, rate)
|
||||
end
|
||||
|
||||
function bench_process(
|
||||
process::MetagraphOptimization.AbstractProcessDescription,
|
||||
func,
|
||||
io::IO = stdout;
|
||||
use_likwid = true,
|
||||
)
|
||||
println(io, "\n--- Benchmarking $(process) ---")
|
||||
|
||||
NFLOPs = GraphProperties(graph).computeEffort
|
||||
if use_likwid
|
||||
input = gen_process_input(process)
|
||||
func(input) # compile first
|
||||
_, events = @perfmon "FLOPS_DP" func(input)
|
||||
NFLOPs = first(events["FLOPS_DP"])["RETIRED_SSE_AVX_FLOPS_ALL"]
|
||||
end
|
||||
|
||||
nInputs = 10000000 # ten million
|
||||
println(io, "Generating $nInputs inputs with $(Threads.nthreads()) threads...")
|
||||
|
||||
inputs = Vector{typeof(gen_process_input(process))}()
|
||||
resize!(inputs, nInputs)
|
||||
processes = Vector{typeof(process)}()
|
||||
for i in 1:Threads.nthreads()
|
||||
push!(processes, copy(process))
|
||||
end
|
||||
|
||||
Threads.@threads for i in eachindex(inputs)
|
||||
inputs[i] = gen_process_input(processes[Threads.nthreads()])
|
||||
end
|
||||
|
||||
println(io, "Benchmarking CPU with $(Threads.nthreads()) threads...")
|
||||
(time_cpu, rate_cpu) = cpu_bench(func, inputs)
|
||||
flops_cpu = (rate_cpu * NFLOPs) / 1024^3
|
||||
|
||||
println(io, "Benchmarking GPU...")
|
||||
cuInputs = CuArray(inputs)
|
||||
(time_gpu, rate_gpu) = gpu_bench(func, cuInputs)
|
||||
flops_gpu = (rate_gpu * NFLOPs) / 1024^3
|
||||
|
||||
println(io, "\nBenchmark Summary for $(process):")
|
||||
|
||||
if use_likwid
|
||||
println(io, "Measured FLOPS by LIKWID: $NFLOPs")
|
||||
else
|
||||
println(io, "Total graph compute effort: $NFLOPs")
|
||||
end
|
||||
println(io, "Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
|
||||
println(io, "CPU, $(Threads.nthreads()) threads")
|
||||
println(io, " Time: $time_cpu")
|
||||
println(io, " Rate: $rate_cpu")
|
||||
println(io, " GFLOPS: $flops_cpu")
|
||||
println(io, "GPU, $(name(first(CUDA.devices())))")
|
||||
println(io, " Time: $time_gpu")
|
||||
println(io, " Rate: $rate_gpu")
|
||||
return println(io, " GFLOPS: $flops_gpu")
|
||||
end
|
||||
|
||||
# use "mock" machine that only uses cpu
|
||||
machine = Machine(
|
||||
[
|
||||
MetagraphOptimization.NumaNode(
|
||||
0,
|
||||
1,
|
||||
MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
|
||||
-1.0,
|
||||
UUIDs.uuid1(),
|
||||
),
|
||||
],
|
||||
[-1.0;;],
|
||||
)
|
||||
optimizer = ReductionOptimizer()
|
||||
|
||||
# sadly cannot put these in functions because the world age must increase after the function is created which happens only in the global scope
|
||||
|
||||
# compton
|
||||
process = parse_process("ke->ke", QEDModel())
|
||||
graph = gen_graph(process)
|
||||
optimize_to_fixpoint!(optimizer, graph)
|
||||
compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, compute_func)
|
||||
|
||||
# 2-photon compton
|
||||
process = parse_process("ke->kke", QEDModel())
|
||||
graph = gen_graph(process)
|
||||
optimize_to_fixpoint!(optimizer, graph)
|
||||
compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, compute_func)
|
||||
|
||||
# 3-photon compton
|
||||
process = parse_process("ke->kkke", QEDModel())
|
||||
graph = gen_graph(process)
|
||||
optimize_to_fixpoint!(optimizer, graph)
|
||||
compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, compute_func)
|
||||
|
||||
# AB->AB
|
||||
process = parse_process("AB->AB", ABCModel())
|
||||
graph = parse_dag("input/AB->AB.txt", ABCModel())
|
||||
optimize_to_fixpoint!(optimizer, graph)
|
||||
compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, compute_func)
|
||||
|
||||
# AB->AB^3
|
||||
process = parse_process("AB->ABBB", ABCModel())
|
||||
graph = parse_dag("input/AB->ABBB.txt", ABCModel())
|
||||
optimize_to_fixpoint!(optimizer, graph)
|
||||
compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, compute_func)
|
||||
|
||||
exit(0)
|
||||
|
||||
# 4-photon compton
|
||||
process = parse_process("ke->kkkke", QEDModel())
|
||||
graph = gen_graph(process)
|
||||
optimize_to_fixpoint!(optimizer, graph)
|
||||
compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, compute_func)
|
||||
|
||||
# AB->AB^5
|
||||
process = parse_process("AB->ABBBBB", ABCModel())
|
||||
graph = parse_dag("input/AB->ABBBBB.txt", ABCModel())
|
||||
optimize_to_fixpoint!(optimizer, graph)
|
||||
compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, compute_func)
|
542
examples/reduction.ipynb
Normal file
542
examples/reduction.ipynb
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user