heterogeneity (#27)

Prepare things to work with heterogeneity, make things work on GPU Reviewed-on: Rubydragon/MetagraphOptimization.jl#27 Co-authored-by: Anton Reinhard <anton.reinhard@proton.me> Co-committed-by: Anton Reinhard <anton.reinhard@proton.me>
2023-12-18 14:31:52 +01:00
parent c90346e948
commit 92e0eeaaef
42 changed files with 1631 additions and 238 deletions
--- a/examples/Project.toml
+++ b/examples/Project.toml
@@ -1,3 +1,9 @@
 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 MetagraphOptimization = "3e869610-d48d-4942-ba70-c1b702a33ca4"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+QEDprocesses = "46de9c38-1bb3-4547-a1ec-da24d767fdad"
+StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
--- a/examples/qed_bench.jl
+++ b/examples/qed_bench.jl
@@ -0,0 +1,148 @@
+using MetagraphOptimization
+using LIKWID
+using CUDA
+using UUIDs
+
+"""
+    cpu_bench(compute_function, inputs)
+
+Benchmark `compute_function` on the CPU by calling it once for every element of `inputs`
+inside a `Threads.@threads` loop.
+
+Return the tuple `(time, rate)`: the elapsed time in seconds as measured by `@elapsed`,
+and the number of processed inputs per second.
+"""
+function cpu_bench(compute_function, inputs)
+    # Warm up on at most the first 10 inputs so that compilation is not part of the timing.
+    # The upper bound is clamped so that fewer than 10 inputs do not raise a `BoundsError`.
+    compute_function.(inputs[begin:min(begin + 9, end)])
+
+    elapsed = @elapsed Threads.@threads for i in eachindex(inputs)
+        @invokelatest compute_function(inputs[i])
+    end
+    rate = length(inputs) / elapsed
+    return (elapsed, rate)
+end
+
+"""
+    gpu_bench(compute_function, inputs)
+
+Benchmark `compute_function` by broadcasting it over `inputs` and waiting for completion
+with `CUDA.@sync`.
+
+Return the tuple `(time, rate)`: the elapsed time in seconds as measured by `@elapsed`,
+and the number of processed inputs per second.
+"""
+function gpu_bench(compute_function, inputs)
+    # Warm up on at most the first 10 inputs so that compilation is not part of the timing.
+    # The upper bound is clamped so that fewer than 10 inputs do not raise a `BoundsError`.
+    CUDA.@sync compute_function.(inputs[begin:min(begin + 9, end)])
+
+    elapsed = @elapsed CUDA.@sync compute_function.(inputs)
+    rate = length(inputs) / elapsed
+
+    return (elapsed, rate)
+end
+
+"""
+    bench_process(process, func, io::IO = stdout; use_likwid = true, n_inputs = 10_000_000)
+
+Benchmark the compute function `func` for `process` on the CPU (via `cpu_bench`) and on
+the GPU (via `gpu_bench`) and print a summary to `io`.
+
+# Arguments
+- `process`: process description; inputs are generated with `gen_process_input(process)`.
+- `func`: the function to benchmark; it is called with a single generated input.
+- `io`: stream that the report is written to.
+
+# Keywords
+- `use_likwid`: if `true`, the FLOP count of one call of `func` is measured with LIKWID's
+  `FLOPS_DP` group. Otherwise the `computeEffort` of the *global* variable `graph` is used.
+- `n_inputs`: number of inputs to generate and evaluate.
+
+Returns `nothing`.
+"""
+function bench_process(
+    process::MetagraphOptimization.AbstractProcessDescription,
+    func,
+    io::IO = stdout;
+    use_likwid = true,
+    n_inputs::Integer = 10_000_000,
+)
+    println(io, "\n--- Benchmarking $(process) ---")
+
+    if use_likwid
+        input = gen_process_input(process)
+        func(input) # compile first
+        _, events = @perfmon "FLOPS_DP" func(input)
+        NFLOPs = first(events["FLOPS_DP"])["RETIRED_SSE_AVX_FLOPS_ALL"]
+    else
+        # NOTE(review): `graph` is not a parameter of this function; this reads the global
+        # `graph` of the calling script, which must match `func`.
+        NFLOPs = GraphProperties(graph).computeEffort
+    end
+
+    println(io, "Generating $n_inputs inputs with $(Threads.nthreads()) threads...")
+
+    inputs = Vector{typeof(gen_process_input(process))}(undef, n_inputs)
+
+    # Split the indices into one contiguous chunk per thread and give every task its own
+    # copy of `process`. The previous code indexed the copies with `Threads.nthreads()`,
+    # so all threads shared the same (last) copy.
+    # Presumably `gen_process_input` must not share a process between threads -- TODO confirm.
+    chunk_len = max(1, cld(n_inputs, Threads.nthreads()))
+    @sync for idxs in Iterators.partition(eachindex(inputs), chunk_len)
+        local_process = copy(process)
+        Threads.@spawn for i in idxs
+            inputs[i] = gen_process_input(local_process)
+        end
+    end
+
+    println(io, "Benchmarking CPU with $(Threads.nthreads()) threads...")
+    (time_cpu, rate_cpu) = cpu_bench(func, inputs)
+    # GFLOPS = 10^9 floating point operations per second (SI prefix, not 1024^3)
+    flops_cpu = (rate_cpu * NFLOPs) / 1e9
+
+    println(io, "Benchmarking GPU...")
+    cuInputs = CuArray(inputs)
+    (time_gpu, rate_gpu) = gpu_bench(func, cuInputs)
+    flops_gpu = (rate_gpu * NFLOPs) / 1e9
+
+    println(io, "\nBenchmark Summary for $(process):")
+
+    if use_likwid
+        println(io, "Measured FLOPS by LIKWID: $NFLOPs")
+    else
+        println(io, "Total graph compute effort: $NFLOPs")
+    end
+    println(io, "Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
+    println(io, "CPU, $(Threads.nthreads()) threads")
+    println(io, "  Time:  $time_cpu")
+    println(io, "  Rate:  $rate_cpu")
+    println(io, "  GFLOPS: $flops_cpu")
+    println(io, "GPU, $(CUDA.name(first(CUDA.devices())))")
+    println(io, "  Time:  $time_gpu")
+    println(io, "  Rate:  $rate_gpu")
+    println(io, "  GFLOPS: $flops_gpu")
+    return nothing
+end
+
+# Describe a "mock" machine whose only device is a single CPU (NUMA node).
+machine = let node_type = MetagraphOptimization.NumaNode
+    cpu_node = node_type(
+        0,
+        1,
+        MetagraphOptimization.default_strategy(node_type),
+        -1.0,
+        UUIDs.uuid1(),
+    )
+    Machine([cpu_node], [-1.0;;])
+end
+optimizer = ReductionOptimizer()
+
+# NOTE: the steps below cannot be wrapped in a function. The compute function is created
+# at runtime, and it only becomes callable after the world age has increased, which
+# happens only in the global scope.
+#
+# Every section follows the same recipe: build the process and its graph, optimize the
+# graph to a fixpoint with `optimizer`, generate the compute function for `machine`, and
+# benchmark it. `bench_process` reads the global `graph`, so `graph` must be assigned
+# before each call.
+
+# Compton scattering (ke->ke), QED model
+process = parse_process("ke->ke", QEDModel())
+graph = gen_graph(process)
+optimize_to_fixpoint!(optimizer, graph)
+compute_func = get_compute_function(graph, process, machine)
+bench_process(process, compute_func)
+
+# 2-photon Compton (ke->kke), QED model
+process = parse_process("ke->kke", QEDModel())
+graph = gen_graph(process)
+optimize_to_fixpoint!(optimizer, graph)
+compute_func = get_compute_function(graph, process, machine)
+bench_process(process, compute_func)
+
+# 3-photon Compton (ke->kkke), QED model
+process = parse_process("ke->kkke", QEDModel())
+graph = gen_graph(process)
+optimize_to_fixpoint!(optimizer, graph)
+compute_func = get_compute_function(graph, process, machine)
+bench_process(process, compute_func)
+
+# AB->AB, ABC model; the graph is parsed from a file instead of being generated
+process = parse_process("AB->AB", ABCModel())
+graph = parse_dag("input/AB->AB.txt", ABCModel())
+optimize_to_fixpoint!(optimizer, graph)
+compute_func = get_compute_function(graph, process, machine)
+bench_process(process, compute_func)
+
+# AB->AB^3, ABC model; the graph is parsed from a file
+process = parse_process("AB->ABBB", ABCModel())
+graph = parse_dag("input/AB->ABBB.txt", ABCModel())
+optimize_to_fixpoint!(optimizer, graph)
+compute_func = get_compute_function(graph, process, machine)
+bench_process(process, compute_func)
+
+# The script terminates here: everything below this line is never executed.
+# NOTE(review): presumably the larger processes below are disabled on purpose (e.g. because
+# they are too expensive) -- confirm, and remove either the `exit` or the dead code.
+exit(0)
+
+# 4-photon Compton (ke->kkkke), QED model -- currently unreachable
+process = parse_process("ke->kkkke", QEDModel())
+graph = gen_graph(process)
+optimize_to_fixpoint!(optimizer, graph)
+compute_func = get_compute_function(graph, process, machine)
+bench_process(process, compute_func)
+
+# AB->AB^5, ABC model -- currently unreachable
+process = parse_process("AB->ABBBBB", ABCModel())
+graph = parse_dag("input/AB->ABBBBB.txt", ABCModel())
+optimize_to_fixpoint!(optimizer, graph)
+compute_func = get_compute_function(graph, process, machine)
+bench_process(process, compute_func)
--- a/examples/reduction.ipynb
+++ b/examples/reduction.ipynb