Add reduction benchmarks
@@ -5,5 +5,6 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 MetagraphOptimization = "3e869610-d48d-4942-ba70-c1b702a33ca4"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 QEDbase = "10e22c08-3ccb-4172-bfcf-7d7aa3d04d93"
 QEDprocesses = "46de9c38-1bb3-4547-a1ec-da24d767fdad"
 StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
examples/qed_bench_reduction_steps.jl (new file, 115 lines)
@@ -0,0 +1,115 @@
using MetagraphOptimization
using CUDA
using UUIDs
using BenchmarkTools, Statistics # Statistics provides mean() for the benchmark trial times
using DataFrames
using CSV

results_filename = "bench_results_reduction_steps.csv"

df = DataFrame(
    threads = Int[],
    process = String[],
    operations = Int[],
    cumulative_optimization_time = Float64[],
    graph_nodes = Int[],
    graph_edges = Int[],
    graph_ce = Float64[],
    graph_dt = Float64[],
    graph_ci = Float64[],
    gen_func_t = Float64[],
    cpu_compile_t = Float64[],
    cpu_st_t = Float64[],
    cpu_mt_t = Float64[],
    gpu_compile_t = Float64[],
    gpu_t = Float64[],
)

# if a previous results file exists, read it so the new results are appended to it
if isfile(results_filename)
    df = CSV.read(results_filename, DataFrame)
end

function bench(func, inputs) # CPU-only benchmark; the GPU columns are filled with zeros
    compile_time = @elapsed func(inputs[1]) # time of the first call, dominated by compilation

    single_thread = @benchmark $func.($inputs)
    multi_threaded = @benchmark Threads.@threads for i in eachindex($inputs)
        $func($inputs[i])
    end

    return (
        cpu_compile_time = compile_time,
        gpu_compile_time = 0.0,
        cpu_single_thread_time = mean(single_thread.times) / 1e9, # ns -> s
        cpu_multi_thread_time = mean(multi_threaded.times) / 1e9, # ns -> s
        gpu_time = 0.0,
    )
end


# bench and produce data
n_inputs = 10_000
optimizer = ReductionOptimizer()
processes = [("ke->kke", 50), ("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 5), ("ke->kkkke", 5), ("ke->kkkkke", 10), ("ke->kkkkkke", 20)] # (process string, operations applied per optimization step)

for (process_str, STEPSIZE) in processes
    n = 0
    opt_time_cum = 0.0

    # preparation of graph
    machine = Machine(
        [
            MetagraphOptimization.NumaNode(
                0,
                1,
                MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
                -1.0,
                UUIDs.uuid1(),
            ),
        ],
        [-1.0;;],
    )
    process = parse_process(process_str, QEDModel())
    graph = gen_graph(process)
    inputs = [gen_process_input(process) for _ in 1:n_inputs]

    get_compute_function(graph, process, machine) # warm up code generation so the timed call below is not skewed by first-call compilation

    while true
        func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
        res = bench(func, inputs)

        graph_properties = get_properties(graph)
        push!(
            df,
            (
                Threads.nthreads(),
                process_str,
                n,
                opt_time_cum,
                graph_properties.noNodes,
                graph_properties.noEdges,
                graph_properties.computeEffort,
                graph_properties.data,
                graph_properties.computeIntensity,
                func_gen_time,
                res.cpu_compile_time,
                res.cpu_single_thread_time,
                res.cpu_multi_thread_time,
                res.gpu_compile_time,
                res.gpu_time,
            ),
        )
        CSV.write(results_filename, df)

        if fixpoint_reached(optimizer, graph) # stop once no further reductions can be applied
            break
        end

        opt_time_cum += @elapsed optimize!(optimizer, graph, STEPSIZE)
        n += STEPSIZE
    end
end

CSV.write(results_filename, df)
examples/qed_bench_reduction_steps_gpu.jl (new file, 118 lines)
@@ -0,0 +1,118 @@
using MetagraphOptimization
using CUDA
using UUIDs, Dates # Dates provides now() for the log() timestamps
using BenchmarkTools, Statistics # Statistics provides mean() for the benchmark trial times
using DataFrames
using CSV

results_filename = "bench_results_reduction_steps_gpu.csv"

df = DataFrame(
    threads = Int[],
    process = String[],
    operations = Int[],
    cumulative_optimization_time = Float64[],
    graph_nodes = Int[],
    graph_edges = Int[],
    graph_ce = Float64[],
    graph_dt = Float64[],
    graph_ci = Float64[],
    gen_func_t = Float64[],
    cpu_compile_t = Float64[],
    cpu_st_t = Float64[],
    cpu_mt_t = Float64[],
    gpu_compile_t = Float64[],
    gpu_t = Float64[],
)

# if a previous results file exists, read it so the new results are appended to it
if isfile(results_filename)
    df = CSV.read(results_filename, DataFrame)
end

function log(x...) # simple timestamped logger (shadows Base.log in this script)
    println(now(), " ", join(x, " "))
    return flush(stdout)
end

function bench(func, inputs) # GPU-only benchmark; the CPU columns are filled with zeros
    gpu_compile_time = @elapsed func.(inputs[1:2]) # first broadcast triggers compilation

    gpu_time = @benchmark $func.($inputs)

    return (
        cpu_compile_time = 0.0,
        gpu_compile_time = gpu_compile_time,
        cpu_single_thread_time = 0.0,
        cpu_multi_thread_time = 0.0,
        gpu_time = mean(gpu_time.times) / 1e9, # ns -> s
    )
end

log("CUDA devices: $(CUDA.devices())")

# bench and produce data
n_inputs = 10_000
optimizer = ReductionOptimizer()
processes = [("ke->kke", 50), ("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1), ("ke->kkkke", 5)] # (process string, operations applied per optimization step)

for (process_str, STEPSIZE) in processes
    n = 0
    opt_time_cum = 0.0

    # preparation of graph
    machine = Machine(
        [
            MetagraphOptimization.NumaNode(
                0,
                1,
                MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
                -1.0,
                UUIDs.uuid1(),
            ),
        ],
        [-1.0;;],
    )
    process = parse_process(process_str, QEDModel())
    graph = gen_graph(process)
    inputs = CuVector([gen_process_input(process) for _ in 1:n_inputs]) # move the generated inputs to the GPU

    get_compute_function(graph, process, machine) # warm up code generation so the timed call below is not skewed by first-call compilation

    while true
        func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
        res = bench(func, inputs)

        graph_properties = get_properties(graph)
        push!(
            df,
            (
                Threads.nthreads(),
                process_str,
                n,
                opt_time_cum,
                graph_properties.noNodes,
                graph_properties.noEdges,
                graph_properties.computeEffort,
                graph_properties.data,
                graph_properties.computeIntensity,
                func_gen_time,
                res.cpu_compile_time,
                res.cpu_single_thread_time,
                res.cpu_multi_thread_time,
                res.gpu_compile_time,
                res.gpu_time,
            ),
        )
        CSV.write(results_filename, df)

        if fixpoint_reached(optimizer, graph) # stop once no further reductions can be applied
            break
        end

        opt_time_cum += @elapsed optimize!(optimizer, graph, STEPSIZE)
        n += STEPSIZE
    end
end

CSV.write(results_filename, df)
File diff suppressed because one or more lines are too long
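Not part of this commit: a minimal post-processing sketch for the CSV files these scripts write. It assumes the column names defined in the scripts above and uses the Plots dependency already listed in the project file; the output filename and styling are illustrative only.

# Illustrative only: read the CPU benchmark results and plot single-threaded
# execution time against the number of applied reduction operations,
# one curve per process string.
using CSV
using DataFrames
using Plots

results = CSV.read("bench_results_reduction_steps.csv", DataFrame)

p = plot(; xlabel = "applied operations", ylabel = "CPU single-thread time [s]", legend = :topright)
for sub in groupby(results, :process)
    plot!(p, sub.operations, sub.cpu_st_t; label = first(sub.process))
end
savefig(p, "reduction_bench_cpu.pdf")

The GPU results can be overlaid the same way by reading bench_results_reduction_steps_gpu.csv and plotting the gpu_t column instead.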