Add evaluation script, run script, csv data and first plots

2024-02-06 09:35:04 +01:00
parent 7098d1801a
commit 3ac9954d32
42 changed files with 6725 additions and 36 deletions
--- a/examples/qed_bench.jl
+++ b/examples/qed_bench.jl
@ -6,7 +6,7 @@ using DataFrames
 using CSV
 using Random

-DISABLE_GPU = false
+DISABLE_GPU = true

 results_filename = "results.csv"

@ -261,29 +261,6 @@ bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_ti
 # 6-photon compton
 process = parse_process("ke->kkkkkke", QEDModel())
 gen_time = @elapsed graph = gen_graph(process)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process no optimization", graph, compute_func, gen_time, 0.0, func_gen_time, use_gpu = false)
-
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time, use_gpu = false)
-
-# 7-photon compton
-process = parse_process("ke->kkkkkkke", QEDModel())
-gen_time = @elapsed graph = gen_graph(process)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process no optimization", graph, compute_func, gen_time, 0.0, func_gen_time, use_gpu = false)
-
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time, use_gpu = false)
-
-# 8-photon compton
-process = parse_process("ke->kkkkkkkke", QEDModel())
-gen_time = @elapsed graph = gen_graph(process)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process no optimization", graph, compute_func, gen_time, 0.0, func_gen_time, use_gpu = false)
-
 opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
 bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time, use_gpu = false)
@ -318,14 +295,4 @@ opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
 bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time, use_gpu = false)

-# AB->AB^7
-process = parse_process("AB->ABBBBBBB", ABCModel())
-gen_time = @elapsed graph = parse_dag("input/AB->ABBBBBBB.txt", ABCModel())
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process no optimization", graph, compute_func, gen_time, 0.0, func_gen_time, use_gpu = false)
-
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time, use_gpu = false)
-
 CSV.write(results_filename, df)
--- a/examples/qed_bench_tape.jl
+++ b/examples/qed_bench_tape.jl
@ -0,0 +1,209 @@
+using MetagraphOptimization
+using LIKWID
+using UUIDs
+using DataFrames
+using CSV
+using Random
+
+# File the benchmark results are read from (if present) and written to.
+results_filename = "results.csv"
+
+# Result table: when a results file already exists, continue from its contents
+# so that new rows are appended; otherwise start with an empty table holding
+# one typed column per recorded quantity.
+df = if isfile(results_filename)
+    CSV.read(results_filename, DataFrame)
+else
+    DataFrame(
+        # identification and graph/tape preparation timings
+        process_name = String[],
+        graph_gen_time = Float64[],
+        optimization_time = Float64[],
+        function_generation_time = Float64[],
+        # graph statistics
+        graph_nodes = Int[],
+        graph_edges = Int[],
+        graph_mem = Float64[],
+        # CPU measurements
+        cpu_threads = Int[],
+        n_inputs = Int[],
+        nflops_likwid = Int[],
+        cpu_time = Float64[],
+        cpu_rate = Float64[],
+        cpu_gflops = Float64[],
+        # GPU measurements
+        gpu_name = String[],
+        gpu_time = Float64[],
+        gpu_rate = Float64[],
+        gpu_gflops = Float64[],
+    )
+end
+
+# Number of inputs generated and executed per benchmark.
+nInputs = 100_000
+
+
+# "Mock" machine description consisting of a single CPU (NUMA) node only.
+# NOTE(review): the meaning of the numeric constructor arguments (0, 1, -1.0)
+# is defined by MetagraphOptimization and is not visible here -- confirm there.
+machine = Machine(
+    [
+        MetagraphOptimization.NumaNode(
+            0,
+            1,
+            MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
+            -1.0,
+            UUIDs.uuid1(),
+        ),
+    ],
+    [-1.0;;],  # 1x1 matrix
+)
+
+
+"""
+    cpu_bench(tape, inputs)
+
+Call `execute_tape(tape, input)` once for every element of `inputs`, with the
+iterations distributed over the available threads.
+
+Return a tuple `(time, rate)`: the elapsed wall time in seconds and the number
+of processed inputs per second.
+"""
+function cpu_bench(tape, inputs)
+    elapsed = @elapsed begin
+        Threads.@threads for idx in eachindex(inputs)
+            execute_tape(tape, inputs[idx])
+        end
+    end
+    throughput = length(inputs) / elapsed
+    return (elapsed, throughput)
+end
+
+"""
+    bench_process(process, process_name, graph, gen_time, opt_time, io = stdout; use_likwid = true)
+
+Generate a tape for `graph`, benchmark its execution on the CPU and record the
+result.
+
+The function reads the script-level globals `machine` (passed to `gen_tape`)
+and `nInputs` (number of inputs to generate), and appends one row to the
+global result table `df` unless `process_name == "warmup"`.
+
+# Arguments
+- `process`: process description the graph was generated for.
+- `process_name`: label stored in the `process_name` column; the special value
+  `"warmup"` suppresses recording.
+- `graph`: the graph to benchmark.
+- `gen_time`: graph generation time, stored as given.
+- `opt_time`: optimization time, stored as given.
+- `io`: stream that receives the progress and summary output.
+
+# Keywords
+- `use_likwid`: when `true`, the FLOP count of one tape execution is measured
+  with LIKWID and used for the GFLOPS figure; otherwise the graph's
+  `computeEffort` is used and `nflops_likwid` is recorded as `0`.
+
+Returns `nothing`.
+"""
+function bench_process(
+    process::MetagraphOptimization.AbstractProcessDescription,
+    process_name::String,
+    graph::DAG,
+    gen_time::Float64,
+    opt_time::Float64,
+    io::IO = stdout;
+    use_likwid = true,
+)
+    println(io, "\n--- Benchmarking $(process_name) ---")
+
+    func_time = @elapsed tape = gen_tape(graph, process, machine)
+
+    graph_props = GraphProperties(graph)
+    NFLOPs = graph_props.computeEffort
+    nflops_likwid = 0
+    if use_likwid
+        input = gen_process_input(process)
+
+        # Silence the console output produced during the measurement. The
+        # original stdout is restored in `finally`, so a failing measurement
+        # cannot leave the rest of the script without console output.
+        local events
+        oldstd = stdout
+        redirect_stdout(devnull)
+        try
+            _, events = @perfmon "FLOPS_DP" execute_tape(tape, input)
+        finally
+            redirect_stdout(oldstd)
+        end
+
+        NFLOPs = first(events["FLOPS_DP"])["RETIRED_SSE_AVX_FLOPS_ALL"]
+        nflops_likwid = NFLOPs
+    end
+
+    println(io, "Generating $nInputs inputs with $(Threads.nthreads()) threads...")
+
+    inputs = Vector{typeof(gen_process_input(process))}(undef, nInputs)
+
+    # Split the index range into (at most) one contiguous chunk per thread and
+    # give every chunk its own copy of the process, so that no process object
+    # is shared between concurrently running tasks. Indexing a per-thread
+    # array by thread id is avoided on purpose: tasks may migrate between
+    # threads and thread ids are not guaranteed to lie in 1:nthreads().
+    chunk_size = max(1, cld(length(inputs), Threads.nthreads()))
+    @sync for chunk in Iterators.partition(eachindex(inputs), chunk_size)
+        Threads.@spawn begin
+            local_process = copy(process)
+            for i in chunk
+                inputs[i] = gen_process_input(local_process)
+            end
+        end
+    end
+
+    println(io, "Benchmarking CPU with $(Threads.nthreads()) threads...")
+    (time_cpu, rate_cpu) = cpu_bench(tape, inputs)
+    flops_cpu = (rate_cpu * NFLOPs) / 10^9
+
+    println(io, "\nBenchmark Summary for $(process):")
+
+    if use_likwid
+        println(io, "Measured FLOPS by LIKWID: $NFLOPs")
+    else
+        println(io, "Total graph compute effort: $NFLOPs")
+    end
+    println(io, "Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
+    println(io, "CPU, $(Threads.nthreads()) threads")
+    println(io, "  Time:  $time_cpu")
+    println(io, "  Rate:  $rate_cpu")
+    println(io, "  GFLOPS: $flops_cpu")
+
+    # Warmup runs only exist to trigger compilation and are not recorded.
+    if process_name != "warmup"
+        push!(
+            df,
+            Dict(
+                :process_name => process_name,
+                :graph_gen_time => gen_time,
+                :optimization_time => opt_time,
+                :function_generation_time => func_time,
+                :graph_nodes => graph_props.noNodes,
+                :graph_edges => graph_props.noEdges,
+                :graph_mem => MetagraphOptimization.mem(graph),
+                :cpu_threads => Threads.nthreads(),
+                :n_inputs => nInputs,
+                :nflops_likwid => nflops_likwid,
+                :cpu_time => time_cpu,
+                :cpu_rate => rate_cpu,
+                :cpu_gflops => flops_cpu,
+                # this script benchmarks the CPU only; GPU columns are filled
+                # with placeholders to keep the table schema intact
+                :gpu_name => "none",
+                :gpu_time => 0.0,
+                :gpu_rate => 0.0,
+                :gpu_gflops => 0.0,
+            ),
+        )
+    end
+
+    return nothing
+end
+
+"""
+    bench_qed(process_string, skip_unoptimized = false)
+
+Parse `process_string` as a `QEDModel` process, generate its graph and
+benchmark it with `bench_process`: first as generated (unless
+`skip_unoptimized` is `true`), then again after optimizing the graph to a
+fixpoint with a `ReductionOptimizer`.
+
+Returns `nothing`.
+"""
+function bench_qed(process_string::String, skip_unoptimized = false)
+    process = parse_process(process_string, QEDModel())
+    gen_time = @elapsed graph = gen_graph(process)
+
+    # Baseline measurement on the untouched graph; no optimization time yet.
+    if !skip_unoptimized
+        bench_process(process, "$process not optimized tape", graph, gen_time, 0.0)
+    end
+
+    # The graph is optimized in place, then measured a second time.
+    reducer = ReductionOptimizer()
+    opt_time = @elapsed optimize_to_fixpoint!(reducer, graph)
+    bench_process(process, "$process reduced tape", graph, gen_time, opt_time)
+
+    return nothing
+end
+
+"""
+    bench_abc(process_string)
+
+Parse `process_string` as an `ABCModel` process, load its graph from
+`input/<process_string>.txt` (relative to the current working directory) and
+benchmark it with `bench_process`: first as loaded, then again after
+optimizing the graph to a fixpoint with a `ReductionOptimizer`.
+
+Returns `nothing`.
+"""
+function bench_abc(process_string::String)
+    process = parse_process(process_string, ABCModel())
+
+    # For the ABC model the "generation" time is the time to parse the graph file.
+    gen_time = @elapsed graph = parse_dag("input/$process_string.txt", ABCModel())
+    bench_process(process, "$process not optimized tape", graph, gen_time, 0.0)
+
+    # The graph is optimized in place, then measured a second time.
+    reducer = ReductionOptimizer()
+    opt_time = @elapsed optimize_to_fixpoint!(reducer, graph)
+    bench_process(process, "$process reduced tape", graph, gen_time, opt_time)
+
+    return nothing
+end
+
+# Unfortunately the warmup statements below cannot be moved into a function:
+# the world age must increase after the function is created, which only
+# happens in the global scope.
+
+## -- WARMUP: run one small process per model first so everything is compiled
+optimizer = ReductionOptimizer()
+
+# QED warmup; rows named "warmup" are not recorded by bench_process
+process = parse_process("ke->kke", QEDModel())
+gen_time = @elapsed graph = gen_graph(process)
+opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
+bench_process(process, "warmup", graph, gen_time, opt_time)
+
+# ABC warmup: AB->AB^3
+process = parse_process("AB->ABBB", ABCModel())
+gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
+opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
+bench_process(process, "warmup", graph, gen_time, opt_time)
+
+## -- WARMUP END
+
+# Compton scattering with 1 to 6 outgoing photons, unoptimized and reduced
+for compton in ("ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke", "ke->kkkkkke")
+    bench_qed(compton)
+end
+# 7 outgoing photons: only the reduced graph is benchmarked
+bench_qed("ke->kkkkkkke", true)
+
+# ABC model processes
+for abc in ("AB->AB", "AB->ABBB", "AB->ABBBBB")
+    bench_abc(abc)
+end
+
+# Persist all collected rows (including any read from an existing file)
+CSV.write(results_filename, df)