Prepare hemera execution benchmark

2024-02-13 15:54:09 +01:00
parent 4c05167901
commit 5be483c4c1
6 changed files with 160 additions and 86 deletions
--- a/examples/qed_bench.jl
+++ b/examples/qed_bench.jl
@@ -5,10 +5,17 @@ using UUIDs
 using DataFrames
 using CSV
 using Random
+using BenchmarkTools
+using Dates

 DISABLE_GPU = true

-results_filename = "results.csv"
+function log(x...)  # timestamped logger: prints `now()` followed by the arguments separated by spaces
+    println(now(), " ", join(x, " "))  # pass the joined String whole; splatting it handed println one Char per argument
+    return flush(stdout)  # flush stdout after every message
+end
+
+results_filename = "bench_results_$(Threads.nthreads()).csv"

 df = DataFrame(
    process_name = String[],
@@ -22,10 +29,12 @@ df = DataFrame(
    n_inputs = Int[],
    nflops_likwid = Int[],
    cpu_time = Float64[],
+    cpu_std = Float64[],
    cpu_rate = Float64[],
    cpu_gflops = Float64[],
    gpu_name = String[],
    gpu_time = Float64[],
+    gpu_std = Float64[],
    gpu_rate = Float64[],
    gpu_gflops = Float64[],
 )
@@ -35,25 +44,32 @@ if isfile(results_filename)
    df = CSV.read(results_filename, DataFrame)
 end

-nInputs = 100_000
+nInputs = 1_000

 function cpu_bench(compute_function, inputs)
-    compute_function.(inputs[begin:10]) # make sure it's compiled
+    bench = @benchmark begin  # each sample times one full threaded pass over all inputs
+        @inbounds Threads.@threads for i in eachindex($inputs)  # NOTE(review): @inbounds sits outside the @threads loop body — confirm it has any effect
+            @invokelatest $compute_function($inputs[i])  # `$` interpolates the values into the benchmark; the result is discarded
+        end
+    end gcsample = true seconds = 300  # BenchmarkTools options: run GC before each sample, 300 s time budget

-    time = @elapsed Threads.@threads for i in eachindex(inputs)
-        @invokelatest compute_function(inputs[i])
-    end
+    time = mean(bench.times) / 1e9  # mean sample time; BenchmarkTools records nanoseconds, so this is seconds
+    s = std(bench.times) / 1e9  # standard deviation of the sample times, in seconds
     rate = length(inputs) / time
-    return (time, rate)
+
+    return (time, rate, s)  # (mean seconds per pass, inputs per second, std in seconds)
 end

 function gpu_bench(compute_function, inputs)
-    CUDA.@sync compute_function.(inputs[begin:10])  # make sure it's compiled
+    bench = @benchmark begin  # each sample times one broadcast of the function over all inputs
+        CUDA.@sync $compute_function.($inputs)  # CUDA.@sync waits for the broadcast to finish so the sample covers the whole run
+    end gcsample = true seconds = 300  # BenchmarkTools options: run GC before each sample, 300 s time budget

-    time = @elapsed CUDA.@sync compute_function.(inputs)
+    time = mean(bench.times) / 1e9  # mean sample time; BenchmarkTools records nanoseconds, so this is seconds
+    s = std(bench.times) / 1e9  # standard deviation of the sample times, in seconds
     rate = length(inputs) / time

-    return (time, rate)
+    return (time, rate, s)  # (mean seconds per pass, inputs per second, std in seconds)
 end

 function bench_process(
@@ -63,12 +79,11 @@ function bench_process(
    func,
    gen_time::Float64,
    opt_time::Float64,
-    func_time::Float64,
-    io::IO = stdout;
+    func_time::Float64;
    use_likwid = true,
    use_gpu = true,
 )
-    println(io, "\n--- Benchmarking $(process_name) ---")
+    log("\n--- Benchmarking $(process_name) ---")
    if DISABLE_GPU
        use_gpu = false
    end
@@ -90,7 +105,7 @@ function bench_process(
        nflops_likwid = NFLOPs
    end

-    println(io, "Generating $nInputs inputs with $(Threads.nthreads()) threads...")
+    log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")

    inputs = Vector{typeof(gen_process_input(process))}()
    resize!(inputs, nInputs)
@@ -99,45 +114,46 @@ function bench_process(
        push!(processes, copy(process))
    end

-    Threads.@threads for i in eachindex(inputs)
+    @inbounds Threads.@threads for i in eachindex(inputs)
        inputs[i] = gen_process_input(processes[Threads.nthreads()])
    end

-    println(io, "Benchmarking CPU with $(Threads.nthreads()) threads...")
-    (time_cpu, rate_cpu) = cpu_bench(func, inputs)
+    log("Benchmarking CPU with $(Threads.nthreads()) threads...")
+    (time_cpu, rate_cpu, std_cpu) = cpu_bench(func, inputs)
    flops_cpu = (rate_cpu * NFLOPs) / 10^9

    time_gpu = 0.0
+    std_gpu = 0.0
    rate_gpu = 0.0
    flops_gpu = 0.0
    gpu_name = "none"
    if use_gpu
-        println(io, "Benchmarking GPU...")
+        log("Benchmarking GPU...")
        gpu_name = "$(name(first(CUDA.devices())))"
        cuInputs = CuArray(inputs)
-        (time_gpu, rate_gpu) = gpu_bench(func, cuInputs)
+        (time_gpu, rate_gpu, std_gpu) = gpu_bench(func, cuInputs)
        flops_gpu = (rate_gpu * NFLOPs) / 10^9
    else
-        println(io, "Skipping GPU...")
+        log("Skipping GPU...")
    end

-    println(io, "\nBenchmark Summary for $(process):")
+    log("\nBenchmark Summary for $(process):")

    if use_likwid
-        println(io, "Measured FLOPS by LIKWID: $NFLOPs")
+        log("Measured FLOPS by LIKWID: $NFLOPs")
    else
-        println(io, "Total graph compute effort: $NFLOPs")
+        log("Total graph compute effort: $NFLOPs")
    end
-    println(io, "Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
-    println(io, "CPU, $(Threads.nthreads()) threads")
-    println(io, "  Time:  $time_cpu")
-    println(io, "  Rate:  $rate_cpu")
-    println(io, "  GFLOPS: $flops_cpu")
+    log("Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
+    log("CPU, $(Threads.nthreads()) threads")
+    log("  Time:  $time_cpu")
+    log("  Rate:  $rate_cpu")
+    log("  GFLOPS: $flops_cpu")
    if use_gpu
-        println(io, "GPU, $gpu_name")
-        println(io, "  Time:  $time_gpu")
-        println(io, "  Rate:  $rate_gpu")
-        println(io, "  GFLOPS: $flops_gpu")
+        log("GPU, $gpu_name")
+        log("  Time:  $time_gpu")
+        log("  Rate:  $rate_gpu")
+        log("  GFLOPS: $flops_gpu")
    end

    if (process_name != "warmup")
@@ -155,10 +171,12 @@ function bench_process(
                :n_inputs => nInputs,
                :nflops_likwid => nflops_likwid,
                :cpu_time => time_cpu,
+                :cpu_std => std_cpu,
                :cpu_rate => rate_cpu,
                :cpu_gflops => flops_cpu,
                :gpu_name => gpu_name,
                :gpu_time => time_gpu,
+                :gpu_std => std_gpu,
                :gpu_rate => rate_gpu,
                :gpu_gflops => flops_gpu,
            ),
@@ -212,52 +230,62 @@ optimizer = ReductionOptimizer()
 process = parse_process("ke->ke", QEDModel())
 gen_time = @elapsed graph = gen_graph(process)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process no optimization", graph, compute_func, gen_time, 0.0, func_gen_time)
+bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)

 opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
 bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)

+CSV.write(results_filename, df)
+
 # 2-photon compton
 process = parse_process("ke->kke", QEDModel())
 gen_time = @elapsed graph = gen_graph(process)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process no optimization", graph, compute_func, gen_time, 0.0, func_gen_time)
+bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)

 opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
 bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)

+CSV.write(results_filename, df)
+
 # 3-photon compton
 process = parse_process("ke->kkke", QEDModel())
 gen_time = @elapsed graph = gen_graph(process)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process no optimization", graph, compute_func, gen_time, 0.0, func_gen_time)
+bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)

 opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
 bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)

+CSV.write(results_filename, df)
+
 # 4-photon compton
 process = parse_process("ke->kkkke", QEDModel())
 gen_time = @elapsed graph = gen_graph(process)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process no optimization", graph, compute_func, gen_time, 0.0, func_gen_time, use_gpu = false)
+bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time, use_gpu = false)

 opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
 bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)

+CSV.write(results_filename, df)
+
 # 5-photon compton
 process = parse_process("ke->kkkkke", QEDModel())
 gen_time = @elapsed graph = gen_graph(process)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process no optimization", graph, compute_func, gen_time, 0.0, func_gen_time, use_gpu = false)
+bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time, use_gpu = false)

 opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
 bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time, use_gpu = false)

+CSV.write(results_filename, df)
+
 # 6-photon compton
 process = parse_process("ke->kkkkkke", QEDModel())
 gen_time = @elapsed graph = gen_graph(process)
@@ -265,31 +293,37 @@ opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
 bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time, use_gpu = false)

+CSV.write(results_filename, df)
+
 # AB->AB
 process = parse_process("AB->AB", ABCModel())
 gen_time = @elapsed graph = parse_dag("input/AB->AB.txt", ABCModel())
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process no optimization", graph, compute_func, gen_time, 0.0, func_gen_time)
+bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)

 opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
 bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)

+CSV.write(results_filename, df)
+
 # AB->AB^3
 process = parse_process("AB->ABBB", ABCModel())
 gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process no optimization", graph, compute_func, gen_time, 0.0, func_gen_time)
+bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)

 opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
 bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)

+CSV.write(results_filename, df)
+
 # AB->AB^5
 process = parse_process("AB->ABBBBB", ABCModel())
 gen_time = @elapsed graph = parse_dag("input/AB->ABBBBB.txt", ABCModel())
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process no optimization", graph, compute_func, gen_time, 0.0, func_gen_time, use_gpu = false)
+bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time, use_gpu = false)

 opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
--- a/examples/qed_bench_tape.jl
+++ b/examples/qed_bench_tape.jl
@@ -4,8 +4,15 @@ using UUIDs
 using DataFrames
 using CSV
 using Random
+using BenchmarkTools
+using Dates

-results_filename = "results.csv"
+function log(x...)  # timestamped logger: prints `now()` followed by the arguments separated by spaces
+    println(now(), " ", join(x, " "))  # pass the joined String whole; splatting it handed println one Char per argument
+    return flush(stdout)  # flush stdout after every message
+end
+
+results_filename = "bench_results_tape_$(Threads.nthreads()).csv"

 df = DataFrame(
    process_name = String[],
@@ -21,8 +28,10 @@ df = DataFrame(
    cpu_time = Float64[],
    cpu_rate = Float64[],
    cpu_gflops = Float64[],
+    cpu_std = Float64[],
    gpu_name = String[],
    gpu_time = Float64[],
+    gpu_std = Float64[],
    gpu_rate = Float64[],
    gpu_gflops = Float64[],
 )
@@ -32,8 +41,7 @@ if isfile(results_filename)
    df = CSV.read(results_filename, DataFrame)
 end

-nInputs = 100_000
-
+nInputs = 1_000

 # use "mock" machine that only uses cpu
 machine = Machine(
@@ -51,11 +59,17 @@ machine = Machine(


 function cpu_bench(tape, inputs)
-    time = @elapsed Threads.@threads for i in eachindex(inputs)
-        execute_tape(tape, inputs[i])
-    end
+    bench = @benchmark begin  # each sample times one full threaded pass over all inputs
+        @inbounds Threads.@threads for i in eachindex($inputs)  # NOTE(review): @inbounds sits outside the @threads loop body — confirm it has any effect
+            execute_tape($tape, $inputs[i])  # `$` interpolates the values into the benchmark; the result is discarded
+        end
+    end gcsample = true seconds = 300  # BenchmarkTools options: run GC before each sample, 300 s time budget
+
+    time = mean(bench.times) / 1e9  # mean sample time; BenchmarkTools records nanoseconds, so this is seconds
+    s = std(bench.times) / 1e9  # standard deviation of the sample times, in seconds
     rate = length(inputs) / time
-    return (time, rate)
+
+    return (time, rate, s)  # (mean seconds per pass, inputs per second, std in seconds)
 end

 function bench_process(
@@ -67,7 +81,7 @@ function bench_process(
    io::IO = stdout;
    use_likwid = true,
 )
-    println(io, "\n--- Benchmarking $(process_name) ---")
+    log("\n--- Benchmarking $(process_name) ---")

    func_time = @elapsed tape = gen_tape(graph, process, machine)

@@ -87,7 +101,7 @@ function bench_process(
        nflops_likwid = NFLOPs
    end

-    println(io, "Generating $nInputs inputs with $(Threads.nthreads()) threads...")
+    log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")

    inputs = Vector{typeof(gen_process_input(process))}()
    resize!(inputs, nInputs)
@@ -96,26 +110,26 @@ function bench_process(
        push!(processes, copy(process))
    end

-    Threads.@threads for i in eachindex(inputs)
+    @inbounds Threads.@threads for i in eachindex(inputs)
        inputs[i] = gen_process_input(processes[Threads.nthreads()])
    end

-    println(io, "Benchmarking CPU with $(Threads.nthreads()) threads...")
-    (time_cpu, rate_cpu) = cpu_bench(tape, inputs)
+    log("Benchmarking CPU with $(Threads.nthreads()) threads...")
+    (time_cpu, rate_cpu, std_cpu) = cpu_bench(tape, inputs)
    flops_cpu = (rate_cpu * NFLOPs) / 10^9

-    println(io, "\nBenchmark Summary for $(process):")
+    log("\nBenchmark Summary for $(process):")

    if use_likwid
-        println(io, "Measured FLOPS by LIKWID: $NFLOPs")
+        log("Measured FLOPS by LIKWID: $NFLOPs")
    else
-        println(io, "Total graph compute effort: $NFLOPs")
+        log("Total graph compute effort: $NFLOPs")
    end
-    println(io, "Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
-    println(io, "CPU, $(Threads.nthreads()) threads")
-    println(io, "  Time:  $time_cpu")
-    println(io, "  Rate:  $rate_cpu")
-    println(io, "  GFLOPS: $flops_cpu")
+    log("Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
+    log("CPU, $(Threads.nthreads()) threads")
+    log("  Time:  $time_cpu")
+    log("  Rate:  $rate_cpu")
+    log("  GFLOPS: $flops_cpu")

    if (process_name != "warmup")
        push!(
@@ -132,10 +146,12 @@ function bench_process(
                :n_inputs => nInputs,
                :nflops_likwid => nflops_likwid,
                :cpu_time => time_cpu,
+                :cpu_std => std_cpu,
                :cpu_rate => rate_cpu,
                :cpu_gflops => flops_cpu,
                :gpu_name => "none",
                :gpu_time => 0.0,
+                :gpu_std => 0.0,
                :gpu_rate => 0.0,
                :gpu_gflops => 0.0,
            ),
@@ -164,7 +180,6 @@ end
 function bench_abc(process_string::String)
    optimizer = ReductionOptimizer()

-    # AB->AB
    process = parse_process(process_string, ABCModel())
    gen_time = @elapsed graph = parse_dag("input/$process_string.txt", ABCModel())
    bench_process(process, "$process not optimized tape", graph, gen_time, 0.0)
@@ -195,15 +210,23 @@ bench_process(process, "warmup", graph, gen_time, opt_time)

 # compton
 bench_qed("ke->ke")
+CSV.write(results_filename, df)
 bench_qed("ke->kke")
+CSV.write(results_filename, df)
 bench_qed("ke->kkke")
+CSV.write(results_filename, df)
 bench_qed("ke->kkkke")
+CSV.write(results_filename, df)
 bench_qed("ke->kkkkke")
+CSV.write(results_filename, df)
 bench_qed("ke->kkkkkke")
+CSV.write(results_filename, df)
 bench_qed("ke->kkkkkkke")
+CSV.write(results_filename, df)

 bench_abc("AB->AB")
-bench_abc("AB->ABBB")
-bench_abc("AB->ABBBBB")
-
+CSV.write(results_filename, df)
+bench_abc("AB->ABBB")
+CSV.write(results_filename, df)
+bench_abc("AB->ABBBBB")
 CSV.write(results_filename, df)