full node bench testing

Anton Reinhard 2024-03-05 18:24:13 +01:00
parent b7f8e4a6b3
commit b39bc480a1
7 changed files with 68 additions and 41 deletions

View File

@@ -100,7 +100,7 @@ for process in processes
plot!(
title = ("$(beautify_title(process)) Reduction Progression ($(n_inputs) Inputs)"),
xscale = :linear,
yscale = :log10,
yscale = :linear,
#ylim = (0, ymax),
legend = :outerbottom,
minorgrid = true,
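For context, a minimal Plots.jl sketch with hypothetical data showing the attributes touched in this hunk; the only functional change is the y-axis scale moving from :log10 to :linear:

using Plots

xs = 1:20
ys = 10.0 .^ range(0, 3; length = 20)   # hypothetical stand-in for the reduction-progression series

# :linear keeps absolute differences readable; :log10 would compress the upper decades
plot(xs, ys; xscale = :linear, yscale = :linear, minorgrid = true, legend = :outerbottom)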

View File

@@ -12,7 +12,8 @@ using Base.Threads
function log(x...)
println(now(), " ", join(x, " ")...)
return flush(stdout)
#flush(stdout)
return nothing
end
results_filename = "full_node_bench.csv"
@@ -26,8 +27,8 @@ df = DataFrame(
time = Float64[],
std = Float64[],
rate = Float64[],
ratio_cpu = Float64[],
ratio_gpu = Float64[],
cpu_chunks = Float64[],
gpu_chunks = Float64[],
)
# if they exist, read existing results and append new ones
@@ -35,19 +36,20 @@ if isfile(results_filename)
df = CSV.read(results_filename, DataFrame)
end
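A compact sketch of the results bookkeeping this hunk changes (CSV.jl and DataFrames.jl as in the script; the file name, the pushed values, and the column types of the leading fields are assumptions): the ratio_cpu/ratio_gpu columns are replaced by raw cpu_chunks/gpu_chunks counts, existing results are reused if the file is present, and each run appends one row.

using CSV, DataFrames

results_file = "full_node_bench_example.csv"   # hypothetical name for this sketch

df = DataFrame(
    process_name = String[],
    cpu_threads = Int[],
    gpu_devices = Int[],
    n_inputs = Int[],
    chunk_size = Int[],
    time = Float64[],
    std = Float64[],
    rate = Float64[],
    cpu_chunks = Float64[],   # median number of chunks claimed by CPU workers
    gpu_chunks = Float64[],   # median number of chunks claimed by GPU workers
)

# resume from earlier runs if a results file already exists
if isfile(results_file)
    df = CSV.read(results_file, DataFrame)
end

# after each benchmark, append one row and persist immediately (hypothetical values)
push!(
    df,
    Dict(
        :process_name => "ke->ke",
        :cpu_threads => 124,
        :gpu_devices => 4,
        :n_inputs => 2^30,
        :chunk_size => 65536,
        :time => 1.0,
        :std => 0.1,
        :rate => 2^30 / 1.0,
        :cpu_chunks => 100.0,
        :gpu_chunks => 900.0,
    ),
)
CSV.write(results_file, df)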
nInputs = 100_000_000
nInputs = 1_073_741_824 # 2^30
lck = SpinLock()
lck = ReentrantLock()
progress = 1
cpu_chunks = 0
gpu_chunks = 0
chunkSizes = [100, 1_000, 10_000, 50_000, 100_000]
chunkSizes = [1024, 4096, 16384, 65536, 262144, 1048576] # 2^10 to 2^20
function cpu_worker(compute_func, inputs, chunk_size)
global progress
global cpu_chunks
global lck
quit = false
work_start = 0
work_end = 0
@@ -58,8 +60,9 @@ function cpu_worker(compute_func, inputs, chunk_size)
else
work_start = progress
progress = progress + chunk_size
work_end = min(progress, nInputs)
work_end = min(progress - 1, nInputs)
cpu_chunks = cpu_chunks + 1
#log("CPU Worker $(Threads.threadid()) computing $(cpu_chunks)th cpu chunk ($work_start, $work_end)")
end
end
if quit
@@ -71,6 +74,8 @@ function cpu_worker(compute_func, inputs, chunk_size)
end
end
#log("CPU Worker on $(Threads.threadid()) finished!")
return nothing
end
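The CPU and GPU workers share one pattern: claim a chunk of the input range under the lock, then process it outside the lock. A self-contained sketch of that pattern under the commit's assumptions (a shared progress counter, chunk counters, and a ReentrantLock); WorkQueue and claim_chunk! are hypothetical names introduced for the sketch:

# Shared state for the work-splitting: a next-index counter and a chunk counter,
# guarded by a ReentrantLock (the commit switches from SpinLock to ReentrantLock).
mutable struct WorkQueue
    next_index::Int
    chunks_claimed::Int
    lock::ReentrantLock
end
WorkQueue() = WorkQueue(1, 0, ReentrantLock())

# Claim the next chunk as an inclusive range, or return nothing when the work is exhausted.
function claim_chunk!(queue::WorkQueue, n_total::Int, chunk_size::Int)
    lock(queue.lock)
    try
        queue.next_index > n_total && return nothing
        work_start = queue.next_index
        queue.next_index += chunk_size
        work_end = min(queue.next_index - 1, n_total)   # inclusive bound, matching the `progress - 1` fix above
        queue.chunks_claimed += 1
        return (work_start, work_end)
    finally
        unlock(queue.lock)
    end
end

# A CPU worker: keep claiming chunks and broadcasting `compute_func` until the queue drains.
function cpu_worker_sketch(compute_func, inputs, queue::WorkQueue, chunk_size::Int)
    while true
        chunk = claim_chunk!(queue, length(inputs), chunk_size)
        chunk === nothing && break
        (a, b) = chunk
        compute_func.(view(inputs, a:b))
    end
    return nothing
end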
@@ -78,6 +83,7 @@ end
function gpu_worker(compute_func, inputs, chunk_size)
global progress
global gpu_chunks
global lck
quit = false
work_start = 0
work_end = 0
@@ -88,8 +94,9 @@ function gpu_worker(compute_func, inputs, chunk_size)
else
work_start = progress
progress = progress + chunk_size
work_end = min(progress, nInputs)
work_end = min(progress - 1, nInputs)
gpu_chunks = gpu_chunks + 1
#log("GPU Worker $(CUDA.device()) computing $(gpu_chunks)th gpu chunk ($work_start, $work_end)")
end
end
if quit
@@ -100,30 +107,33 @@ function gpu_worker(compute_func, inputs, chunk_size)
compute_func.(cuInputs)
end
#log("GPU Worker on Device $(CUDA.device()) finished!")
return nothing
end
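The GPU worker differs from the CPU one only in where the chunk is evaluated: the claimed slice is copied to the task's active device and compute_func is broadcast over it. A sketch assuming CUDA.jl and the hypothetical claim_chunk!/WorkQueue helpers from the previous sketch:

using CUDA

function gpu_worker_sketch(compute_func, inputs, queue, chunk_size)
    while true
        chunk = claim_chunk!(queue, length(inputs), chunk_size)
        chunk === nothing && break
        (a, b) = chunk
        cu_inputs = CuArray(view(inputs, a:b))   # host -> device copy of just this chunk
        compute_func.(cu_inputs)                 # broadcast runs as a kernel on the current device
    end
    return nothing
end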
cpu_gpu_ratio = Vector{Tuple{Int, Int}}()
function full_compute(compute_func, inputs, chunk_size)
global progress = 1
global cpu_chunks = 0
global gpu_chunks = 0
global progress
progress = 1
global cpu_chunks
cpu_chunks = 0
global gpu_chunks
gpu_chunks = 0
tasks = Vector()
for dev in CUDA.devices()
t = @task device!(dev) do
t = Threads.@spawn device!(dev) do
gpu_worker(compute_func, inputs, chunk_size)
return nothing
end
schedule(t)
push!(tasks, t)
end
for i in 1:Threads.nthreads()
t = @task cpu_worker(compute_func, inputs, chunk_size)
schedule(t)
for i in 1:(Threads.nthreads() - length(CUDA.devices()))
t = Threads.@spawn cpu_worker(compute_func, inputs, chunk_size)
push!(tasks, t)
end
@@ -136,6 +146,9 @@ function full_compute(compute_func, inputs, chunk_size)
end
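How the workers get launched, sketched with the hypothetical helpers above: full_compute now uses Threads.@spawn instead of @task/schedule, pins one task to each CUDA device, and starts one CPU worker per remaining Julia thread:

using CUDA

function full_compute_sketch(compute_func, inputs, chunk_size)
    queue = WorkQueue()
    tasks = Task[]

    # one worker task per GPU, each binding its device before doing any work
    for dev in CUDA.devices()
        t = Threads.@spawn begin
            device!(dev)   # make `dev` the current device for this task
            gpu_worker_sketch(compute_func, inputs, queue, chunk_size)
        end
        push!(tasks, t)
    end

    # the remaining threads run CPU workers (one thread is reserved per GPU)
    for _ in 1:(Threads.nthreads() - length(CUDA.devices()))
        push!(tasks, Threads.@spawn cpu_worker_sketch(compute_func, inputs, queue, chunk_size))
    end

    wait.(tasks)   # block until every chunk has been processed
    return queue.chunks_claimed
end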
function bench(compute_function, inputs, chunk_size)
global cpu_gpu_ratio
empty!(cpu_gpu_ratio)
bench = @benchmark begin
full_compute($compute_function, $inputs, $chunk_size)
end gcsample = true seconds = 600
@@ -147,44 +160,37 @@ function bench(compute_function, inputs, chunk_size)
med_cpu_chunks = median(getindex.(cpu_gpu_ratio, 1))
med_gpu_chunks = median(getindex.(cpu_gpu_ratio, 2))
log("CPU/GPU ratios: $(cpu_gpu_ratio)")
return (time, rate, s, med_cpu_chunks, med_gpu_chunks)
end
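The timing itself is delegated to BenchmarkTools. A sketch of how median time, standard deviation, and a throughput rate can be extracted from a trial (hypothetical workload; BenchmarkTools reports times in nanoseconds):

using BenchmarkTools, Statistics

xs = rand(10_000)
trial = @benchmark sum($xs) gcsample = true seconds = 10   # same parameters as above, shorter time budget

t_median = median(trial.times) / 1e9   # seconds
t_std    = std(trial.times) / 1e9      # seconds
rate     = length(xs) / t_median       # inputs processed per second

println("median $(t_median) s, std $(t_std) s, rate $(rate) inputs/s")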
function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, chunk_size)
function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, chunk_size, inputs)
process_name = string(process)
log("\n--- Benchmarking $(process_name) on $(nInputs) with chunk size $(chunk_size) ---")
log("Available Cuda Devices:")
display.(CUDA.devices())
log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
inputs = Vector{typeof(gen_process_input(process))}()
resize!(inputs, nInputs)
processes = Vector{typeof(process)}()
for i in 1:Threads.nthreads()
push!(processes, copy(process))
end
@inbounds Threads.@threads for i in eachindex(inputs)
inputs[i] = gen_process_input(processes[Threads.nthreads()])
end
log("Benchmarking full node...")
(time, rate, s, med_cpu_chunks, med_gpu_chunks) = bench(func, inputs, chunk_size)
log(
"Benchmarking complete with median time $(time), $(med_cpu_chunks) cpu chunks, and $(med_gpu_chunks) gpu chunks.",
)
push!(
df,
Dict(
:process_name => process_name,
:cpu_threads => Threads.nthreads(),
:cpu_threads => Threads.nthreads() - length(CUDA.devices()),
:gpu_devices => length(CUDA.devices()),
:n_inputs => nInputs,
:chunk_size => chunk_size,
:time => time,
:std => s,
:rate => rate,
:ratio_cpu => med_cpu_chunks / (med_cpu_chunks + med_gpu_chunks),
:ratio_gpu => med_gpu_chunks / (med_cpu_chunks + med_gpu_chunks),
:cpu_chunks => med_cpu_chunks,
:gpu_chunks => med_gpu_chunks,
),
)
@@ -206,7 +212,7 @@ machine = Machine(
)
optimizer = ReductionOptimizer()
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke"]
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]
for proc in processes
process = parse_process(proc, QEDModel())
@@ -214,8 +220,21 @@ for proc in processes
optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
inputs = Vector{typeof(gen_process_input(process))}()
resize!(inputs, nInputs)
procs = Vector{typeof(process)}()
for i in 1:Threads.nthreads()
push!(procs, copy(process))
end
@inbounds Threads.@threads for i in eachindex(inputs)
inputs[i] = gen_process_input(procs[Threads.nthreads()])
end
for chunk_size in chunkSizes
full_node_bench(process, compute_func, chunk_size)
full_node_bench(process, compute_func, chunk_size, inputs)
CSV.write(results_filename, df)
end
end;
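Input generation now happens once per process, before the chunk-size sweep, and is parallelised over all threads. A self-contained sketch of that fill pattern, with a hypothetical generator standing in for gen_process_input (the script additionally keeps one copy of the process per thread so the generator does not share mutable state):

# hypothetical stand-in for gen_process_input(process)
gen_input() = rand(4)

n = 1_000_000
inputs = Vector{typeof(gen_input())}(undef, n)   # allocate once, fill in parallel

@inbounds Threads.@threads for i in eachindex(inputs)
    inputs[i] = gen_input()
end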

View File

@@ -20,5 +20,5 @@ lspci > results/pci_full_node.txt
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
echo "Benchmarking Reduction 128 Threads, *GPU*"
echo "Benchmarking Full Node 128 Threads + *GPUs*"
julia --project --threads=128 examples/full_node_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"

View File

@@ -2,12 +2,12 @@
#SBATCH --job-name=qed_bench
#SBATCH --partition=casus_a100
#SBATCH --account=casus
#SBATCH --time=10:00:00
#SBATCH --time=8:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=128
#SBATCH --gres=gpu:4
#SBATCH --mem=256GB
#SBATCH --mem=2048GB
#SBATCH --output=simulation-%A-%a.out
#SBATCH --error=simulation-%A-%a.err

Binary file not shown.

View File

@@ -115,10 +115,18 @@ Linearly many FLOP with growing data.
"""
function compute(::ComputeTaskQED_Sum, data...)::ComplexF64
# TODO: want to use sum_kbn here but it doesn't seem to support ComplexF64, do it element-wise?
return sum(data)
s = 0.0im
for d in data
s += d
end
return s
end
function compute(::ComputeTaskQED_Sum, data::AbstractArray)::ComplexF64
# TODO: want to use sum_kbn here but it doesn't seem to support ComplexF64, do it element-wise?
return sum(data)
s = 0.0im
for d in data
s += d
end
return s
end
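The TODO above asks for sum_kbn; a compensated (Kahan) accumulation needs only + and -, so it also works for ComplexF64. A sketch of what an element-wise compensated sum could look like (not the commit's implementation, which uses a plain accumulation loop):

# Kahan-compensated sum: the correction term recovers low-order bits lost in each addition.
function kahan_sum(data)::ComplexF64
    s = 0.0im   # running sum
    c = 0.0im   # running compensation
    for d in data
        y = ComplexF64(d) - c
        t = s + y
        c = (t - s) - y   # what was lost when adding y to s
        s = t
    end
    return s
end

total = kahan_sum(rand(ComplexF64, 1_000))   # drop-in replacement for sum() with reduced rounding error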

View File

@@ -39,7 +39,7 @@ function get_function_call(node::ComputeTaskNode)
@assert length(children(node)) <= children(task(node)) "Node $(node) has too many children for its task: node has $(length(node.children)) versus task has $(children(task(node)))\nNode's children: $(getfield.(node.children, :children))"
@assert !ismissing(node.device) "Trying to get expression for an unscheduled ComputeTaskNode\nNode: $(node)"
if (length(node.children) <= 50)
if (length(node.children) <= 800)
#only use an SVector when there are few children
return get_function_call(
node.task,
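The threshold raised in this last hunk controls when the generated function call packs a node's children into an SVector. A hypothetical helper (StaticArrays assumed) illustrating the same choice:

using StaticArrays

# Small argument lists become a fixed-size SVector; past the threshold the compile-time cost
# of a huge static type outweighs the benefit, so a plain Vector is kept.
function pack_arguments(args::Vector, threshold::Int = 800)
    length(args) <= threshold && return SVector{length(args)}(args)
    return args
end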