Improve qed_bench_reduction_steps_gpu

2024-03-07 22:28:50 +01:00
parent 371467c2bc
commit ae99be7207
5 changed files with 1878 additions and 2383 deletions


@@ -18,12 +18,14 @@ df = DataFrame(
graph_ce = Float64[],
graph_dt = Float64[],
graph_ci = Float64[],
gen_func_t = Float64[],
cpu_compile_t = Float64[],
cpu_st_t = Float64[],
cpu_st_s = Float64[],
cpu_mt_t = Float64[],
gpu_compile_t = Float64[],
cpu_mt_s = Float64[],
cpu_mem = Float64[],
gpu_t = Float64[],
gpu_s = Float64[],
gpu_mem = Float64[],
)
# if they exist, read existing results and append new ones
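The comment above refers to logic outside this hunk; a minimal sketch of such an append step, assuming the script defines a results_filename variable elsewhere, could look like this:

if isfile(results_filename)
    # keep rows from previous runs so new results are appended to the same table
    df = CSV.read(results_filename, DataFrame)
end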
@@ -36,26 +38,53 @@ function log(x...)
return flush(stdout)
end
function bench(func, inputs)
# todo: use gpu kernel instead of broadcasting
gpu_compile_time = @elapsed func.(inputs[1:2])
function bench(func, kernel!, inputs)
# gpu part
n = length(inputs)
cu_inputs = CuVector(inputs)
cu_outputs = CuVector{ComplexF64}()
resize!(cu_outputs, n)
ts = 32
bs = Int(n / ts)
bench = @benchmark begin
@cuda threads = $ts blocks = $bs always_inline = true $kernel!($cu_inputs, $cu_outputs, $n)
CUDA.device_synchronize()
end gcsample = true samples = 20 evals = 1
gpu_time = median(bench.times) / 1e9
gpu_std = std(bench.times) / 1e9
gpu_mem = bench.memory
# cpu part
single_thread = @benchmark $func.($inputs)
multi_threaded = @benchmark Threads.@threads for i in eachindex($inputs)
$func($inputs[i])
end
cpu_st_time = median(single_thread.times) / 1e9
cpu_st_std = std(single_thread.times) / 1e9
cpu_mt_time = median(multi_threaded.times) / 1e9
cpu_mt_std = std(multi_threaded.times) / 1e9
cpu_mem = single_thread.memory
gpu_time = @benchmark $func.($inputs)
return (
cpu_compile_time = 0.0,
gpu_compile_time = gpu_compile_time,
cpu_single_thread_time = 0.0,
cpu_multi_thread_time = 0.0,
gpu_time = mean(gpu_time.times) / 1e9,
cpu_single_thread_time = cpu_st_time,
cpu_single_thread_std = cpu_st_std,
cpu_multi_thread_time = cpu_mt_time,
cpu_multi_thread_std = cpu_mt_std,
cpu_mem = cpu_mem,
gpu_time = gpu_time,
gpu_std = gpu_std,
gpu_mem = gpu_mem,
)
end
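The launch configuration inside bench assumes the input length is a multiple of ts = 32, since Int(n / ts) throws an InexactError otherwise; presumably this is why n_inputs is set to 2^16 further down. A more forgiving sketch of the same CUDA.jl launch, using a ceiling division, would be:

ts = 32
bs = cld(n, ts)  # round the block count up so any n is covered
# the kernel is then expected to bounds-check its thread index against n
@cuda threads = ts blocks = bs always_inline = true kernel!(cu_inputs, cu_outputs, n)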
log("Available CUDA devices:")
for dev in CUDA.devices()
log("CUDA device: $(dev)")
display(dev)
end
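The loop above only logs the visible devices; a small guard for machines without a usable GPU (a sketch, not part of this commit) could bail out before the benchmarks start:

if !CUDA.functional()
    log("No functional CUDA device available, skipping GPU benchmarks")
    exit(0)
end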
# preparation of machine
machine = Machine(
[
@@ -72,9 +101,9 @@ machine = Machine(
# bench and produce data
n_inputs = 50_000
n_inputs = 2^16
optimizer = ReductionOptimizer()
processes = [("ke->kke", 50), ("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1)]
processes = [("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1), ("ke->kkkke", 5)]
for (process_str, STEPSIZE) in processes
n = 0
@@ -82,13 +111,14 @@ for (process_str, STEPSIZE) in processes
process = parse_process(process_str, QEDModel())
graph = gen_graph(process)
inputs = CuVector([gen_process_input(process) for _ in 1:n_inputs])
inputs = Vector([gen_process_input(process) for _ in 1:n_inputs])
get_compute_function(graph, process, machine)
while true
func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
res = bench(func, inputs)
func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
res = bench(func, kernel!, inputs)
graph_properties = get_properties(graph)
push!(
@@ -103,12 +133,14 @@ for (process_str, STEPSIZE) in processes
graph_properties.computeEffort,
graph_properties.data,
graph_properties.computeIntensity,
func_gen_time,
res.cpu_compile_time,
res.cpu_single_thread_time,
res.cpu_single_thread_std,
res.cpu_multi_thread_time,
res.gpu_compile_time,
res.cpu_multi_thread_std,
res.cpu_mem,
res.gpu_time,
res.gpu_std,
res.gpu_mem,
),
)
CSV.write(results_filename, df)
@@ -130,13 +162,14 @@ for (process_str, STEPSIZE) in [("AB->AB", 1), ("AB->ABBB", 1), ("AB->ABBBBB", 1
process = parse_process(process_str, ABCModel())
graph = parse_dag("input/$process_str.txt", ABCModel())
inputs = CuVector([gen_process_input(process) for _ in 1:n_inputs])
inputs = Vector([gen_process_input(process) for _ in 1:n_inputs])
get_compute_function(graph, process, machine)
while true
func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
res = bench(func, inputs)
func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
res = bench(func, kernel!, inputs)
graph_properties = get_properties(graph)
push!(
@@ -151,12 +184,14 @@ for (process_str, STEPSIZE) in [("AB->AB", 1), ("AB->ABBB", 1), ("AB->ABBBBB", 1
graph_properties.computeEffort,
graph_properties.data,
graph_properties.computeIntensity,
func_gen_time,
res.cpu_compile_time,
res.cpu_single_thread_time,
res.cpu_single_thread_std,
res.cpu_multi_thread_time,
res.gpu_compile_time,
res.cpu_multi_thread_std,
res.cpu_mem,
res.gpu_time,
res.gpu_std,
res.gpu_mem,
),
)
CSV.write(results_filename, df)
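Since bench compares single- and multi-threaded CPU timings, the cpu_mt_* columns are only meaningful when Julia is started with several threads (for example with julia -t auto); a small sanity check along these lines, an assumption rather than part of the commit, could be placed near the top of the script:

if Threads.nthreads() == 1
    log("Warning: only one Julia thread is active, multi-threaded timings will match the single-threaded ones")
end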