Improve qed_bench_reduction_steps_gpu

parent 371467c2bc
commit ae99be7207
@@ -18,12 +18,14 @@ df = DataFrame(
     graph_ce = Float64[],
     graph_dt = Float64[],
     graph_ci = Float64[],
-    gen_func_t = Float64[],
-    cpu_compile_t = Float64[],
     cpu_st_t = Float64[],
+    cpu_st_s = Float64[],
     cpu_mt_t = Float64[],
-    gpu_compile_t = Float64[],
+    cpu_mt_s = Float64[],
+    cpu_mem = Float64[],
     gpu_t = Float64[],
+    gpu_s = Float64[],
+    gpu_mem = Float64[],
 )
 
 # if they exist, read existing results and append new ones
@@ -36,26 +38,53 @@ function log(x...)
     return flush(stdout)
 end
 
-function bench(func, inputs)
-    # todo: use gpu kernel instead of broadcasting
-    gpu_compile_time = @elapsed func.(inputs[1:2])
+function bench(func, kernel!, inputs)
+    # gpu part
+    n = length(inputs)
+    cu_inputs = CuVector(inputs)
+    cu_outputs = CuVector{ComplexF64}()
+    resize!(cu_outputs, n)
+    ts = 32
+    bs = Int(n / ts)
+    bench = @benchmark begin
+        @cuda threads = $ts blocks = $bs always_inline = true $kernel!($cu_inputs, $cu_outputs, $n)
+        CUDA.device_synchronize()
+    end gcsample = true samples = 20 evals = 1
+
+    gpu_time = median(bench.times) / 1e9
+    gpu_std = std(bench.times) / 1e9
+    gpu_mem = bench.memory
+
+    # cpu part
+    single_thread = @benchmark $func.($inputs)
+    multi_threaded = @benchmark Threads.@threads for i in eachindex($inputs)
+        $func($inputs[i])
+    end
+
+    cpu_st_time = median(single_thread.times) / 1e9
+    cpu_st_std = std(single_thread.times) / 1e9
+    cpu_mt_time = median(multi_threaded.times) / 1e9
+    cpu_mt_std = std(multi_threaded.times) / 1e9
+    cpu_mem = single_thread.memory
 
-    gpu_time = @benchmark $func.($inputs)
 
     return (
-        cpu_compile_time = 0.0,
-        gpu_compile_time = gpu_compile_time,
-        cpu_single_thread_time = 0.0,
-        cpu_multi_thread_time = 0.0,
-        gpu_time = mean(gpu_time.times) / 1e9,
+        cpu_single_thread_time = cpu_st_time,
+        cpu_single_thread_std = cpu_st_std,
+        cpu_multi_thread_time = cpu_mt_time,
+        cpu_multi_thread_std = cpu_mt_std,
+        cpu_mem = cpu_mem,
+        gpu_time = gpu_time,
+        gpu_std = gpu_std,
+        gpu_mem = gpu_mem,
     )
 end
 
 log("Available CUDA devices:")
 for dev in CUDA.devices()
     log("CUDA device: $(dev)")
     display(dev)
 end
 
 
 # preparation of machine
 machine = Machine(
     [
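Note on the new GPU path: BenchmarkTools reports `bench.times` in nanoseconds, hence the `/ 1e9` conversions, and the `@cuda threads = $ts blocks = $bs` launch covers exactly n = ts * bs inputs, one thread each. The real kernel comes from `get_cuda_kernel`; the stand-in below is only a sketch of the launch-compatible signature and indexing that launch shape assumes, with a placeholder body for numeric inputs rather than the generated code.

using CUDA

# Hypothetical stand-in for the kernel returned by get_cuda_kernel.
# Only the signature and the one-thread-per-input indexing match the
# launch above; the body is a placeholder assuming numeric inputs.
function example_kernel!(inputs, outputs, n)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x  # 1-based global thread index
    if i <= n
        @inbounds outputs[i] = ComplexF64(2 * inputs[i])   # placeholder computation
    end
    return nothing
end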
@@ -72,9 +101,9 @@ machine = Machine(
 
 
 # bench and produce data
-n_inputs = 50_000
+n_inputs = 2^16
 optimizer = ReductionOptimizer()
-processes = [("ke->kke", 50), ("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1)]
+processes = [("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1), ("ke->kkkke", 5)]
 
 for (process_str, STEPSIZE) in processes
     n = 0
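The move from 50_000 to 2^16 inputs is tied to the kernel launch in bench: `bs = Int(n / ts)` only succeeds when the input count divides evenly by the 32-thread block size. A quick check of the arithmetic:

n, ts = 2^16, 32
bs = Int(n / ts)    # exactly 2048 blocks
# Int(50_000 / ts)  # the old count would throw InexactError: 1562.5 is not an integer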
@@ -82,13 +111,14 @@ for (process_str, STEPSIZE) in processes
 
     process = parse_process(process_str, QEDModel())
     graph = gen_graph(process)
-    inputs = CuVector([gen_process_input(process) for _ in 1:n_inputs])
+    inputs = Vector([gen_process_input(process) for _ in 1:n_inputs])
 
     get_compute_function(graph, process, machine)
 
     while true
-        func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
-        res = bench(func, inputs)
+        func = get_compute_function(graph, process, machine)
+        kernel! = get_cuda_kernel(graph, process, machine)
+        res = bench(func, kernel!, inputs)
 
         graph_properties = get_properties(graph)
         push!(
@@ -103,12 +133,14 @@ for (process_str, STEPSIZE) in processes
             graph_properties.computeEffort,
             graph_properties.data,
             graph_properties.computeIntensity,
-            func_gen_time,
-            res.cpu_compile_time,
             res.cpu_single_thread_time,
+            res.cpu_single_thread_std,
             res.cpu_multi_thread_time,
-            res.gpu_compile_time,
+            res.cpu_multi_thread_std,
+            res.cpu_mem,
             res.gpu_time,
+            res.gpu_std,
+            res.gpu_mem,
         ),
     )
     CSV.write(results_filename, df)
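`push!` with a positional tuple fills DataFrame columns strictly left to right, which is why the tuple above must mirror the column order declared in the DataFrame at the top of this script. A minimal illustration of the pattern, using a hypothetical three-column frame rather than the script's full schema:

using DataFrames

df = DataFrame(gpu_t = Float64[], gpu_s = Float64[], gpu_mem = Float64[])
push!(df, (1.23e-3, 4.5e-5, 1024.0))  # tuple order must match: gpu_t, gpu_s, gpu_mem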
@@ -130,13 +162,14 @@ for (process_str, STEPSIZE) in [("AB->AB", 1), ("AB->ABBB", 1), ("AB->ABBBBB", 1
 
     process = parse_process(process_str, ABCModel())
     graph = parse_dag("input/$process_str.txt", ABCModel())
-    inputs = CuVector([gen_process_input(process) for _ in 1:n_inputs])
+    inputs = Vector([gen_process_input(process) for _ in 1:n_inputs])
 
     get_compute_function(graph, process, machine)
 
     while true
-        func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
-        res = bench(func, inputs)
+        func = get_compute_function(graph, process, machine)
+        kernel! = get_cuda_kernel(graph, process, machine)
+        res = bench(func, kernel!, inputs)
 
         graph_properties = get_properties(graph)
         push!(
@@ -151,12 +184,14 @@ for (process_str, STEPSIZE) in [("AB->AB", 1), ("AB->ABBB", 1), ("AB->ABBBBB", 1
             graph_properties.computeEffort,
             graph_properties.data,
             graph_properties.computeIntensity,
-            func_gen_time,
-            res.cpu_compile_time,
             res.cpu_single_thread_time,
+            res.cpu_single_thread_std,
             res.cpu_multi_thread_time,
-            res.gpu_compile_time,
+            res.cpu_multi_thread_std,
+            res.cpu_mem,
             res.gpu_time,
+            res.gpu_std,
+            res.gpu_mem,
         ),
     )
     CSV.write(results_filename, df)
@@ -2,12 +2,12 @@
 #SBATCH --job-name=qed_bench
 #SBATCH --partition=casus_a100
 #SBATCH --account=casus
-#SBATCH --time=48:00:00
+#SBATCH --time=16:00:00
 #SBATCH --nodes=1
 #SBATCH --ntasks=1
 #SBATCH --cpus-per-task=32
 #SBATCH --gres=gpu:1
-#SBATCH --mem=24GB
+#SBATCH --mem=256GB
 #SBATCH --output=simulation-%A-%a.out
 #SBATCH --error=simulation-%A-%a.err
 
@@ -15,10 +15,10 @@ nvidia-smi > results/cuda_gpu_bench_reduce.txt
 lsblk > results/storage_bench_reduce.txt
 lspci > results/pci_bench_reduce.txt
 
-echo "Initiating julia..."
-julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
-julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
-julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
+#echo "Initiating julia..."
+#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
+#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
+#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
 
 echo "Benchmarking Reduction 32 Threads"
 julia --project -O3 --threads=32 examples/qed_bench_reduction_steps.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
@@ -15,11 +15,11 @@ nvidia-smi > results/cuda_gpu_bench_reduce_gpu.txt
 lsblk > results/storage_bench_reduce_gpu.txt
 lspci > results/pci_bench_reduce_gpu.txt
 
-echo "Initiating julia..."
-julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
-julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
-julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
-julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
+#echo "Initiating julia..."
+#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
+#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
+#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
+#julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
 
 echo "Benchmarking Reduction 32 Threads, *GPU*"
 julia --project --threads=32 examples/qed_bench_reduction_steps_gpu.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
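The job script pins 32 CPUs and one GPU on the casus_a100 partition; a quick sanity check from inside the benchmarked Julia session that the allocation actually reached the process (hypothetical snippet, not part of the commit):

using CUDA

println("Julia threads: ", Threads.nthreads())        # expect 32 with --threads=32
println("CUDA device:   ", CUDA.name(CUDA.device()))  # expect an NVIDIA A100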
File diff suppressed because one or more lines are too long