From d036f218620daacc5b8f4c8d8a37b6e65823771e Mon Sep 17 00:00:00 2001
From: Anton Reinhard <anton.reinhard@proton.me>
Date: Thu, 7 Mar 2024 00:08:04 +0100
Subject: [PATCH] Use CUDA kernels in bench scripts
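
Replace broadcasting the generated compute function over a CuVector with
a dedicated CUDA kernel obtained from the new get_cuda_kernel export.
The bench scripts preallocate a device-side output buffer and launch the
kernel explicitly with 32 threads per block and n/32 blocks, so input
counts are kept divisible by 32. The generated kernel is expected to
follow roughly this launch-compatible shape (a sketch only; `compute`
stands in for the code generated from the DAG):

    function kernel!(inputs, outputs, n)
        # one thread per input element
        id = (blockIdx().x - 1) * blockDim().x + threadIdx().x
        if id <= n
            @inbounds outputs[id] = compute(inputs[id])
        end
        return nothing
    end

Also re-enable the Julia setup steps in the experiment scripts, pull in
the fix_bs_multiplication branch of QEDbase.jl required for GPU
execution, and pass -O3 to the benchmark runs.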

---
 data/bench_results_32.csv                 |   9 ++
 data/evaluate_cpu_gpu_exec.jl             | 143 ++++++++++++++++++++++
 examples/full_node_bench.jl               |  29 +++--
 examples/qed_bench.jl                     | 137 +++++++--------------
 examples/qed_bench_reduction_steps_gpu.jl |   1 +
 experiments/full_node.sh                  |   9 +-
 experiments/run_gen_diagram.sh            |   3 +-
 experiments/run_qed_exec.sh               |  11 +-
 experiments/run_reduce_bench.sh           |   9 +-
 experiments/run_reduce_bench_gpu.sh       |   7 +-
 src/MetagraphOptimization.jl              |   2 +-
 11 files changed, 236 insertions(+), 124 deletions(-)
 create mode 100644 data/bench_results_32.csv
 create mode 100644 data/evaluate_cpu_gpu_exec.jl

diff --git a/data/bench_results_32.csv b/data/bench_results_32.csv
new file mode 100644
index 0000000..51454f2
--- /dev/null
+++ b/data/bench_results_32.csv
@@ -0,0 +1,9 @@
+process_name,graph_gen_time,optimization_time,function_generation_time,graph_nodes,graph_edges,graph_mem,cpu_threads,n_inputs,nflops_likwid,cpu_time,cpu_std,cpu_rate,cpu_gflops,gpu_name,gpu_time,gpu_std,gpu_rate,gpu_gflops
+QED Process: 'ke->ke' reduced,0.004851193,0.001290078,0.006093401,26,29,6948.0,32,10000000,0,0.2810178885,0.00909457898005121,3.5584923270818755e7,1.886000933353394,NVIDIA A100-SXM4-80GB,0.4060797745,0.0013320688448668838,2.462570319418851e7,1.305162269291991
+QED Process: 'ke->kke' reduced,0.001065397,0.010432606,0.014287271,59,77,16383.0,32,10000000,0,0.823029796,0.01692859562197734,1.2150228398292398e7,2.223491796887509,NVIDIA A100-SXM4-80GB,2.3333098275,0.0014037935241043983,4.285757460128814e6,0.784293615203573
+QED Process: 'ke->kkke' reduced,0.001348518,0.005210738,0.034243651,188,273,54426.0,32,10000000,0,2.9432864705,0.031053960614444084,3.397562588700793e6,2.497208502695083,NVIDIA A100-SXM4-80GB,10.340032588,0.0028660606476431714,967114.9403924877,0.7108294811884784
+QED Process: 'ke->kkkke' reduced,0.004413783,0.039469525,0.15704043,853,1295,243781.0,32,10000000,0,14.980394603,0.5162977440607073,667539.1580137269,2.4318451526440072,NVIDIA A100-SXM4-80GB,54.2063089555,0.006347197107681703,184480.371246258,0.672061992450118
+QED Process: 'ke->kkkkke' reduced,0.021871728,0.716956567,1.121625045,4982,7655,1.800816e6,32,10000000,0,82.035650126,0.3421310894344223,121898.22332901397,2.6545776094359375,NVIDIA A100-SXM4-80GB,321.789538108,NaN,31076.212293277757,0.6767466751107096
+ABC Process: 'AB->AB' reduced,0.000867035,0.002263493,0.007340721,34,37,9296.0,32,10000000,0,0.1877912925,0.0029540808349122686,5.325060532292784e7,2.8222820821151755,NVIDIA A100-SXM4-80GB,0.0016617045,1.5729813606955104e-5,6.01791714471496e9,318.9496086698929
+ABC Process: 'AB->ABBB' reduced,0.000547175,0.004720326,0.035918118,200,285,57156.0,32,10000000,0,0.257040364,0.007250633041861087,3.8904395575785905e7,28.59473074820264,NVIDIA A100-SXM4-80GB,0.003641165,3.2217340292524716e-5,2.74637375675093e9,2018.5847112119334
+ABC Process: 'AB->ABBBBB' reduced,0.019826198,0.258674017,1.136386232,4998,7671,1.507432e6,32,10000000,0,1.818710381,0.03353568966350073,5.498401562156146e6,119.7386908190744,NVIDIA A100-SXM4-80GB,0.492263776,0.0031065569742746986,2.031431213821429e7,442.38477543389257
diff --git a/data/evaluate_cpu_gpu_exec.jl b/data/evaluate_cpu_gpu_exec.jl
new file mode 100644
index 0000000..b7a3007
--- /dev/null
+++ b/data/evaluate_cpu_gpu_exec.jl
@@ -0,0 +1,143 @@
+using CSV
+using DataFrames
+using Plots
+using StatsPlots
+using LaTeXStrings
+
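+# usage: julia evaluate_cpu_gpu_exec.jl <results.csv>, e.g. data/bench_results_32.csv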
+if (length(ARGS) < 1)
+    println("Usage: julia evaluate_cpu_gpu_exec.jl <input_file.csv>")
+    exit(1)
+end
+
+processes = [
+    "QED Process: 'ke->ke'",
+    "QED Process: 'ke->kke'",
+    "QED Process: 'ke->kkke'",
+    "QED Process: 'ke->kkkke'",
+    "QED Process: 'ke->kkkkke'",
+    #"QED Process: 'ke->kkkkkke'",
+    #"QED Process: 'ke->kkkkkkke'",
+    "ABC Process: 'AB->AB'",
+    "ABC Process: 'AB->ABBB'",
+    "ABC Process: 'AB->ABBBBB'",
+]
+
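+# count the photons (k) in the final state of a QED process string,
+# e.g. "QED Process: 'ke->kke'" -> 2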
+function proc_to_n(str::AbstractString)
+    parts = split(str, "'")
+    parts = split(parts[2], "->")
+    k_count = count(c -> c == 'k', parts[2])
+    return k_count
+end
+
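+# count the outgoing B particles of an ABC process string,
+# e.g. "ABC Process: 'AB->ABBB'" -> 3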
+function abc_proc_to_n(str::AbstractString)
+    parts = split(str, "'")
+    parts = split(parts[2], "->")
+    b_count = count(c -> c == 'B', parts[2])
+    return b_count
+end
+
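+# typeset a process string for plot titles, collapsing particle runs into powers,
+# e.g. "ke->kkke" becomes "ke \rightarrow k^3e"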
+function beautify_title(str::AbstractString)
+    parts = split(str, "'")
+
+    preprefix = parts[1]
+    infix = parts[2]
+    sufsuffix = parts[3]
+
+    parts = split(infix, "->")
+
+    prefix = parts[1]
+    suffix = parts[2]
+
+    k_count = count(c -> c == 'k', suffix)
+    B_count = count(c -> c == 'B', suffix)
+
+    if k_count == 1 || B_count == 1
+        new_suffix = suffix
+    elseif k_count >= 1
+        new_suffix = replace(suffix, r"k+" => "k^$k_count")
+    elseif B_count >= 1
+        new_suffix = replace(suffix, r"B+" => "B^$B_count")
+    else
+        new_suffix = suffix
+    end
+
+    return preprefix * L"%$prefix \rightarrow %$new_suffix" * sufsuffix
+end
+
+input_file = ARGS[1]
+df = CSV.read(input_file, DataFrame)
+n_inputs = df[:, "n_inputs"][1]
+
+
+
+title_string = "QED N-Photon Compton Scattering\nCalculate 10,000,000 Matrix Elements"
+
+df_filt = filter(:process_name => x -> proc_to_n(x) >= 1, df)
+
+df_filt.process_size = @. proc_to_n(df_filt.process_name)
+
+df_red = filter(:process_name => x -> match(r" reduced$", x) !== nothing, df_filt)
+
+@df df_red scatter(
+    :process_size,
+    :cpu_time,
+    yerror = :cpu_std,
+    label = "CPU execution time, 32 threads (s)",
+    markersize = 6,
+)
+@df df_red scatter!(
+    :process_size,
+    :gpu_time,
+    yerror = :gpu_std,
+    label = "GPU execution time, A100 80GB (s)",
+    markersize = 6,
+)
+
+plot!(
+    title = title_string,
+    yscale = :log10,
+    legend = :outerbottom,
+    legendcolumns = 2,
+    legend_font_pointsize = 10,
+    size = (800, 600),
+    ylabel = "time (s)",
+    xlabel = "process size (#)",
+)
+
+savefig("cpu_vs_gpu_qed.pdf")
+
+
+
+
+title_string = "\$AB\\rightarrow AB^n\$ ABC Processes\nCalculate 10,000,000 Matrix Elements"
+
+df_filt = filter(:process_name => x -> abc_proc_to_n(x) >= 1, df)
+
+df_filt.process_size = @. abc_proc_to_n(df_filt.process_name)
+
+df_red = filter(:process_name => x -> match(r" reduced$", x) !== nothing, df_filt)
+
+@df df_red scatter(
+    :process_size,
+    :cpu_time,
+    yerror = :cpu_std,
+    label = "CPU execution time, 32 threads (s)",
+    markersize = 6,
+)
+@df df_red scatter!(
+    :process_size,
+    :gpu_time,
+    yerror = :gpu_std,
+    label = "GPU execution time, A100 80GB (s)",
+    markersize = 6,
+)
+
+plot!(
+    title = title_string,
+    yscale = :log10,
+    legend = :outerbottom,
+    legendcolumns = 2,
+    legend_font_pointsize = 10,
+    size = (800, 600),
+    ylabel = "time (s)",
+    xlabel = "process size (#)",
+)
+
+savefig("cpu_vs_gpu_abc.pdf")
diff --git a/examples/full_node_bench.jl b/examples/full_node_bench.jl
index 351096a..e4b2efa 100644
--- a/examples/full_node_bench.jl
+++ b/examples/full_node_bench.jl
@@ -80,10 +80,13 @@ function cpu_worker(compute_func, inputs, chunk_size)
 end
 
 # called with a specific device selected
-function gpu_worker(compute_func, inputs, chunk_size)
+function gpu_worker(kernel!, inputs, chunk_size)
     global progress
     global gpu_chunks
     global lck
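+    # preallocate a reusable output buffer on this worker's GPU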
+    cuOutputs = CuVector{ComplexF64}(undef, chunk_size)
+
     quit = false
     work_start = 0
     work_end = 0
@@ -104,7 +107,9 @@ function gpu_worker(compute_func, inputs, chunk_size)
         end
 
         cuInputs = CuVector(inputs[work_start:work_end])
-        compute_func.(cuInputs)
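+        # one GPU thread per input; chunk_size must be divisible by the thread count ts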
+        ts = 32
+        bs = Int(chunk_size / ts)
+        CUDA.@sync @cuda threads = ts blocks = bs always_inline = true kernel!(cuInputs, cuOutputs, chunk_size)
     end
 
     #log("GPU Worker on Device $(CUDA.device()) finished!")
@@ -114,7 +119,7 @@ end
 
 cpu_gpu_ratio = Vector{Tuple{Int, Int}}()
 
-function full_compute(compute_func, inputs, chunk_size)
+function full_compute(compute_func, kernel!, inputs, chunk_size)
     global progress
     progress = 1
     global cpu_chunks
@@ -126,7 +131,7 @@ function full_compute(compute_func, inputs, chunk_size)
 
     for dev in CUDA.devices()
         t = Threads.@spawn device!(dev) do
-            gpu_worker(compute_func, inputs, chunk_size)
+            gpu_worker(kernel!, inputs, chunk_size)
             return nothing
         end
         push!(tasks, t)
@@ -145,12 +150,12 @@ function full_compute(compute_func, inputs, chunk_size)
     return nothing
 end
 
-function bench(compute_function, inputs, chunk_size)
+function bench(compute_function, kernel!, inputs, chunk_size)
     global cpu_gpu_ratio
     empty!(cpu_gpu_ratio)
 
     bench = @benchmark begin
-        full_compute($compute_function, $inputs, $chunk_size)
+        full_compute($compute_function, $kernel!, $inputs, $chunk_size)
     end gcsample = true seconds = 30
 
     time = median(bench.times) / 1e9
@@ -165,7 +170,7 @@ function bench(compute_function, inputs, chunk_size)
     return (time, rate, s, med_cpu_chunks, med_gpu_chunks)
 end
 
-function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, chunk_size, inputs)
+function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, kernel!, chunk_size, inputs)
     process_name = string(process)
     log("\n--- Benchmarking $(process_name) on $(nInputs) with chunk size $(chunk_size) ---")
 
@@ -173,7 +178,7 @@ function full_node_bench(process::MetagraphOptimization.AbstractProcessDescripti
     display.(CUDA.devices())
 
     log("Benchmarking full node...")
-    (time, rate, s, med_cpu_chunks, med_gpu_chunks) = bench(func, inputs, chunk_size)
+    (time, rate, s, med_cpu_chunks, med_gpu_chunks) = bench(func, kernel!, inputs, chunk_size)
     log(
         "Benchmarking complete with median time $(time), $(med_cpu_chunks) cpu chunks, and $(med_gpu_chunks) gpu chunks.",
     )
@@ -212,14 +217,14 @@ machine = Machine(
 )
 
 optimizer = ReductionOptimizer()
-processes = [#="ke->ke", "ke->kke", "ke->kkke", =#"ke->kkkke", "ke->kkkkke"]
+processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]
 
 for proc in processes
     process = parse_process(proc, QEDModel())
     graph = gen_graph(process)
     optimize_to_fixpoint!(optimizer, graph)
-    func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-
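+    # build both the CPU compute function and the CUDA kernel from the same optimized graph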
+    compute_func = get_compute_function(graph, process, machine)
+    kernel! = get_cuda_kernel(graph, process, machine)
 
     log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
     inputs = Vector{typeof(gen_process_input(process))}()
@@ -234,7 +239,7 @@ for proc in processes
     end
 
     for chunk_size in chunkSizes
-        full_node_bench(process, compute_func, chunk_size, inputs)
+        full_node_bench(process, compute_func, kernel!, chunk_size, inputs)
         CSV.write(results_filename, df)
     end
 end;
diff --git a/examples/qed_bench.jl b/examples/qed_bench.jl
index 07fc7dd..fbbf1d2 100644
--- a/examples/qed_bench.jl
+++ b/examples/qed_bench.jl
@@ -44,7 +44,7 @@ if isfile(results_filename)
     df = CSV.read(results_filename, DataFrame)
 end
 
-nInputs = 10_000_000
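+# 2^24 inputs: a power of two, so the input count divides evenly into 32-thread GPU blocks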
+nInputs = 2^24
 
 function cpu_bench(compute_function, inputs)
     bench = @benchmark begin
@@ -60,9 +60,15 @@ function cpu_bench(compute_function, inputs)
     return (time, rate, s)
 end
 
-function gpu_bench(compute_function, inputs)
+function gpu_bench(kernel!, inputs)
+    n = length(inputs)
+    outputs = CuVector{ComplexF64}(undef, n)
+    ts = 32
+    bs = Int(n / ts)
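+    # interpolate the locals with $ so BenchmarkTools' generated benchmark function can see them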
     bench = @benchmark begin
-        CUDA.@sync $compute_function.($inputs)
+        @cuda threads = $ts blocks = $bs always_inline = true $kernel!($inputs, $outputs, $n)
+        CUDA.device_synchronize()
     end gcsample = true seconds = 300
 
     time = median(bench.times) / 1e9
@@ -77,6 +83,7 @@ function bench_process(
     process_name::String,
     graph::DAG,
     func,
+    kernel!,
     gen_time::Float64,
     opt_time::Float64,
     func_time::Float64;
@@ -131,7 +138,7 @@ function bench_process(
         log("Benchmarking GPU...")
         gpu_name = "$(name(first(CUDA.devices())))"
         cuInputs = CuArray(inputs)
-        (time_gpu, rate_gpu, std_gpu) = gpu_bench(func, cuInputs)
+        (time_gpu, rate_gpu, std_gpu) = gpu_bench(kernel!, cuInputs)
         flops_gpu = (rate_gpu * NFLOPs) / 10^9
     else
         log("Skipping GPU...")
@@ -211,7 +218,8 @@ process = parse_process("ke->kke", QEDModel())
 gen_time = @elapsed graph = gen_graph(process)
 opt_time = @elapsed optimize!(optimizer, graph, 200)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "warmup", graph, compute_func, gen_time, opt_time, func_gen_time)
+kernel! = get_cuda_kernel(graph, process, machine)
+bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
 
 optimizer = ReductionOptimizer()
 
@@ -220,104 +228,45 @@ process = parse_process("AB->ABBB", ABCModel())
 gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
 opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "warmup", graph, compute_func, gen_time, opt_time, func_gen_time)
+kernel! = get_cuda_kernel(graph, process, machine)
+bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
 
 ## -- WARMUP END
 
 optimizer = ReductionOptimizer()
 
-# compton
-process = parse_process("ke->ke", QEDModel())
-gen_time = @elapsed graph = gen_graph(process)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
+processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke", "ke->kkkkkke"]
 
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
+for process_str in processes
+    # n-photon Compton scattering
+    process = parse_process(process_str, QEDModel())
+    gen_time = @elapsed graph = gen_graph(process)
+    func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+    kernel! = get_cuda_kernel(graph, process, machine)
+    bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
 
-CSV.write(results_filename, df)
+    opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
+    func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+    kernel! = get_cuda_kernel(graph, process, machine)
+    bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
 
-# 2-photon compton
-process = parse_process("ke->kke", QEDModel())
-gen_time = @elapsed graph = gen_graph(process)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
+    CSV.write(results_filename, df)
+end
 
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
+processes = ["AB->AB", "AB->ABBB", "AB->ABBBBB"]
 
-CSV.write(results_filename, df)
+for process_str in processes
+    # ABC model process
+    process = parse_process(process_str, ABCModel())
+    gen_time = @elapsed graph = parse_dag("input/$(process_str).txt", ABCModel())
+    func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+    kernel! = get_cuda_kernel(graph, process, machine)
+    bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
 
-# 3-photon compton
-process = parse_process("ke->kkke", QEDModel())
-gen_time = @elapsed graph = gen_graph(process)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
+    opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
+    func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+    kernel! = get_cuda_kernel(graph, process, machine)
+    bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
 
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
-
-CSV.write(results_filename, df)
-
-# 4-photon compton
-process = parse_process("ke->kkkke", QEDModel())
-gen_time = @elapsed graph = gen_graph(process)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time, use_gpu = false)
-
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
-
-CSV.write(results_filename, df)
-
-# 5-photon compton
-process = parse_process("ke->kkkkke", QEDModel())
-gen_time = @elapsed graph = gen_graph(process)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
-
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
-
-CSV.write(results_filename, df)
-
-# AB->AB
-process = parse_process("AB->AB", ABCModel())
-gen_time = @elapsed graph = parse_dag("input/AB->AB.txt", ABCModel())
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
-
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
-
-CSV.write(results_filename, df)
-
-# AB->AB^3
-process = parse_process("AB->ABBB", ABCModel())
-gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
-
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
-
-CSV.write(results_filename, df)
-
-# AB->AB^5
-process = parse_process("AB->ABBBBB", ABCModel())
-gen_time = @elapsed graph = parse_dag("input/AB->ABBBBB.txt", ABCModel())
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
-
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
-
-CSV.write(results_filename, df)
+    CSV.write(results_filename, df)
+end
diff --git a/examples/qed_bench_reduction_steps_gpu.jl b/examples/qed_bench_reduction_steps_gpu.jl
index 49c3a71..39e1239 100644
--- a/examples/qed_bench_reduction_steps_gpu.jl
+++ b/examples/qed_bench_reduction_steps_gpu.jl
@@ -37,6 +37,7 @@ function log(x...)
 end
 
 function bench(func, inputs)
+    # TODO: use the CUDA kernel instead of broadcasting, as qed_bench.jl does now
     gpu_compile_time = @elapsed func.(inputs[1:2])
 
     gpu_time = @benchmark $func.($inputs)
diff --git a/experiments/full_node.sh b/experiments/full_node.sh
index c046bfe..1e9462e 100755
--- a/experiments/full_node.sh
+++ b/experiments/full_node.sh
@@ -15,10 +15,11 @@ nvidia-smi > results/cuda_gpu_full_node.txt
 lsblk > results/storage_full_node.txt
 lspci > results/pci_full_node.txt
 
-#echo "Initiating julia..."
-#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1    # need current dev version of QEDprocesses
-#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1        # add requirements for the bench script
+echo "Initiating julia..."
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1    # need current dev version of QEDprocesses
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
+julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1        # add requirements for the bench script
 julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
 
 echo "Benchmarking Full Node 128 Threads + *GPUs*"
-julia --project --threads=128 examples/full_node_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
+julia --project -O3 --threads=128 examples/full_node_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
diff --git a/experiments/run_gen_diagram.sh b/experiments/run_gen_diagram.sh
index 7b04f55..6d5fd83 100755
--- a/experiments/run_gen_diagram.sh
+++ b/experiments/run_gen_diagram.sh
@@ -19,8 +19,9 @@ lspci > results/pci.txt
 
 echo "Initiating julia..."
 julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1    # need current dev version of QEDprocesses
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
 julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("BenchmarkTools"); Pkg.add("StatsBase")' >> $LOG_FILE 2>&1 || exit 1        # add requirements for the bench script
 
 echo "Benchmarking with $i threads..."
 
-julia --project --threads=$i examples/qed_gen_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
+julia --project -O3 --threads=$i examples/qed_gen_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
diff --git a/experiments/run_qed_exec.sh b/experiments/run_qed_exec.sh
index d735dc3..21fed28 100755
--- a/experiments/run_qed_exec.sh
+++ b/experiments/run_qed_exec.sh
@@ -18,13 +18,14 @@ nvidia-smi > results/cuda_gpu_$i.txt
 lsblk > results/storage_$i.txt
 lspci > results/pci_$i.txt
 
-#echo "Initiating julia..."
-#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1    # need current dev version of QEDprocesses
-#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1        # add requirements for the bench script
+echo "Initiating julia..."
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1    # need current dev version of QEDprocesses
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
+julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1        # add requirements for the bench script
 julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
 
 echo "Benchmarking $i Threads"
-julia --project --threads=$i examples/qed_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
+julia --project -O3 --threads=$i examples/qed_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
 
 echo "Benchmarking Tape variant $i Threads"
-julia --project --threads=$i examples/qed_bench_tape.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
+julia --project -O3 --threads=$i examples/qed_bench_tape.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
diff --git a/experiments/run_reduce_bench.sh b/experiments/run_reduce_bench.sh
index 48684b4..0135e1c 100755
--- a/experiments/run_reduce_bench.sh
+++ b/experiments/run_reduce_bench.sh
@@ -15,9 +15,10 @@ nvidia-smi > results/cuda_gpu_bench_reduce.txt
 lsblk > results/storage_bench_reduce.txt
 lspci > results/pci_bench_reduce.txt
 
-#echo "Initiating julia..."
-#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1    # need current dev version of QEDprocesses
-#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1        # add requirements for the bench script
+echo "Initiating julia..."
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1    # need current dev version of QEDprocesses
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
+julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1        # add requirements for the bench script
 
 echo "Benchmarking Reduction 32 Threads"
-julia --project --threads=32 examples/qed_bench_reduction_steps.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
+julia --project -O3 --threads=32 examples/qed_bench_reduction_steps.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
diff --git a/experiments/run_reduce_bench_gpu.sh b/experiments/run_reduce_bench_gpu.sh
index a980c4b..989bdc0 100755
--- a/experiments/run_reduce_bench_gpu.sh
+++ b/experiments/run_reduce_bench_gpu.sh
@@ -15,9 +15,10 @@ nvidia-smi > results/cuda_gpu_bench_reduce_gpu.txt
 lsblk > results/storage_bench_reduce_gpu.txt
 lspci > results/pci_bench_reduce_gpu.txt
 
-#echo "Initiating julia..."
-#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1    # need current dev version of QEDprocesses
-#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1        # add requirements for the bench script
+echo "Initiating julia..."
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1    # need current dev version of QEDprocesses
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
+julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1        # add requirements for the bench script
 julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
 
 echo "Benchmarking Reduction 32 Threads, *GPU*"
diff --git a/src/MetagraphOptimization.jl b/src/MetagraphOptimization.jl
index f4ea95f..f17c8b1 100644
--- a/src/MetagraphOptimization.jl
+++ b/src/MetagraphOptimization.jl
@@ -78,7 +78,7 @@ export gen_graph
 export execute
 export parse_dag, parse_process
 export gen_process_input
-export get_compute_function
+export get_compute_function, get_cuda_kernel
 export gen_tape, execute_tape
 
 # estimator