experiments (#1)
All checks were successful
MetagraphOptimization_CI / docs (push) Successful in 10m41s
MetagraphOptimization_CI / test (push) Successful in 30m40s

Co-authored-by: Anton Reinhard <anton.reinhard@proton.me>
Reviewed-on: #1
2024-05-08 12:03:27 +02:00
parent 82ed774b7e
commit 87dbaf2c32
155 changed files with 5372 additions and 1029 deletions


@@ -5,5 +5,6 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
MetagraphOptimization = "3e869610-d48d-4942-ba70-c1b702a33ca4"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
QEDbase = "10e22c08-3ccb-4172-bfcf-7d7aa3d04d93"
+QEDprocesses = "46de9c38-1bb3-4547-a1ec-da24d767fdad"
StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
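
Note: QEDprocesses is the dependency added by this hunk, so the examples environment has to be re-instantiated before running the scripts below. A minimal sketch, assuming this Project.toml lives in the examples/ directory (the path is an assumption):

using Pkg
Pkg.activate("examples")  # activate wherever this Project.toml actually lives
Pkg.instantiate()         # fetches QEDprocesses and the other listed dependencies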

examples/full_node_bench.jl (new file, 249 lines)

@@ -0,0 +1,249 @@
using MetagraphOptimization
using CUDA
using UUIDs
using DataFrames
using CSV
using Random
using BenchmarkTools
using Dates
using Base.Threads
using Statistics # median and std used below come from the Statistics stdlib
function log(x...)
println(now(), " ", join(x, " ")...)
flush(stdout)
return nothing
end
results_filename = "full_node_bench.csv"
df = DataFrame(
process_name = String[],
cpu_threads = Int[],
gpu_devices = Int[],
n_inputs = Int[],
chunk_size = Int[],
time = Float64[],
std = Float64[],
rate = Float64[],
cpu_chunks = Float64[],
gpu_chunks = Float64[],
memory_est = Float64[],
)
# if they exist, read existing results and append new ones
if isfile(results_filename)
df = CSV.read(results_filename, DataFrame)
end
nInputs = 2^26
lck = ReentrantLock()
progress = 1
cpu_chunks = 0
gpu_chunks = 0
chunkSizes = [1024, 4096, 16384, 65536, 262144, 1048576] # 2^10 to 2^20
function cpu_worker(compute_func, inputs, chunk_size)
global progress
global cpu_chunks
global lck
quit = false
work_start = 0
work_end = 0
while true
lock(lck) do
if progress >= nInputs
quit = true
else
work_start = progress
progress = progress + chunk_size
work_end = min(progress - 1, nInputs)
cpu_chunks = cpu_chunks + 1
#log("CPU Worker $(Threads.threadid()) computing $(cpu_chunks)th cpu chunk ($work_start, $work_end)")
end
end
if quit
break
end
for i in work_start:work_end
compute_func(inputs[i])
end
end
#log("CPU Worker on $(Threads.threadid()) finished!")
return nothing
end
# gpu_worker must be called with its target CUDA device already selected (full_compute wraps the call in device!(dev))
function gpu_worker(kernel!, inputs, chunk_size)
global progress
global gpu_chunks
global lck
cuOutputs = CuVector{ComplexF64}()
resize!(cuOutputs, chunk_size)
quit = false
work_start = 0
work_end = 0
while true
lock(lck) do
if progress >= nInputs
quit = true
else
work_start = progress
progress = progress + chunk_size
work_end = min(progress - 1, nInputs)
gpu_chunks = gpu_chunks + 1
#log("GPU Worker $(CUDA.device()) computing $(gpu_chunks)th gpu chunk ($work_start, $work_end)")
end
end
if quit
break
end
cuInputs = CuVector(inputs[work_start:work_end])
ts = 32
bs = Int(chunk_size / 32)
@cuda threads = ts blocks = bs always_inline = true kernel!(cuInputs, cuOutputs, chunk_size)
CUDA.device_synchronize()
end
#log("GPU Worker on Device $(CUDA.device()) finished!")
return nothing
end
cpu_gpu_ratio = Vector{Tuple{Int, Int}}()
function full_compute(compute_func, kernel!, inputs, chunk_size)
global progress
progress = 1
global cpu_chunks
cpu_chunks = 0
global gpu_chunks
gpu_chunks = 0
tasks = Vector()
for dev in CUDA.devices()
t = Threads.@spawn device!(dev) do
gpu_worker(kernel!, inputs, chunk_size)
return nothing
end
push!(tasks, t)
end
for i in 1:(Threads.nthreads() - length(CUDA.devices()))
t = Threads.@spawn cpu_worker(compute_func, inputs, chunk_size)
push!(tasks, t)
end
for t in tasks
wait(t)
end
push!(cpu_gpu_ratio, (cpu_chunks, gpu_chunks))
return nothing
end
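
Note: cpu_worker, gpu_worker, and full_compute above implement dynamic chunk scheduling: a shared progress counter behind a ReentrantLock hands out index ranges to whichever worker asks next, so faster devices naturally claim more chunks. A minimal CPU-only sketch of the same idea (the demo_* names are illustrative, not part of this script):

using Base.Threads

function demo_dynamic_chunks(n::Int, chunk_size::Int)
    lck = ReentrantLock()
    progress = Ref(1)                      # next unclaimed input index, shared by all workers
    partial = zeros(Float64, nthreads())   # one accumulator slot per task, so no races
    @sync for t in 1:nthreads()
        Threads.@spawn while true
            local work_start, work_end
            lock(lck) do                   # claim the next chunk atomically
                work_start = progress[]
                progress[] += chunk_size
                work_end = min(progress[] - 1, n)
            end
            work_start > n && break
            for i in work_start:work_end
                partial[t] += sin(i)       # stand-in for compute_func(inputs[i])
            end
        end
    end
    return sum(partial)
end

demo_dynamic_chunks(10_000, 256)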
function bench(compute_function, kernel!, inputs, chunk_size)
global cpu_gpu_ratio
empty!(cpu_gpu_ratio)
bench = @benchmark begin
full_compute($compute_function, $kernel!, $inputs, $chunk_size)
end gcsample = true seconds = 60
time = median(bench.times) / 1e9
s = std(bench.times) / 1e9
rate = length(inputs) / time
med_cpu_chunks = median(getindex.(cpu_gpu_ratio, 1))
med_gpu_chunks = median(getindex.(cpu_gpu_ratio, 2))
mem_estimate = bench.memory
log("CPU/GPU ratios: $(cpu_gpu_ratio)")
return (time, rate, s, med_cpu_chunks, med_gpu_chunks, mem_estimate)
end
function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, kernel!, chunk_size, inputs)
process_name = string(process)
log("\n--- Benchmarking $(process_name) on $(nInputs) inputs with chunk size $(chunk_size) ---")
log("Available CUDA devices:")
display.(CUDA.devices())
log("Benchmarking full node...")
(time, rate, s, med_cpu_chunks, med_gpu_chunks, mem_estimate) = bench(func, kernel!, inputs, chunk_size)
log(
"Benchmarking complete with median time $(time), $(med_cpu_chunks) cpu chunks, and $(med_gpu_chunks) gpu chunks.",
)
push!(
df,
Dict(
:process_name => process_name,
:cpu_threads => Threads.nthreads() - length(CUDA.devices()),
:gpu_devices => length(CUDA.devices()),
:n_inputs => nInputs,
:chunk_size => chunk_size,
:time => time,
:std => s,
:rate => rate,
:cpu_chunks => med_cpu_chunks,
:gpu_chunks => med_gpu_chunks,
:memory_est => mem_estimate,
),
)
return nothing
end
# use "mock" machine that only uses cpu for compilation
machine = Machine(
[
MetagraphOptimization.NumaNode(
0,
1,
MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
-1.0,
UUIDs.uuid1(),
),
],
[-1.0;;],
)
optimizer = ReductionOptimizer()
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]
for proc in processes
process = parse_process(proc, QEDModel())
graph = gen_graph(process)
optimize_to_fixpoint!(optimizer, graph)
compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
inputs = Vector{typeof(gen_process_input(process))}()
resize!(inputs, nInputs)
procs = Vector{typeof(process)}()
for i in 1:Threads.nthreads()
push!(procs, copy(process))
end
@inbounds Threads.@threads for i in eachindex(inputs)
inputs[i] = gen_process_input(procs[Threads.threadid()]) # each thread uses its own process copy
end
for chunk_size in chunkSizes
full_node_bench(process, compute_func, kernel!, chunk_size, inputs)
CSV.write(results_filename, df)
end
end;


@@ -34,9 +34,10 @@ function import_bench()
bench_txt("AB->ABBB.txt")
bench_txt("AB->ABBBBB.txt")
bench_txt("AB->ABBBBBBB.txt")
-#bench_txt("AB->ABBBBBBBBB.txt")
+bench_txt("AB->ABBBBBBBBB.txt")
bench_txt("ABAB->ABAB.txt")
-return bench_txt("ABAB->ABC.txt")
+bench_txt("ABAB->ABC.txt")
+return nothing
end
import_bench()


@@ -2,44 +2,117 @@ using MetagraphOptimization
using LIKWID
using CUDA
using UUIDs
+using DataFrames
+using CSV
+using Random
+using BenchmarkTools
+using Dates
-function cpu_bench(compute_function, inputs)
-compute_function.(inputs[begin:10]) # make sure it's compiled
-time = @elapsed Threads.@threads for i in eachindex(inputs)
-@invokelatest compute_function(inputs[i])
-end
-rate = length(inputs) / time
-return (time, rate)
-end
-function gpu_bench(compute_function, inputs)
-CUDA.@sync compute_function.(inputs[begin:10]) # make sure it's compiled
-time = @elapsed CUDA.@sync compute_function.(inputs)
-rate = length(inputs) / time
-return (time, rate)
-end
+DISABLE_GPU = false
+function log(x...)
+println(now(), " ", join(x, " ")...)
+return flush(stdout)
+end
+results_filename = "bench_results_$(Threads.nthreads()).csv"
+df = DataFrame(
+process_name = String[],
+graph_gen_time = Float64[],
+optimization_time = Float64[],
+function_generation_time = Float64[],
+graph_nodes = Int[],
+graph_edges = Int[],
+graph_mem = Float64[],
+cpu_threads = Int[],
+n_inputs = Int[],
+nflops_likwid = Int[],
+cpu_time = Float64[],
+cpu_std = Float64[],
+cpu_rate = Float64[],
+cpu_gflops = Float64[],
+gpu_name = String[],
+gpu_time = Float64[],
+gpu_std = Float64[],
+gpu_rate = Float64[],
+gpu_gflops = Float64[],
+)
+# if they exist, read existing results and append new ones
+if isfile(results_filename)
+df = CSV.read(results_filename, DataFrame)
+end
+nInputs = 2^20
+function cpu_bench(compute_function, inputs)
+bench = @benchmark begin
+@inbounds Threads.@threads for i in eachindex($inputs)
+@invokelatest $compute_function($inputs[i])
+end
+end gcsample = true samples = 20 evals = 1
+time = median(bench.times) / 1e9
+s = std(bench.times) / 1e9
+rate = length(inputs) / time
+return (time, rate, s)
+end
+function gpu_bench(kernel!, inputs)
+n = length(inputs)
+outputs = CuVector{ComplexF64}()
+resize!(outputs, n)
+ts = 32
+bs = Int(n / ts)
+bench = @benchmark begin
+@cuda threads = $ts blocks = $bs always_inline = true $kernel!($inputs, $outputs, $n)
+CUDA.device_synchronize()
+end gcsample = true samples = 20 evals = 1
+time = median(bench.times) / 1e9
+s = std(bench.times) / 1e9
+rate = length(inputs) / time
+return (time, rate, s)
+end
function bench_process(
process::MetagraphOptimization.AbstractProcessDescription,
+process_name::String,
+graph::DAG,
func,
-io::IO = stdout;
-use_likwid = true,
+kernel!,
+gen_time::Float64,
+opt_time::Float64,
+func_time::Float64;
+use_likwid = false,
+use_gpu = true,
)
-println(io, "\n--- Benchmarking $(process) ---")
+log("\n--- Benchmarking $(process_name) ---")
+if DISABLE_GPU
+use_gpu = false
+end
-NFLOPs = GraphProperties(graph).computeEffort
+graph_props = GraphProperties(graph)
+NFLOPs = graph_props.computeEffort
+nflops_likwid = 0
if use_likwid
input = gen_process_input(process)
func(input) # compile first
# get rid of annoying output to console
oldstd = stdout
redirect_stdout(devnull)
_, events = @perfmon "FLOPS_DP" func(input)
redirect_stdout(oldstd) # recover original stdout
NFLOPs = first(events["FLOPS_DP"])["RETIRED_SSE_AVX_FLOPS_ALL"]
+nflops_likwid = NFLOPs
end
-nInputs = 10000000 # ten million
-println(io, "Generating $nInputs inputs with $(Threads.nthreads()) threads...")
+log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
inputs = Vector{typeof(gen_process_input(process))}()
resize!(inputs, nInputs)
@@ -48,35 +121,76 @@ function bench_process(
push!(processes, copy(process))
end
-Threads.@threads for i in eachindex(inputs)
+@inbounds Threads.@threads for i in eachindex(inputs)
inputs[i] = gen_process_input(processes[Threads.threadid()])
end
-println(io, "Benchmarking CPU with $(Threads.nthreads()) threads...")
-(time_cpu, rate_cpu) = cpu_bench(func, inputs)
-flops_cpu = (rate_cpu * NFLOPs) / 1024^3
+log("Benchmarking CPU with $(Threads.nthreads()) threads...")
+(time_cpu, rate_cpu, std_cpu) = cpu_bench(func, inputs)
+flops_cpu = (rate_cpu * NFLOPs) / 10^9
-println(io, "Benchmarking GPU...")
-cuInputs = CuArray(inputs)
-(time_gpu, rate_gpu) = gpu_bench(func, cuInputs)
-flops_gpu = (rate_gpu * NFLOPs) / 1024^3
+time_gpu = 0.0
+std_gpu = 0.0
+rate_gpu = 0.0
+flops_gpu = 0.0
+gpu_name = "none"
+if use_gpu
+log("Benchmarking GPU...")
+gpu_name = "$(name(first(CUDA.devices())))"
+cuInputs = CuArray(inputs)
+(time_gpu, rate_gpu, std_gpu) = gpu_bench(kernel!, cuInputs)
+flops_gpu = (rate_gpu * NFLOPs) / 10^9
+else
+log("Skipping GPU...")
+end
-println(io, "\nBenchmark Summary for $(process):")
+log("\nBenchmark Summary for $(process):")
if use_likwid
-println(io, "Measured FLOPS by LIKWID: $NFLOPs")
+log("Measured FLOPS by LIKWID: $NFLOPs")
else
-println(io, "Total graph compute effort: $NFLOPs")
+log("Total graph compute effort: $NFLOPs")
end
-println(io, "Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
-println(io, "CPU, $(Threads.nthreads()) threads")
-println(io, " Time: $time_cpu")
-println(io, " Rate: $rate_cpu")
-println(io, " GFLOPS: $flops_cpu")
-println(io, "GPU, $(name(first(CUDA.devices())))")
-println(io, " Time: $time_gpu")
-println(io, " Rate: $rate_gpu")
-return println(io, " GFLOPS: $flops_gpu")
+log("Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
+log("CPU, $(Threads.nthreads()) threads")
+log(" Time: $time_cpu")
+log(" Rate: $rate_cpu")
+log(" GFLOPS: $flops_cpu")
+if use_gpu
+log("GPU, $gpu_name")
+log(" Time: $time_gpu")
+log(" Rate: $rate_gpu")
+log(" GFLOPS: $flops_gpu")
+end
+if (process_name != "warmup")
+push!(
+df,
+Dict(
+:process_name => process_name,
+:graph_gen_time => gen_time,
+:optimization_time => opt_time,
+:function_generation_time => func_time,
+:graph_nodes => graph_props.noNodes,
+:graph_edges => graph_props.noEdges,
+:graph_mem => MetagraphOptimization.mem(graph),
+:cpu_threads => Threads.nthreads(),
+:n_inputs => nInputs,
+:nflops_likwid => nflops_likwid,
+:cpu_time => time_cpu,
+:cpu_std => std_cpu,
+:cpu_rate => rate_cpu,
+:cpu_gflops => flops_cpu,
+:gpu_name => gpu_name,
+:gpu_time => time_gpu,
+:gpu_std => std_gpu,
+:gpu_rate => rate_gpu,
+:gpu_gflops => flops_gpu,
+),
+)
+end
+return nothing
end
# use "mock" machine that only uses cpu
@@ -92,57 +206,67 @@ machine = Machine(
],
[-1.0;;],
)
-optimizer = ReductionOptimizer()
+# sadly, these cannot be moved into functions: the world age must increase after a generated function is created, and that only happens in global scope
-# compton
-process = parse_process("ke->ke", QEDModel())
-graph = gen_graph(process)
-optimize_to_fixpoint!(optimizer, graph)
-compute_func = get_compute_function(graph, process, machine)
-bench_process(process, compute_func)
+## -- WARMUP TO COMPILE FUNCTIONS first
+#=
+optimizer = RandomWalkOptimizer(MersenneTwister(0))
# 2-photon compton
process = parse_process("ke->kke", QEDModel())
-graph = gen_graph(process)
-optimize_to_fixpoint!(optimizer, graph)
-compute_func = get_compute_function(graph, process, machine)
-bench_process(process, compute_func)
+gen_time = @elapsed graph = gen_graph(process)
+opt_time = @elapsed optimize!(optimizer, graph, 200)
+func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+kernel! = get_cuda_kernel(graph, process, machine)
+bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
-# 3-photon compton
-process = parse_process("ke->kkke", QEDModel())
-graph = gen_graph(process)
-optimize_to_fixpoint!(optimizer, graph)
-compute_func = get_compute_function(graph, process, machine)
-bench_process(process, compute_func)
-# AB->AB
-process = parse_process("AB->AB", ABCModel())
-graph = parse_dag("input/AB->AB.txt", ABCModel())
-optimize_to_fixpoint!(optimizer, graph)
-compute_func = get_compute_function(graph, process, machine)
-bench_process(process, compute_func)
+optimizer = ReductionOptimizer()
# AB->AB^3
process = parse_process("AB->ABBB", ABCModel())
-graph = parse_dag("input/AB->ABBB.txt", ABCModel())
-optimize_to_fixpoint!(optimizer, graph)
-compute_func = get_compute_function(graph, process, machine)
-bench_process(process, compute_func)
+gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
+opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
+func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+kernel! = get_cuda_kernel(graph, process, machine)
+bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
+=#
+## -- WARMUP END
+exit(0)
+optimizer = ReductionOptimizer()
-# 4-photon compton
-process = parse_process("ke->kkkke", QEDModel())
-graph = gen_graph(process)
-optimize_to_fixpoint!(optimizer, graph)
-compute_func = get_compute_function(graph, process, machine)
-bench_process(process, compute_func)
+processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]
-# AB->AB^5
-process = parse_process("AB->ABBBBB", ABCModel())
-graph = parse_dag("input/AB->ABBBBB.txt", ABCModel())
-optimize_to_fixpoint!(optimizer, graph)
-compute_func = get_compute_function(graph, process, machine)
-bench_process(process, compute_func)
+for process_str in processes
+# compton
+process = parse_process(process_str, QEDModel())
+gen_time = @elapsed graph = gen_graph(process)
+func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+kernel! = get_cuda_kernel(graph, process, machine)
+bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
+opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
+func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+kernel! = get_cuda_kernel(graph, process, machine)
+bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
+CSV.write(results_filename, df)
+end
+processes = ["AB->AB", "AB->ABBB", "AB->ABBBBB", "AB->ABBBBBBB"]
+for process_str in processes
+# AB->AB
+process = parse_process(process_str, ABCModel())
+gen_time = @elapsed graph = parse_dag("input/$(process_str).txt", ABCModel())
+func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+kernel! = get_cuda_kernel(graph, process, machine)
+bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
+opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
+func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+kernel! = get_cuda_kernel(graph, process, machine)
+bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
+CSV.write(results_filename, df)
+end
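
Note on the world-age comment above: a function generated at runtime (as get_compute_function does) is only callable from code compiled in a newer world, which is why this driver lives in global scope. A minimal sketch of the effect and the usual escape hatch, Base.invokelatest (generic names, not this package's API):

function call_generated()
    f = eval(:(x -> x + 1))          # stand-in for a freshly generated compute function
    # f(1) here would raise a MethodError: call_generated was compiled in an older world
    return Base.invokelatest(f, 1)   # dispatches in the latest world instead
end

call_generated()  # == 2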


@@ -0,0 +1,163 @@
using MetagraphOptimization
using CUDA
using UUIDs
using BenchmarkTools
using DataFrames
using CSV
using Statistics # mean used in bench() comes from the Statistics stdlib
results_filename = "bench_results_reduction_steps.csv"
df = DataFrame(
threads = Int[],
process = String[],
operations = Int[],
cumulative_optimization_time = Float64[],
graph_nodes = Int[],
graph_edges = Int[],
graph_ce = Float64[],
graph_dt = Float64[],
graph_ci = Float64[],
gen_func_t = Float64[],
cpu_compile_t = Float64[],
cpu_st_t = Float64[],
cpu_mt_t = Float64[],
gpu_compile_t = Float64[],
gpu_t = Float64[],
)
# if they exist, read existing results and append new ones
if isfile(results_filename)
df = CSV.read(results_filename, DataFrame)
end
function bench(func, inputs)
compile_time = @elapsed func(inputs[1])
single_thread = @benchmark $func.($inputs)
multi_threaded = @benchmark Threads.@threads for i in eachindex($inputs)
$func($inputs[i])
end
return (
cpu_compile_time = compile_time,
gpu_compile_time = 0.0,
cpu_single_thread_time = mean(single_thread.times) / 1e9,
cpu_multi_thread_time = mean(multi_threaded.times) / 1e9,
gpu_time = 0.0,
)
end
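
Note: the $ interpolation in the @benchmark calls above matters; without it, func and inputs would be read as non-constant globals on every sample and the dynamic-dispatch overhead would pollute the timings. An illustrative comparison, following BenchmarkTools' documented behavior:

using BenchmarkTools

xs = rand(1_000)
@benchmark sum(xs)    # times include the non-constant global lookup
@benchmark sum($xs)   # value is interpolated; only sum() itself is measured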
# preparation of machine
machine = Machine(
[
MetagraphOptimization.NumaNode(
0,
1,
MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
-1.0,
UUIDs.uuid1(),
),
],
[-1.0;;],
)
# bench and produce data
n_inputs = 50_000
optimizer = ReductionOptimizer()
processes = [("ke->kke", 5), ("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1), ("ke->kkkke", 1), ("ke->kkkkke", 1)]
for (process_str, STEPSIZE) in processes
n = 0
opt_time_cum = 0
process = parse_process(process_str, QEDModel())
graph = gen_graph(process)
inputs = [gen_process_input(process) for _ in 1:n_inputs]
get_compute_function(graph, process, machine)
while true
func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
res = bench(func, inputs)
graph_properties = get_properties(graph)
push!(
df,
(
Threads.nthreads(),
process_str,
n,
opt_time_cum,
graph_properties.noNodes,
graph_properties.noEdges,
graph_properties.computeEffort,
graph_properties.data,
graph_properties.computeIntensity,
func_gen_time,
res.cpu_compile_time,
res.cpu_single_thread_time,
res.cpu_multi_thread_time,
res.gpu_compile_time,
res.gpu_time,
),
)
CSV.write(results_filename, df)
if fixpoint_reached(optimizer, graph)
break
end
opt_time_cum += @elapsed optimize!(optimizer, graph, STEPSIZE)
n += STEPSIZE
end
end
CSV.write(results_filename, df)
for (process_str, STEPSIZE) in [("AB->AB", 1), ("AB->ABBB", 1), ("AB->ABBBBB", 1)]
n = 0
opt_time_cum = 0
process = parse_process(process_str, ABCModel())
graph = parse_dag("input/$process_str.txt", ABCModel())
inputs = [gen_process_input(process) for _ in 1:n_inputs]
get_compute_function(graph, process, machine)
while true
func_gen_time = @elapsed func = get_compute_function(graph, process, machine)
res = bench(func, inputs)
graph_properties = get_properties(graph)
push!(
df,
(
Threads.nthreads(),
process_str,
n,
opt_time_cum,
graph_properties.noNodes,
graph_properties.noEdges,
graph_properties.computeEffort,
graph_properties.data,
graph_properties.computeIntensity,
func_gen_time,
res.cpu_compile_time,
res.cpu_single_thread_time,
res.cpu_multi_thread_time,
res.gpu_compile_time,
res.gpu_time,
),
)
CSV.write(results_filename, df)
if fixpoint_reached(optimizer, graph)
break
end
opt_time_cum += @elapsed optimize!(optimizer, graph, STEPSIZE)
n += STEPSIZE
end
end
CSV.write(results_filename, df)


@@ -0,0 +1,208 @@
using MetagraphOptimization
using CUDA
using UUIDs
using BenchmarkTools
using DataFrames
using CSV
using Dates
using Statistics # median and std used in bench() come from the Statistics stdlib
results_filename = "bench_results_reduction_steps_gpu.csv"
df = DataFrame(
threads = Int[],
process = String[],
operations = Int[],
cumulative_optimization_time = Float64[],
graph_nodes = Int[],
graph_edges = Int[],
graph_ce = Float64[],
graph_dt = Float64[],
graph_ci = Float64[],
cpu_st_t = Float64[],
cpu_st_s = Float64[],
cpu_mt_t = Float64[],
cpu_mt_s = Float64[],
cpu_mem = Float64[],
gpu_t = Float64[],
gpu_s = Float64[],
gpu_mem = Float64[],
)
# if they exist, read existing results and append new ones
if isfile(results_filename)
df = CSV.read(results_filename, DataFrame)
end
function log(x...)
println(now(), " ", join(x, " ")...)
return flush(stdout)
end
function bench(func, kernel!, inputs)
# gpu part
n = length(inputs)
cu_inputs = CuVector(inputs)
cu_outputs = CuVector{ComplexF64}()
resize!(cu_outputs, n)
ts = 32
bs = Int(n / ts)
bench = @benchmark begin
@cuda threads = $ts blocks = $bs always_inline = true $kernel!($cu_inputs, $cu_outputs, $n)
CUDA.device_synchronize()
end gcsample = true samples = 20 evals = 1
gpu_time = median(bench.times) / 1e9
gpu_std = std(bench.times) / 1e9
gpu_mem = bench.memory
# cpu part
single_thread = @benchmark $func.($inputs)
multi_threaded = @benchmark Threads.@threads for i in eachindex($inputs)
$func($inputs[i])
end
cpu_st_time = median(single_thread.times) / 1e9
cpu_st_std = std(single_thread.times) / 1e9
cpu_mt_time = median(multi_threaded.times) / 1e9
cpu_mt_std = std(multi_threaded.times) / 1e9
cpu_mem = single_thread.memory # memory estimate from the single-threaded run, not a time statistic
return (
cpu_single_thread_time = cpu_st_time,
cpu_single_thread_std = cpu_st_std,
cpu_multi_thread_time = cpu_mt_time,
cpu_multi_thread_std = cpu_mt_std,
cpu_mem = cpu_mem,
gpu_time = gpu_time,
gpu_std = gpu_std,
gpu_mem = gpu_mem,
)
end
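
Note: the launch configuration above (ts = 32, bs = Int(n / ts)) assumes n is an exact multiple of 32, which holds for the power-of-two n_inputs used here. For arbitrary n, the usual CUDA.jl pattern rounds the block count up and bounds-checks inside the kernel; a sketch with a stand-in kernel body (guarded_kernel! is illustrative, not the generated kernel):

using CUDA

function guarded_kernel!(inputs, outputs, n)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= n                       # guard the tail threads of the last block
        @inbounds outputs[i] = 2 * inputs[i]
    end
    return nothing
end

# launch with a rounded-up grid: @cuda threads = 32 blocks = cld(n, 32) guarded_kernel!(...)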
log("Available CUDA devices:")
for dev in CUDA.devices()
display(dev)
end
# preparation of machine
machine = Machine(
[
MetagraphOptimization.NumaNode(
0,
1,
MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
-1.0,
UUIDs.uuid1(),
),
],
[-1.0;;],
)
# bench and produce data
n_inputs = 2^16
optimizer = ReductionOptimizer()
processes = [("ke->ke", 1), ("ke->kke", 1), ("ke->kkke", 1), ("ke->kkkke", 5)]
for (process_str, STEPSIZE) in processes
n = 0
opt_time_cum = 0
process = parse_process(process_str, QEDModel())
graph = gen_graph(process)
inputs = Vector([gen_process_input(process) for _ in 1:n_inputs])
get_compute_function(graph, process, machine)
while true
func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
res = bench(func, kernel!, inputs)
graph_properties = get_properties(graph)
push!(
df,
(
Threads.nthreads(),
process_str,
n,
opt_time_cum,
graph_properties.noNodes,
graph_properties.noEdges,
graph_properties.computeEffort,
graph_properties.data,
graph_properties.computeIntensity,
res.cpu_single_thread_time,
res.cpu_single_thread_std,
res.cpu_multi_thread_time,
res.cpu_multi_thread_std,
res.cpu_mem,
res.gpu_time,
res.gpu_std,
res.gpu_mem,
),
)
CSV.write(results_filename, df)
if fixpoint_reached(optimizer, graph)
break
end
opt_time_cum += @elapsed optimize!(optimizer, graph, STEPSIZE)
n += STEPSIZE
end
end
CSV.write(results_filename, df)
for (process_str, STEPSIZE) in [("AB->AB", 1), ("AB->ABBB", 1), ("AB->ABBBBB", 1)]
n = 0
opt_time_cum = 0
process = parse_process(process_str, ABCModel())
graph = parse_dag("input/$process_str.txt", ABCModel())
inputs = Vector([gen_process_input(process) for _ in 1:n_inputs])
get_compute_function(graph, process, machine)
while true
func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
res = bench(func, kernel!, inputs)
graph_properties = get_properties(graph)
push!(
df,
(
Threads.nthreads(),
process_str,
n,
opt_time_cum,
graph_properties.noNodes,
graph_properties.noEdges,
graph_properties.computeEffort,
graph_properties.data,
graph_properties.computeIntensity,
res.cpu_single_thread_time,
res.cpu_single_thread_std,
res.cpu_multi_thread_time,
res.cpu_multi_thread_std,
res.cpu_mem,
res.gpu_time,
res.gpu_std,
res.gpu_mem,
),
)
CSV.write(results_filename, df)
if fixpoint_reached(optimizer, graph)
break
end
opt_time_cum += @elapsed optimize!(optimizer, graph, STEPSIZE)
n += STEPSIZE
end
end
CSV.write(results_filename, df)

examples/qed_bench_tape.jl (new file, 232 lines)

@@ -0,0 +1,232 @@
using MetagraphOptimization
using LIKWID
using UUIDs
using DataFrames
using CSV
using Random
using BenchmarkTools
using Dates
using Statistics # mean and std used in cpu_bench come from the Statistics stdlib
function log(x...)
println(now(), " ", join(x, " ")...)
return flush(stdout)
end
results_filename = "bench_results_tape_$(Threads.nthreads()).csv"
df = DataFrame(
process_name = String[],
graph_gen_time = Float64[],
optimization_time = Float64[],
function_generation_time = Float64[],
graph_nodes = Int[],
graph_edges = Int[],
graph_mem = Float64[],
cpu_threads = Int[],
n_inputs = Int[],
nflops_likwid = Int[],
cpu_time = Float64[],
cpu_rate = Float64[],
cpu_gflops = Float64[],
cpu_std = Float64[],
gpu_name = String[],
gpu_time = Float64[],
gpu_std = Float64[],
gpu_rate = Float64[],
gpu_gflops = Float64[],
)
# if they exist, read existing results and append new ones
if isfile(results_filename)
df = CSV.read(results_filename, DataFrame)
end
nInputs = 1_000_000
# use "mock" machine that only uses cpu
machine = Machine(
[
MetagraphOptimization.NumaNode(
0,
1,
MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
-1.0,
UUIDs.uuid1(),
),
],
[-1.0;;],
)
function cpu_bench(tape, inputs)
bench = @benchmark begin
@inbounds Threads.@threads for i in eachindex($inputs)
execute_tape($tape, $inputs[i])
end
end gcsample = true seconds = 300
time = mean(bench.times) / 1e9
s = std(bench.times) / 1e9
rate = length(inputs) / time
return (time, rate, s)
end
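
Note: unlike qed_bench.jl above, this script benchmarks the tape path: gen_tape lowers the DAG into a precomputed instruction schedule once, and execute_tape then interprets that schedule per input instead of compiling one large generated function. Minimal usage in this script's own terms (graph, process, and machine as constructed elsewhere in the file):

tape = gen_tape(graph, process, machine)                  # build the schedule once
result = execute_tape(tape, gen_process_input(process))   # interpret it per input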
function bench_process(
process::MetagraphOptimization.AbstractProcessDescription,
process_name::String,
graph::DAG,
gen_time::Float64,
opt_time::Float64,
io::IO = stdout;
use_likwid = false,
)
log("\n--- Benchmarking $(process_name) ---")
func_time = @elapsed tape = gen_tape(graph, process, machine)
graph_props = GraphProperties(graph)
NFLOPs = graph_props.computeEffort
nflops_likwid = 0
if use_likwid
input = gen_process_input(process)
# get rid of annoying output to console
oldstd = stdout
redirect_stdout(devnull)
_, events = @perfmon "FLOPS_DP" execute_tape(tape, input)
redirect_stdout(oldstd) # recover original stdout
NFLOPs = first(events["FLOPS_DP"])["RETIRED_SSE_AVX_FLOPS_ALL"]
nflops_likwid = NFLOPs
end
log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
inputs = Vector{typeof(gen_process_input(process))}()
resize!(inputs, nInputs)
processes = Vector{typeof(process)}()
for i in 1:Threads.nthreads()
push!(processes, copy(process))
end
@inbounds Threads.@threads for i in eachindex(inputs)
inputs[i] = gen_process_input(processes[Threads.threadid()]) # each thread uses its own process copy
end
log("Benchmarking CPU with $(Threads.nthreads()) threads...")
(time_cpu, rate_cpu, std_cpu) = cpu_bench(tape, inputs)
flops_cpu = (rate_cpu * NFLOPs) / 10^9
log("\nBenchmark Summary for $(process):")
if use_likwid
log("Measured FLOPS by LIKWID: $NFLOPs")
else
log("Total graph compute effort: $NFLOPs")
end
log("Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
log("CPU, $(Threads.nthreads()) threads")
log(" Time: $time_cpu")
log(" Rate: $rate_cpu")
log(" GFLOPS: $flops_cpu")
if (process_name != "warmup")
push!(
df,
Dict(
:process_name => process_name,
:graph_gen_time => gen_time,
:optimization_time => opt_time,
:function_generation_time => func_time,
:graph_nodes => graph_props.noNodes,
:graph_edges => graph_props.noEdges,
:graph_mem => MetagraphOptimization.mem(graph),
:cpu_threads => Threads.nthreads(),
:n_inputs => nInputs,
:nflops_likwid => nflops_likwid,
:cpu_time => time_cpu,
:cpu_std => std_cpu,
:cpu_rate => rate_cpu,
:cpu_gflops => flops_cpu,
:gpu_name => "none",
:gpu_time => 0.0,
:gpu_std => 0.0,
:gpu_rate => 0.0,
:gpu_gflops => 0.0,
),
)
end
return nothing
end
function bench_qed(process_string::String, skip_unoptimized = false)
optimizer = ReductionOptimizer()
process = parse_process(process_string, QEDModel())
gen_time = @elapsed graph = gen_graph(process)
opt_time = 0.0
if !skip_unoptimized
bench_process(process, "$process not optimized tape", graph, gen_time, opt_time)
end
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
bench_process(process, "$process reduced tape", graph, gen_time, opt_time)
return nothing
end
function bench_abc(process_string::String)
optimizer = ReductionOptimizer()
process = parse_process(process_string, ABCModel())
gen_time = @elapsed graph = parse_dag("input/$process_string.txt", ABCModel())
bench_process(process, "$process not optimized tape", graph, gen_time, 0.0)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
bench_process(process, "$process reduced tape", graph, gen_time, opt_time)
return nothing
end
# sadly, these cannot be moved into functions: the world age must increase after a generated function is created, and that only happens in global scope
## -- WARMUP TO COMPILE FUNCTIONS first
optimizer = ReductionOptimizer()
process = parse_process("ke->kke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
bench_process(process, "warmup", graph, gen_time, opt_time)
# AB->AB^3
process = parse_process("AB->ABBB", ABCModel())
gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
bench_process(process, "warmup", graph, gen_time, opt_time)
## -- WARMUP END
# compton
bench_qed("ke->ke")
CSV.write(results_filename, df)
bench_qed("ke->kke")
CSV.write(results_filename, df)
bench_qed("ke->kkke")
CSV.write(results_filename, df)
bench_qed("ke->kkkke")
CSV.write(results_filename, df)
bench_qed("ke->kkkkke")
CSV.write(results_filename, df)
bench_qed("ke->kkkkkke")
CSV.write(results_filename, df)
bench_qed("ke->kkkkkkke")
CSV.write(results_filename, df)
bench_abc("AB->AB")
CSV.write(results_filename, df)
bench_abc("AB->ABBB")
CSV.write(results_filename, df)
bench_abc("AB->ABBBBB")
CSV.write(results_filename, df)

examples/qed_gen_bench.jl (new file, 144 lines)

@@ -0,0 +1,144 @@
using MetagraphOptimization
using DataFrames
using CSV
using BenchmarkTools
using StatsBase
results_filename = "qed_gen_results_$(Threads.nthreads()).csv"
df = DataFrame(
process_name = String[],
cpu_threads = Int[],
graph_gen_samples = Int[],
graph_gen_mean = Float64[],
graph_gen_std = Float64[],
graph_gen_median = Float64[],
graph_nodes = Int[],
graph_data_nodes = Int[],
graph_u_nodes = Int[],
graph_v_nodes = Int[],
graph_s1_nodes = Int[],
graph_s2_nodes = Int[],
graph_edges = Int[],
graph_nodes_reduced = Int[],
graph_data_nodes_reduced = Int[],
graph_u_nodes_reduced = Int[],
graph_v_nodes_reduced = Int[],
graph_s1_nodes_reduced = Int[],
graph_s2_nodes_reduced = Int[],
graph_edges_reduced = Int[],
graph_mem = Float64[],
graph_mem_reduced = Float64[],
graph_elapsed_reduce = Float64[],
)
function bench_process(process::AbstractString; warmup = false, optimize = true)
println("Benchmarking $process...")
model = QEDModel()
proc = parse_process(process, model)
gen_bench = @benchmark gen_graph($proc) gcsample = true seconds = 5
graph = gen_graph(proc)
props = GraphProperties(graph)
node_dict = countmap(typeof.(graph.nodes))
graph_size = Base.summarysize(graph)
reduce_elapsed = -1.0
node_dict_reduced = Dict()
graph_size_reduced = -1.0
props_reduced = GraphProperties()
if optimize
reduce_elapsed = @elapsed optimize_to_fixpoint!(ReductionOptimizer(), graph)
props_reduced = GraphProperties(graph)
node_dict_reduced = countmap(typeof.(graph.nodes))
graph_size_reduced = Base.summarysize(graph)
end
if warmup
return nothing
end
push!(
df,
Dict(
:process_name => process,
:cpu_threads => Threads.nthreads(),
:graph_gen_samples => length(gen_bench.times),
:graph_gen_mean => mean(gen_bench.times),
:graph_gen_std => std(gen_bench.times),
:graph_gen_median => median(gen_bench.times),
:graph_nodes => props.noNodes,
:graph_data_nodes => get(node_dict, DataTaskNode{DataTask}, 0),
:graph_u_nodes => get(node_dict, ComputeTaskNode{ComputeTaskQED_U}, 0),
:graph_v_nodes => get(node_dict, ComputeTaskNode{ComputeTaskQED_V}, 0),
:graph_s1_nodes => get(node_dict, ComputeTaskNode{ComputeTaskQED_S1}, 0),
:graph_s2_nodes => get(node_dict, ComputeTaskNode{ComputeTaskQED_S2}, 0),
:graph_edges => props.noEdges,
:graph_nodes_reduced => props_reduced.noNodes,
:graph_data_nodes_reduced => get(node_dict_reduced, DataTaskNode{DataTask}, 0),
:graph_u_nodes_reduced => get(node_dict_reduced, ComputeTaskNode{ComputeTaskQED_U}, 0),
:graph_v_nodes_reduced => get(node_dict_reduced, ComputeTaskNode{ComputeTaskQED_V}, 0),
:graph_s1_nodes_reduced => get(node_dict_reduced, ComputeTaskNode{ComputeTaskQED_S1}, 0),
:graph_s2_nodes_reduced => get(node_dict_reduced, ComputeTaskNode{ComputeTaskQED_S2}, 0),
:graph_edges_reduced => props_reduced.noEdges,
:graph_mem => graph_size,
:graph_mem_reduced => graph_size_reduced,
:graph_elapsed_reduce => reduce_elapsed,
),
)
return nothing
end
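
Note: node_dict above tallies the graph's nodes by concrete type using countmap from StatsBase; a quick illustration of what it returns:

using StatsBase

countmap(typeof.(Any[1, 2.0, "x"]))
# Dict{DataType, Int64} with 3 entries: String => 1, Float64 => 1, Int64 => 1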
processes = [
("ke->ke", true),
("ke->kke", true),
("ke->kkke", true),
("ke->kkkke", true),
("ke->kkkkke", true),
("ke->kkkkkke", true),
("ke->kkkkkkke", true),
#("ke->kkkkkkkke", false),
#("ke->kkkkkkkkke", false),
]
df = DataFrame(
process_name = String[],
cpu_threads = Int[],
graph_gen_samples = Int[],
graph_gen_mean = Float64[],
graph_gen_std = Float64[],
graph_gen_median = Float64[],
graph_nodes = Int[],
graph_data_nodes = Int[],
graph_u_nodes = Int[],
graph_v_nodes = Int[],
graph_s1_nodes = Int[],
graph_s2_nodes = Int[],
graph_edges = Int[],
graph_nodes_reduced = Int[],
graph_data_nodes_reduced = Int[],
graph_u_nodes_reduced = Int[],
graph_v_nodes_reduced = Int[],
graph_s1_nodes_reduced = Int[],
graph_s2_nodes_reduced = Int[],
graph_edges_reduced = Int[],
graph_mem = Float64[],
graph_mem_reduced = Float64[],
graph_elapsed_reduce = Float64[],
)
# if they exist, read existing results and append new ones
if isfile(results_filename)
df = CSV.read(results_filename, DataFrame)
end
bench_process("ke->kke", warmup = true)
for (process, opt) in processes
bench_process(process, optimize = opt)
CSV.write(results_filename, df)
end

File diff suppressed because one or more lines are too long