using MetagraphOptimization
using LIKWID
using CUDA
using UUIDs

"""
    cpu_bench(compute_function, inputs)

Benchmark `compute_function` on the CPU by calling it once for every element of `inputs`,
with the iterations distributed over the available threads by `Threads.@threads`.

Before the timed run, the function is broadcast over (at most) the first 10 inputs so
that compilation is not part of the measurement.

Return the tuple `(time, rate)`, where `time` is the elapsed time of the timed run in
seconds and `rate` is the number of inputs processed per second.
"""
function cpu_bench(compute_function, inputs)
    # Warm-up on at most 10 inputs to trigger compilation. The upper bound is clamped
    # to the last index so that collections shorter than 10 do not raise a BoundsError.
    compute_function.(inputs[begin:min(end, begin + 9)])

    time = @elapsed Threads.@threads for i in eachindex(inputs)
        @invokelatest compute_function(inputs[i])
    end
    rate = length(inputs) / time
    return (time, rate)
end

"""
    gpu_bench(compute_function, inputs)

Benchmark `compute_function` by broadcasting it over all of `inputs` in a single call.
`inputs` is expected to be a GPU array (the caller in this file passes a `CuArray`).

Both the warm-up and the timed broadcast are wrapped in `CUDA.@sync`, so the measured
time covers the complete broadcast rather than only its launch.

Return the tuple `(time, rate)`, where `time` is the elapsed time of the timed run in
seconds and `rate` is the number of inputs processed per second.
"""
function gpu_bench(compute_function, inputs)
    # Warm-up on at most 10 inputs to trigger compilation. The upper bound is clamped
    # to the last index so that collections shorter than 10 do not raise a BoundsError.
    CUDA.@sync compute_function.(inputs[begin:min(end, begin + 9)])

    time = @elapsed CUDA.@sync compute_function.(inputs)
    rate = length(inputs) / time

    return (time, rate)
end

"""
    bench_process(process, func, io::IO = stdout; use_likwid = true)

Benchmark the compute function `func` for `process` on the CPU and on the GPU and print
a summary to `io`.

The benchmark generates ten million inputs with `gen_process_input`, times `func` over
them with `cpu_bench`, copies them into a `CuArray` and times `func` again with
`gpu_bench`.

# Arguments
- `process`: the process description used to generate inputs.
- `func`: the function under test; it is called with a single generated input.
- `io`: the stream all progress and summary lines are written to.

# Keywords
- `use_likwid`: if `true`, the number of floating point operations of one call to `func`
  is taken from a LIKWID `"FLOPS_DP"` measurement. If `false`, the `computeEffort` of
  the **global** variable `graph` is used instead, so `graph` must be defined at top
  level and belong to `process` when calling with `use_likwid = false`.

Return `nothing`.
"""
function bench_process(
    process::MetagraphOptimization.AbstractProcessDescription,
    func,
    io::IO = stdout;
    use_likwid = true,
)
    println(io, "\n--- Benchmarking $(process) ---")

    if use_likwid
        input = gen_process_input(process)
        func(input) # compile first, so that compilation is not part of the measurement
        _, events = @perfmon "FLOPS_DP" func(input)
        # use the event counter reported in the first entry of the FLOPS_DP group
        NFLOPs = first(events["FLOPS_DP"])["RETIRED_SSE_AVX_FLOPS_ALL"]
    else
        # NOTE: `graph` is not a parameter; this reads the global of that name
        NFLOPs = GraphProperties(graph).computeEffort
    end

    nInputs = 10_000_000  # ten million
    println(io, "Generating $nInputs inputs with $(Threads.nthreads()) threads...")

    inputs = Vector{typeof(gen_process_input(process))}(undef, nInputs)

    # Split the index range into one contiguous chunk per thread and fill every chunk in
    # its own task. Each task works on its own copy of `process`, so no process object
    # is shared between concurrently running tasks. (The previous implementation
    # created one copy per thread but always indexed the last one.)
    chunk_size = cld(nInputs, Threads.nthreads())
    @sync for chunk in Iterators.partition(eachindex(inputs), chunk_size)
        local_process = copy(process)
        Threads.@spawn for i in chunk
            inputs[i] = gen_process_input(local_process)
        end
    end

    println(io, "Benchmarking CPU with $(Threads.nthreads()) threads...")
    (time_cpu, rate_cpu) = cpu_bench(func, inputs)
    # giga is the decimal prefix 10^9 (not 2^30)
    flops_cpu = (rate_cpu * NFLOPs) / 1e9

    println(io, "Benchmarking GPU...")
    cuInputs = CuArray(inputs)
    (time_gpu, rate_gpu) = gpu_bench(func, cuInputs)
    flops_gpu = (rate_gpu * NFLOPs) / 1e9

    println(io, "\nBenchmark Summary for $(process):")

    if use_likwid
        println(io, "Measured FLOPS by LIKWID: $NFLOPs")
    else
        println(io, "Total graph compute effort: $NFLOPs")
    end
    println(io, "Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
    println(io, "CPU, $(Threads.nthreads()) threads")
    println(io, "  Time:  $time_cpu")
    println(io, "  Rate:  $rate_cpu")
    println(io, "  GFLOPS: $flops_cpu")
    println(io, "GPU, $(name(first(CUDA.devices())))")
    println(io, "  Time:  $time_gpu")
    println(io, "  Rate:  $rate_gpu")
    println(io, "  GFLOPS: $flops_gpu")
    return nothing
end

# use "mock" machine that only uses cpu
machine = let
    # the single device of the mock machine
    cpu_node = MetagraphOptimization.NumaNode(
        0,
        1,
        MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
        -1.0,
        UUIDs.uuid1(),
    )

    # one device, hence a 1x1 matrix as the second argument
    Machine([cpu_node], [-1.0;;])
end

# optimizer applied to every graph below via `optimize_to_fixpoint!`
optimizer = ReductionOptimizer()

# The benchmark runs below are deliberately written out as top-level statements instead
# of being wrapped in a function or loop: the world age must increase after a compute
# function has been created before it can be called, and that only happens at global
# scope.
#
# Every run follows the same steps: build the process description, obtain its graph,
# optimize the graph to a fixpoint, create the compute function and benchmark it.

# Compton process (ke->ke)
process = parse_process("ke->ke", QEDModel())
graph = gen_graph(process)
optimize_to_fixpoint!(optimizer, graph)
compute_func = get_compute_function(graph, process, machine)
bench_process(process, compute_func)

# 2-photon Compton process (ke->kke)
process = parse_process("ke->kke", QEDModel())
graph = gen_graph(process)
optimize_to_fixpoint!(optimizer, graph)
compute_func = get_compute_function(graph, process, machine)
bench_process(process, compute_func)

# 3-photon Compton process (ke->kkke)
process = parse_process("ke->kkke", QEDModel())
graph = gen_graph(process)
optimize_to_fixpoint!(optimizer, graph)
compute_func = get_compute_function(graph, process, machine)
bench_process(process, compute_func)

# AB->AB (ABC model); the graph is parsed from a file instead of being generated
process = parse_process("AB->AB", ABCModel())
graph = parse_dag("input/AB->AB.txt", ABCModel())
optimize_to_fixpoint!(optimizer, graph)
compute_func = get_compute_function(graph, process, machine)
bench_process(process, compute_func)

# AB->AB^3 (ABC model); the graph is parsed from a file instead of being generated
process = parse_process("AB->ABBB", ABCModel())
graph = parse_dag("input/AB->ABBB.txt", ABCModel())
optimize_to_fixpoint!(optimizer, graph)
compute_func = get_compute_function(graph, process, machine)
bench_process(process, compute_func)

# The script terminates here; everything after this call is never executed.
# NOTE(review): presumably this is how the remaining runs were switched off; confirm
# before removing either the `exit` or the code after it.
exit(0)

# 4-photon compton
process = parse_process("ke->kkkke", QEDModel())
graph = gen_graph(process)
optimize_to_fixpoint!(optimizer, graph)
compute_func = get_compute_function(graph, process, machine)
bench_process(process, compute_func)

# AB->AB^5
process = parse_process("AB->ABBBBB", ABCModel())
graph = parse_dag("input/AB->ABBBBB.txt", ABCModel())
optimize_to_fixpoint!(optimizer, graph)
compute_func = get_compute_function(graph, process, machine)
bench_process(process, compute_func)