using MetagraphOptimization
using LIKWID
using CUDA
using UUIDs
using DataFrames
using CSV
using Random
using BenchmarkTools
using Dates

# Global switch: when set to `true`, `bench_process` skips the GPU benchmark
# regardless of the value of its `use_gpu` keyword argument.
DISABLE_GPU = false

"""
    log(x...)

Print the current timestamp followed by all arguments, separated by single
spaces, as one line on `stdout`, then flush `stdout` so the line shows up
immediately. Returns the result of `flush(stdout)`.

Note that this definition is a script-local function named `log`; it is not a
method of `Base.log`.
"""
function log(x...)
    # `join` already yields one complete string; print it as a whole rather
    # than splatting it into one `println` argument per character.
    println(now(), " ", join(x, " "))
    return flush(stdout)
end

# One results file per thread count, so runs with different thread counts do
# not overwrite each other.
results_filename = "bench_results_$(Threads.nthreads()).csv"

# Empty results table. Its columns define the schema of the results file; one
# row is appended per benchmarked process by `bench_process`.
df = DataFrame(
    process_name = String[],
    graph_gen_time = Float64[],
    optimization_time = Float64[],
    function_generation_time = Float64[],
    graph_nodes = Int[],
    graph_edges = Int[],
    graph_mem = Float64[],
    cpu_threads = Int[],
    n_inputs = Int[],
    nflops_likwid = Int[],
    cpu_time = Float64[],
    cpu_std = Float64[],
    cpu_rate = Float64[],
    cpu_gflops = Float64[],
    gpu_name = String[],
    gpu_time = Float64[],
    gpu_std = Float64[],
    gpu_rate = Float64[],
    gpu_gflops = Float64[],
)

# If results from an earlier run exist, load them so that new rows are appended.
# The column types are pinned to the schema declared above. Without this, CSV
# re-infers them from the file contents: short string columns come back as
# fixed-width inline strings (so pushing a longer name later fails), and a
# header-only file loses its column types altogether.
if isfile(results_filename)
    df = CSV.read(
        results_filename,
        DataFrame;
        types = Dict(zip(propertynames(df), eltype.(eachcol(df)))),
    )
end

# Number of inputs generated and evaluated per benchmark.
nInputs = 2^20

"""
    cpu_bench(compute_function, inputs)

Benchmark one multi-threaded sweep of `compute_function` over every element of
`inputs`, using 20 samples of a single evaluation each.

Returns the tuple `(time, rate, std)`: the median sample time in seconds, the
number of inputs processed per second at that median time, and the standard
deviation of the sample times in seconds.
"""
function cpu_bench(compute_function, inputs)
    trial = @benchmark begin
        @inbounds Threads.@threads for idx in eachindex($inputs)
            # called through `@invokelatest` because the function may have
            # been generated after the caller's world age was fixed
            @invokelatest $compute_function($inputs[idx])
        end
    end gcsample = true samples = 20 evals = 1

    # BenchmarkTools records the sample times in nanoseconds
    sample_times = trial.times
    median_seconds = median(sample_times) / 1e9
    spread_seconds = std(sample_times) / 1e9
    throughput = length(inputs) / median_seconds

    return (median_seconds, throughput, spread_seconds)
end

"""
    gpu_bench(kernel!, inputs)

Benchmark the CUDA kernel `kernel!` on the device array `inputs`, using 20
samples of a single launch each. Every sample launches the kernel with 32
threads per block and waits for the device to finish.

The kernel is called as `kernel!(inputs, outputs, n)`, where `outputs` is a
`CuVector{ComplexF64}` of the same length `n` as `inputs`.

Returns the tuple `(time, rate, std)`: the median sample time in seconds, the
number of inputs processed per second at that median time, and the standard
deviation of the sample times in seconds.
"""
function gpu_bench(kernel!, inputs)
    n = length(inputs)
    # allocate the output buffer at its final size right away
    outputs = CuVector{ComplexF64}(undef, n)
    ts = 32
    # Round the block count up with integer arithmetic so that input lengths
    # that are not a multiple of the block size are covered as well (the
    # previous `Int(n / ts)` threw an `InexactError` for them).
    # NOTE(review): this relies on the kernel using its `n` argument to mask
    # off the surplus threads of the last block -- confirm against the kernel
    # generator.
    bs = cld(n, ts)
    bench = @benchmark begin
        @cuda threads = $ts blocks = $bs always_inline = true $kernel!($inputs, $outputs, $n)
        CUDA.device_synchronize()
    end gcsample = true samples = 20 evals = 1

    # BenchmarkTools records the sample times in nanoseconds
    time = median(bench.times) / 1e9
    s = std(bench.times) / 1e9
    rate = n / time

    return (time, rate, s)
end

"""
    bench_process(process, process_name, graph, func, kernel!, gen_time, opt_time, func_time;
                  use_likwid = false, use_gpu = true)

Benchmark the generated code for `process` on the CPU and, optionally, on the
GPU, log a summary, and append one row to the global results table `df`.

# Arguments
- `process`: process description; random inputs are generated from it.
- `process_name`: label stored in the results table. The label `"warmup"`
  suppresses the table row.
- `graph`: the graph the code was generated from; its node/edge counts, compute
  effort and memory footprint are recorded.
- `func`: compute function taking a single input, benchmarked with `cpu_bench`.
- `kernel!`: CUDA kernel, benchmarked with `gpu_bench`.
- `gen_time`, `opt_time`, `func_time`: previously measured timings that are
  stored in the results unchanged.

# Keywords
- `use_likwid`: if `true`, measure the FLOPs of a single call of `func` with
  LIKWID's `FLOPS_DP` group and use that count instead of the graph's compute
  effort for the GFLOPS figures.
- `use_gpu`: if `true`, also run the GPU benchmark. The global `DISABLE_GPU`
  overrides this.

The number of benchmarked inputs is taken from the global `nInputs`.
Returns `nothing`.
"""
function bench_process(
    process::MetagraphOptimization.AbstractProcessDescription,
    process_name::String,
    graph::DAG,
    func,
    kernel!,
    gen_time::Float64,
    opt_time::Float64,
    func_time::Float64;
    use_likwid = false,
    use_gpu = true,
)
    log("\n--- Benchmarking $(process_name) ---")
    if DISABLE_GPU
        use_gpu = false
    end

    graph_props = GraphProperties(graph)
    NFLOPs = graph_props.computeEffort
    nflops_likwid = 0
    if use_likwid
        input = gen_process_input(process)
        func(input) # compile first

        # Get rid of annoying output to the console. The do-block form restores
        # the original stdout even if the measurement throws.
        _, events = redirect_stdout(devnull) do
            @perfmon "FLOPS_DP" func(input)
        end

        NFLOPs = first(events["FLOPS_DP"])["RETIRED_SSE_AVX_FLOPS_ALL"]
        nflops_likwid = NFLOPs
    end

    log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")

    inputs = Vector{typeof(gen_process_input(process))}(undef, nInputs)

    # Split the index range into at most `nthreads()` contiguous chunks and
    # hand each chunk, together with a private copy of the process, to its own
    # task. The previous code indexed the per-thread copies with the constant
    # `Threads.nthreads()`, so every thread shared the same (last) copy.
    chunk_len = max(1, cld(nInputs, Threads.nthreads()))
    chunks = collect(Iterators.partition(eachindex(inputs), chunk_len))
    process_copies = [copy(process) for _ in chunks]
    @sync for (chunk, local_process) in zip(chunks, process_copies)
        Threads.@spawn for i in chunk
            inputs[i] = gen_process_input(local_process)
        end
    end

    log("Benchmarking CPU with $(Threads.nthreads()) threads...")
    (time_cpu, rate_cpu, std_cpu) = cpu_bench(func, inputs)
    flops_cpu = (rate_cpu * NFLOPs) / 10^9

    # defaults that are recorded when the GPU benchmark is skipped
    time_gpu = 0.0
    std_gpu = 0.0
    rate_gpu = 0.0
    flops_gpu = 0.0
    gpu_name = "none"
    if use_gpu
        log("Benchmarking GPU...")
        gpu_name = "$(name(first(CUDA.devices())))"
        cuInputs = CuArray(inputs)
        (time_gpu, rate_gpu, std_gpu) = gpu_bench(kernel!, cuInputs)
        flops_gpu = (rate_gpu * NFLOPs) / 10^9
    else
        log("Skipping GPU...")
    end

    log("\nBenchmark Summary for $(process):")

    if use_likwid
        log("Measured FLOPS by LIKWID: $NFLOPs")
    else
        log("Total graph compute effort: $NFLOPs")
    end
    log("Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
    log("CPU, $(Threads.nthreads()) threads")
    log("  Time:  $time_cpu")
    log("  Rate:  $rate_cpu")
    log("  GFLOPS: $flops_cpu")
    if use_gpu
        log("GPU, $gpu_name")
        log("  Time:  $time_gpu")
        log("  Rate:  $rate_gpu")
        log("  GFLOPS: $flops_gpu")
    end

    # warmup runs only trigger compilation and are not recorded
    if process_name != "warmup"
        push!(
            df,
            Dict(
                :process_name => process_name,
                :graph_gen_time => gen_time,
                :optimization_time => opt_time,
                :function_generation_time => func_time,
                :graph_nodes => graph_props.noNodes,
                :graph_edges => graph_props.noEdges,
                :graph_mem => MetagraphOptimization.mem(graph),
                :cpu_threads => Threads.nthreads(),
                :n_inputs => nInputs,
                :nflops_likwid => nflops_likwid,
                :cpu_time => time_cpu,
                :cpu_std => std_cpu,
                :cpu_rate => rate_cpu,
                :cpu_gflops => flops_cpu,
                :gpu_name => gpu_name,
                :gpu_time => time_gpu,
                :gpu_std => std_gpu,
                :gpu_rate => rate_gpu,
                :gpu_gflops => flops_gpu,
            ),
        )
    end

    return nothing
end

# "Mock" machine description that only contains a CPU: a single `NumaNode`
# using the default strategy for NUMA nodes, plus a 1x1 matrix holding -1.0.
# NOTE(review): the meaning of the remaining positional arguments (0, 1, -1.0)
# and of the matrix is defined by MetagraphOptimization -- see its `NumaNode`
# and `Machine` constructors.
machine = let
    numa_node = MetagraphOptimization.NumaNode(
        0,
        1,
        MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
        -1.0,
        UUIDs.uuid1(),
    )
    Machine([numa_node], fill(-1.0, 1, 1))
end

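# Unfortunately, the benchmark runs below cannot be wrapped in functions: the compute functions are
# generated at runtime, and the world age must advance after a function is created before it can be
# called, which only happens at global scope.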

## -- WARMUP TO COMPILE FUNCTIONS first
#=
optimizer = RandomWalkOptimizer(MersenneTwister(0))

# 2-photon compton
process = parse_process("ke->kke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
opt_time = @elapsed optimize!(optimizer, graph, 200)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)

optimizer = ReductionOptimizer()

# AB->AB^3
process = parse_process("AB->ABBB", ABCModel())
gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
=#
## -- WARMUP END

# The same optimizer is used for every process below.
optimizer = ReductionOptimizer()

# --- Compton-type processes: the graph is generated from the process description ---
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]

for process_str in processes
    proc = parse_process(process_str, QEDModel())

    # baseline: generate code from the graph as generated
    t_gen = @elapsed dag = gen_graph(proc)
    t_codegen = @elapsed cpu_func = get_compute_function(dag, proc, machine)
    gpu_kernel! = get_cuda_kernel(dag, proc, machine)
    bench_process(proc, string(proc, " not optimized"), dag, cpu_func, gpu_kernel!, t_gen, 0.0, t_codegen)

    # optimize the same graph in place, regenerate the code and benchmark again
    t_opt = @elapsed optimize_to_fixpoint!(optimizer, dag)
    t_codegen = @elapsed cpu_func = get_compute_function(dag, proc, machine)
    gpu_kernel! = get_cuda_kernel(dag, proc, machine)
    bench_process(proc, string(proc, " reduced"), dag, cpu_func, gpu_kernel!, t_gen, t_opt, t_codegen)

    # persist after every process so that partial results survive an aborted run
    CSV.write(results_filename, df)
end

# --- ABC-model processes: the graph is parsed from a file in `input/` ---
processes = ["AB->AB", "AB->ABBB", "AB->ABBBBB", "AB->ABBBBBBB"]

for process_str in processes
    proc = parse_process(process_str, ABCModel())

    # baseline: generate code from the graph as parsed
    t_gen = @elapsed dag = parse_dag(string("input/", process_str, ".txt"), ABCModel())
    t_codegen = @elapsed cpu_func = get_compute_function(dag, proc, machine)
    gpu_kernel! = get_cuda_kernel(dag, proc, machine)
    bench_process(proc, string(proc, " not optimized"), dag, cpu_func, gpu_kernel!, t_gen, 0.0, t_codegen)

    # optimize the same graph in place, regenerate the code and benchmark again
    t_opt = @elapsed optimize_to_fixpoint!(optimizer, dag)
    t_codegen = @elapsed cpu_func = get_compute_function(dag, proc, machine)
    gpu_kernel! = get_cuda_kernel(dag, proc, machine)
    bench_process(proc, string(proc, " reduced"), dag, cpu_func, gpu_kernel!, t_gen, t_opt, t_codegen)

    # persist after every process so that partial results survive an aborted run
    CSV.write(results_filename, df)
end