using MetagraphOptimization
using LIKWID
using CUDA
using UUIDs
using DataFrames
using CSV
using Random
using BenchmarkTools
using Dates

# Global switch: when `true`, GPU benchmarks are skipped even if `use_gpu = true` is passed.
DISABLE_GPU = false

"""
    log(x...)

Print the current timestamp followed by the arguments joined with single spaces, then
flush `stdout` so the line shows up immediately in redirected job output.
"""
function log(x...)
    println(now(), " ", join(x, " "))
    return flush(stdout)
end

# One result file per thread count, so runs with different `-t` settings do not mix.
results_filename = "bench_results_$(Threads.nthreads()).csv"

df = DataFrame(
    process_name = String[],
    graph_gen_time = Float64[],
    optimization_time = Float64[],
    function_generation_time = Float64[],
    graph_nodes = Int[],
    graph_edges = Int[],
    graph_mem = Float64[],
    cpu_threads = Int[],
    n_inputs = Int[],
    nflops_likwid = Int[],
    cpu_time = Float64[],
    cpu_std = Float64[],
    cpu_rate = Float64[],
    cpu_gflops = Float64[],
    gpu_name = String[],
    gpu_time = Float64[],
    gpu_std = Float64[],
    gpu_rate = Float64[],
    gpu_gflops = Float64[],
)

# If results from an earlier run exist, load them so new rows are appended to them.
# `stringtype = String` keeps the text columns as plain `String`; the default inline
# string types have a fixed maximum width and reject longer names pushed later.
if isfile(results_filename)
    df = CSV.read(results_filename, DataFrame; stringtype = String)
end

# Number of inputs generated and evaluated per benchmark.
nInputs = 2^20

"""
    cpu_bench(compute_function, inputs)

Benchmark `compute_function` applied to every element of `inputs` in a multi-threaded
loop (20 samples, one evaluation each).

Returns the tuple `(time, rate, s)`: the median wall time of one pass over `inputs` in
seconds, the resulting number of inputs processed per second, and the standard deviation
of the sample times in seconds.
"""
function cpu_bench(compute_function, inputs)
    # `@invokelatest` is needed because the compute function is generated at runtime and
    # may be newer than the world age this function was compiled in.
    bench = @benchmark begin
        @inbounds Threads.@threads for i in eachindex($inputs)
            @invokelatest $compute_function($inputs[i])
        end
    end gcsample = true samples = 20 evals = 1

    # BenchmarkTools reports times in nanoseconds
    time = median(bench.times) / 1e9
    s = std(bench.times) / 1e9
    rate = length(inputs) / time

    return (time, rate, s)
end

"""
    gpu_bench(kernel!, inputs)

Benchmark the CUDA kernel `kernel!` on the device array `inputs` (20 samples, one
evaluation each). The kernel is launched as `kernel!(inputs, outputs, n)` with 32 threads
per block, where `outputs` is a freshly allocated `CuVector{ComplexF64}` of length `n`.

Returns `(time, rate, s)` with the same meaning as in [`cpu_bench`](@ref).
"""
function gpu_bench(kernel!, inputs)
    n = length(inputs)

    outputs = CuVector{ComplexF64}(undef, n)

    ts = 32
    # NOTE: throws an `InexactError` unless `n` is a multiple of `ts`
    bs = Int(n / ts)
    bench = @benchmark begin
        @cuda threads = $ts blocks = $bs always_inline = true $kernel!($inputs, $outputs, $n)
        CUDA.device_synchronize()
    end gcsample = true samples = 20 evals = 1

    # BenchmarkTools reports times in nanoseconds
    time = median(bench.times) / 1e9
    s = std(bench.times) / 1e9
    rate = length(inputs) / time

    return (time, rate, s)
end

"""
    bench_process(process, process_name, graph, func, kernel!, gen_time, opt_time, func_time; use_likwid = false, use_gpu = true)

Benchmark the generated CPU function `func` and, optionally, the CUDA kernel `kernel!`
for `process`, log a summary, and append one row to the global results table `df`
(unless `process_name == "warmup"`).

# Arguments
- `process`: process description used to generate the random inputs.
- `process_name`: label stored in the `process_name` column of the results.
- `graph`: the graph the functions were generated from; only its properties are recorded.
- `func`: CPU compute function, called with a single generated input.
- `kernel!`: CUDA kernel, see [`gpu_bench`](@ref) for how it is launched.
- `gen_time`, `opt_time`, `func_time`: previously measured timings, stored as-is.

# Keywords
- `use_likwid`: measure the FLOP count of one call of `func` with LIKWID instead of using
  the compute effort reported by the graph properties.
- `use_gpu`: run the GPU benchmark; overridden by the global `DISABLE_GPU`.
"""
function bench_process(
    process::MetagraphOptimization.AbstractProcessDescription,
    process_name::String,
    graph::DAG,
    func,
    kernel!,
    gen_time::Float64,
    opt_time::Float64,
    func_time::Float64;
    use_likwid = false,
    use_gpu = true,
)
    log("\n--- Benchmarking $(process_name) ---")
    if DISABLE_GPU
        use_gpu = false
    end

    graph_props = GraphProperties(graph)
    NFLOPs = graph_props.computeEffort
    nflops_likwid = 0
    if use_likwid
        input = gen_process_input(process)
        func(input) # compile first

        # silence the output printed to the console during the measurement and make sure
        # the original stdout is restored even if the measurement throws
        oldstd = stdout
        redirect_stdout(devnull)
        local events
        try
            _, events = @perfmon "FLOPS_DP" func(input)
        finally
            redirect_stdout(oldstd)
        end

        NFLOPs = first(events["FLOPS_DP"])["RETIRED_SSE_AVX_FLOPS_ALL"]
        nflops_likwid = NFLOPs
    end

    log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")

    inputs = Vector{typeof(gen_process_input(process))}(undef, nInputs)

    # Split the index range into one contiguous chunk per thread and give every task its
    # own copy of the process description, so no copy is shared between tasks. This does
    # not rely on `Threads.threadid()`, which is not stable under task migration.
    chunk_size = max(1, cld(length(inputs), Threads.nthreads()))
    @sync for chunk in Iterators.partition(eachindex(inputs), chunk_size)
        local_process = copy(process)
        Threads.@spawn for i in chunk
            inputs[i] = gen_process_input(local_process)
        end
    end

    log("Benchmarking CPU with $(Threads.nthreads()) threads...")
    (time_cpu, rate_cpu, std_cpu) = cpu_bench(func, inputs)
    flops_cpu = (rate_cpu * NFLOPs) / 10^9

    # defaults written to the results when the GPU benchmark is skipped
    time_gpu = 0.0
    std_gpu = 0.0
    rate_gpu = 0.0
    flops_gpu = 0.0
    gpu_name = "none"
    if use_gpu
        log("Benchmarking GPU...")
        gpu_name = "$(name(first(CUDA.devices())))"
        cuInputs = CuArray(inputs)
        (time_gpu, rate_gpu, std_gpu) = gpu_bench(kernel!, cuInputs)
        flops_gpu = (rate_gpu * NFLOPs) / 10^9
    else
        log("Skipping GPU...")
    end

    log("\nBenchmark Summary for $(process):")

    if use_likwid
        log("Measured FLOPS by LIKWID: $NFLOPs")
    else
        log("Total graph compute effort: $NFLOPs")
    end
    log("Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
    log("CPU, $(Threads.nthreads()) threads")
    log(" Time: $time_cpu")
    log(" Rate: $rate_cpu")
    log(" GFLOPS: $flops_cpu")
    if use_gpu
        log("GPU, $gpu_name")
        log(" Time: $time_gpu")
        log(" Rate: $rate_gpu")
        log(" GFLOPS: $flops_gpu")
    end

    # warmup runs only exist to trigger compilation and are not recorded
    if (process_name != "warmup")
        push!(
            df,
            Dict(
                :process_name => process_name,
                :graph_gen_time => gen_time,
                :optimization_time => opt_time,
                :function_generation_time => func_time,
                :graph_nodes => graph_props.noNodes,
                :graph_edges => graph_props.noEdges,
                :graph_mem => MetagraphOptimization.mem(graph),
                :cpu_threads => Threads.nthreads(),
                :n_inputs => nInputs,
                :nflops_likwid => nflops_likwid,
                :cpu_time => time_cpu,
                :cpu_std => std_cpu,
                :cpu_rate => rate_cpu,
                :cpu_gflops => flops_cpu,
                :gpu_name => gpu_name,
                :gpu_time => time_gpu,
                :gpu_std => std_gpu,
                :gpu_rate => rate_gpu,
                :gpu_gflops => flops_gpu,
            ),
        )
    end

    return nothing
end

# use a "mock" machine that only uses the CPU
machine = Machine(
    [
        MetagraphOptimization.NumaNode(
            0,
            1,
            MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
            -1.0,
            UUIDs.uuid1(),
        ),
    ],
    [-1.0;;],
)

# The benchmark setup below cannot be moved into functions: the world age must increase
# after a compute function is created, which only happens in the global scope.

## -- WARMUP TO COMPILE FUNCTIONS first
#=
optimizer = RandomWalkOptimizer(MersenneTwister(0))

# 2-photon compton
process = parse_process("ke->kke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
opt_time = @elapsed optimize!(optimizer, graph, 200)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)

optimizer = ReductionOptimizer()

# AB->AB^3
process = parse_process("AB->ABBB", ABCModel())
gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
=#
## -- WARMUP END

optimizer = ReductionOptimizer()

# QED compton processes: benchmark each graph unoptimized first, then fully reduced
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]

for process_str in processes
    # compton
    process = parse_process(process_str, QEDModel())
    gen_time = @elapsed graph = gen_graph(process)
    func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
    kernel! = get_cuda_kernel(graph, process, machine)
    bench_process(
        process,
        "$process not optimized",
        graph,
        compute_func,
        kernel!,
        gen_time,
        0.0,
        func_gen_time,
    )

    opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
    func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
    kernel! = get_cuda_kernel(graph, process, machine)
    bench_process(
        process,
        "$process reduced",
        graph,
        compute_func,
        kernel!,
        gen_time,
        opt_time,
        func_gen_time,
    )

    # write after every process so partial results survive an aborted run
    CSV.write(results_filename, df)
end

# ABC model processes: graphs are parsed from the files in `input/`
processes = ["AB->AB", "AB->ABBB", "AB->ABBBBB", "AB->ABBBBBBB"]

for process_str in processes
    # AB->AB
    process = parse_process(process_str, ABCModel())
    gen_time = @elapsed graph = parse_dag("input/$(process_str).txt", ABCModel())
    func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
    kernel! = get_cuda_kernel(graph, process, machine)
    bench_process(
        process,
        "$process not optimized",
        graph,
        compute_func,
        kernel!,
        gen_time,
        0.0,
        func_gen_time,
    )

    opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
    func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
    kernel! = get_cuda_kernel(graph, process, machine)
    bench_process(
        process,
        "$process reduced",
        graph,
        compute_func,
        kernel!,
        gen_time,
        opt_time,
        func_gen_time,
    )

    # write after every process so partial results survive an aborted run
    CSV.write(results_filename, df)
end