using MetagraphOptimization
using LIKWID
using CUDA
using UUIDs

# Return at most the first ten elements of `inputs`; used to trigger compilation
# before timing. Clamped so that short input collections do not throw.
_warmup_slice(inputs) = inputs[begin:min(end, begin + 9)]

"""
    cpu_bench(compute_function, inputs)

Time `compute_function` over every element of `inputs` using `Threads.@threads`.

A small leading slice of `inputs` is evaluated first so that compilation is not part
of the measurement. Returns the tuple `(time, rate)` where `time` is the elapsed
wall-clock time in seconds and `rate` is `length(inputs) / time`.
"""
function cpu_bench(compute_function, inputs)
    compute_function.(_warmup_slice(inputs)) # make sure it's compiled

    elapsed = @elapsed Threads.@threads for i in eachindex(inputs)
        @invokelatest compute_function(inputs[i])
    end
    rate = length(inputs) / elapsed

    return (elapsed, rate)
end

"""
    gpu_bench(compute_function, inputs)

Time a broadcast of `compute_function` over `inputs`, wrapped in `CUDA.@sync`.

A small leading slice of `inputs` is evaluated first so that compilation is not part
of the measurement. Returns the tuple `(time, rate)` where `time` is the elapsed
wall-clock time in seconds and `rate` is `length(inputs) / time`.
"""
function gpu_bench(compute_function, inputs)
    CUDA.@sync compute_function.(_warmup_slice(inputs)) # make sure it's compiled

    elapsed = @elapsed CUDA.@sync compute_function.(inputs)
    rate = length(inputs) / elapsed

    return (elapsed, rate)
end

# Generate `n_inputs` inputs for `process` in parallel. The index range is split into
# at most `Threads.nthreads()` contiguous chunks; every chunk is filled by its own
# task, and each task works on its own `copy(process)` so that no process object is
# shared between concurrently running tasks.
function _generate_inputs(process, n_inputs::Integer)
    InputT = typeof(gen_process_input(process))
    inputs = Vector{InputT}(undef, n_inputs)
    n_inputs == 0 && return inputs

    chunk_len = cld(n_inputs, Threads.nthreads())
    @sync for chunk in Iterators.partition(eachindex(inputs), chunk_len)
        chunk_process = copy(process)
        Threads.@spawn begin
            for i in chunk
                inputs[i] = gen_process_input(chunk_process)
            end
        end
    end

    return inputs
end

"""
    bench_process(process, func, io::IO = stdout; use_likwid = true, n_inputs = 10_000_000)

Benchmark the compute function `func` of `process` on the CPU and on the GPU and
print a summary to `io`.

# Arguments
- `process`: process description; inputs are generated from it with
  `gen_process_input`.
- `func`: the compute function to benchmark; called with one generated input.
- `io`: stream that all progress and summary output is written to.

# Keywords
- `use_likwid`: if `true`, the FLOP count of one call to `func` is measured through
  LIKWID's `FLOPS_DP` group. If `false`, the compute effort of the **global**
  variable `graph` is used instead, so `graph` must be defined and match `func`.
- `n_inputs`: number of inputs to generate and benchmark.

Returns `nothing`.
"""
function bench_process(
    process::MetagraphOptimization.AbstractProcessDescription,
    func,
    io::IO = stdout;
    use_likwid = true,
    n_inputs = 10_000_000,
)
    println(io, "\n--- Benchmarking $(process) ---")

    if use_likwid
        input = gen_process_input(process)
        func(input) # compile first
        _, events = @perfmon "FLOPS_DP" func(input)
        # NOTE(review): this event name looks CPU-vendor specific; confirm it exists
        # on the machine the benchmark is run on.
        n_flops = first(events["FLOPS_DP"])["RETIRED_SSE_AVX_FLOPS_ALL"]
    else
        # Reads the global `graph` that the top-level script assigns right before
        # calling this function.
        n_flops = GraphProperties(graph).computeEffort
    end

    println(io, "Generating $n_inputs inputs with $(Threads.nthreads()) threads...")
    inputs = _generate_inputs(process, n_inputs)

    println(io, "Benchmarking CPU with $(Threads.nthreads()) threads...")
    (time_cpu, rate_cpu) = cpu_bench(func, inputs)
    # GFLOPS is a decimal unit: 1 GFLOPS = 10^9 floating point operations per second.
    flops_cpu = (rate_cpu * n_flops) / 1e9

    println(io, "Benchmarking GPU...")
    cuInputs = CuArray(inputs)
    (time_gpu, rate_gpu) = gpu_bench(func, cuInputs)
    flops_gpu = (rate_gpu * n_flops) / 1e9

    println(io, "\nBenchmark Summary for $(process):")

    if use_likwid
        println(io, "Measured FLOPS by LIKWID: $n_flops")
    else
        println(io, "Total graph compute effort: $n_flops")
    end

    println(io, "Total input size: $(bytes_to_human_readable(Base.summarysize(inputs)))")
    println(io, "CPU, $(Threads.nthreads()) threads")
    println(io, " Time: $time_cpu")
    println(io, " Rate: $rate_cpu")
    println(io, " GFLOPS: $flops_cpu")
    println(io, "GPU, $(name(first(CUDA.devices())))")
    println(io, " Time: $time_gpu")
    println(io, " Rate: $rate_gpu")
    println(io, " GFLOPS: $flops_gpu")

    return nothing
end

# use "mock" machine that only uses cpu
machine = Machine(
    [
        MetagraphOptimization.NumaNode(
            0,
            1,
            MetagraphOptimization.default_strategy(MetagraphOptimization.NumaNode),
            -1.0,
            UUIDs.uuid1(),
        ),
    ],
    [-1.0;;],
)
optimizer = ReductionOptimizer()

# Sadly, these cannot be put into functions: the world age must increase after a
# compute function is created, and that only happens at global scope. Each benchmark
# is therefore a separate sequence of top-level statements.

# compton
process = parse_process("ke->ke", QEDModel())
graph = gen_graph(process)
optimize_to_fixpoint!(optimizer, graph)
compute_func = get_compute_function(graph, process, machine)
bench_process(process, compute_func)

# 2-photon compton
process = parse_process("ke->kke", QEDModel())
graph = gen_graph(process)
optimize_to_fixpoint!(optimizer, graph)
compute_func = get_compute_function(graph, process, machine)
bench_process(process, compute_func)

# 3-photon compton
process = parse_process("ke->kkke", QEDModel())
graph = gen_graph(process)
optimize_to_fixpoint!(optimizer, graph)
compute_func = get_compute_function(graph, process, machine)
bench_process(process, compute_func)

# AB->AB
process = parse_process("AB->AB", ABCModel())
graph = parse_dag("input/AB->AB.txt", ABCModel())
optimize_to_fixpoint!(optimizer, graph)
compute_func = get_compute_function(graph, process, machine)
bench_process(process, compute_func)

# AB->AB^3
process = parse_process("AB->ABBB", ABCModel())
graph = parse_dag("input/AB->ABBB.txt", ABCModel())
optimize_to_fixpoint!(optimizer, graph)
compute_func = get_compute_function(graph, process, machine)
bench_process(process, compute_func)

# The script stops here; the benchmarks below are never executed.
# NOTE(review): presumably they are skipped on purpose (larger processes) — confirm,
# and remove this `exit(0)` to enable them.
exit(0)

# 4-photon compton
process = parse_process("ke->kkkke", QEDModel())
graph = gen_graph(process)
optimize_to_fixpoint!(optimizer, graph)
compute_func = get_compute_function(graph, process, machine)
bench_process(process, compute_func)

# AB->AB^5
process = parse_process("AB->ABBBBB", ABCModel())
graph = parse_dag("input/AB->ABBBBB.txt", ABCModel())
optimize_to_fixpoint!(optimizer, graph)
compute_func = get_compute_function(graph, process, machine)
bench_process(process, compute_func)