From d036f218620daacc5b8f4c8d8a37b6e65823771e Mon Sep 17 00:00:00 2001
From: Anton Reinhard <anton.reinhard@proton.me>
Date: Thu, 7 Mar 2024 00:08:04 +0100
Subject: [PATCH] Use CUDA kernels in bench scripts

---
 data/bench_results_32.csv                 |   9 ++
 data/evaluate_cpu_gpu_exec.jl             | 143 ++++++++++++++++++++++
 examples/full_node_bench.jl               |  29 +++--
 examples/qed_bench.jl                     | 137 +++++++--------
 examples/qed_bench_reduction_steps_gpu.jl |   1 +
 experiments/full_node.sh                  |   9 +-
 experiments/run_gen_diagram.sh            |   3 +-
 experiments/run_qed_exec.sh               |  11 +-
 experiments/run_reduce_bench.sh           |   9 +-
 experiments/run_reduce_bench_gpu.sh       |   7 +-
 src/MetagraphOptimization.jl              |   2 +-
 11 files changed, 236 insertions(+), 124 deletions(-)
 create mode 100644 data/bench_results_32.csv
 create mode 100644 data/evaluate_cpu_gpu_exec.jl

diff --git a/data/bench_results_32.csv b/data/bench_results_32.csv
new file mode 100644
index 0000000..51454f2
--- /dev/null
+++ b/data/bench_results_32.csv
@@ -0,0 +1,9 @@
+process_name,graph_gen_time,optimization_time,function_generation_time,graph_nodes,graph_edges,graph_mem,cpu_threads,n_inputs,nflops_likwid,cpu_time,cpu_std,cpu_rate,cpu_gflops,gpu_name,gpu_time,gpu_std,gpu_rate,gpu_gflops
+QED Process: 'ke->ke' reduced,0.004851193,0.001290078,0.006093401,26,29,6948.0,32,10000000,0,0.2810178885,0.00909457898005121,3.5584923270818755e7,1.886000933353394,NVIDIA A100-SXM4-80GB,0.4060797745,0.0013320688448668838,2.462570319418851e7,1.305162269291991
+QED Process: 'ke->kke' reduced,0.001065397,0.010432606,0.014287271,59,77,16383.0,32,10000000,0,0.823029796,0.01692859562197734,1.2150228398292398e7,2.223491796887509,NVIDIA A100-SXM4-80GB,2.3333098275,0.0014037935241043983,4.285757460128814e6,0.784293615203573
+QED Process: 'ke->kkke' reduced,0.001348518,0.005210738,0.034243651,188,273,54426.0,32,10000000,0,2.9432864705,0.031053960614444084,3.397562588700793e6,2.497208502695083,NVIDIA A100-SXM4-80GB,10.340032588,0.0028660606476431714,967114.9403924877,0.7108294811884784
+QED Process: 'ke->kkkke' reduced,0.004413783,0.039469525,0.15704043,853,1295,243781.0,32,10000000,0,14.980394603,0.5162977440607073,667539.1580137269,2.4318451526440072,NVIDIA A100-SXM4-80GB,54.2063089555,0.006347197107681703,184480.371246258,0.672061992450118
+QED Process: 'ke->kkkkke' reduced,0.021871728,0.716956567,1.121625045,4982,7655,1.800816e6,32,10000000,0,82.035650126,0.3421310894344223,121898.22332901397,2.6545776094359375,NVIDIA A100-SXM4-80GB,321.789538108,NaN,31076.212293277757,0.6767466751107096
+ABC Process: 'AB->AB' reduced,0.000867035,0.002263493,0.007340721,34,37,9296.0,32,10000000,0,0.1877912925,0.0029540808349122686,5.325060532292784e7,2.8222820821151755,NVIDIA A100-SXM4-80GB,0.0016617045,1.5729813606955104e-5,6.01791714471496e9,318.9496086698929
+ABC Process: 'AB->ABBB' reduced,0.000547175,0.004720326,0.035918118,200,285,57156.0,32,10000000,0,0.257040364,0.007250633041861087,3.8904395575785905e7,28.59473074820264,NVIDIA A100-SXM4-80GB,0.003641165,3.2217340292524716e-5,2.74637375675093e9,2018.5847112119334
+ABC Process: 'AB->ABBBBB' reduced,0.019826198,0.258674017,1.136386232,4998,7671,1.507432e6,32,10000000,0,1.818710381,0.03353568966350073,5.498401562156146e6,119.7386908190744,NVIDIA A100-SXM4-80GB,0.492263776,0.0031065569742746986,2.031431213821429e7,442.38477543389257
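
For reference, a minimal sketch (not part of the patch) of reading the results file back and comparing the two backends; it assumes nothing beyond the column names in the header row above and the CSV/DataFrames packages already used in this repository:

using CSV, DataFrames

df = CSV.read("data/bench_results_32.csv", DataFrame)
# speedup > 1 means the CUDA kernel beat the 32-thread CPU run on that process
df.speedup = df.cpu_time ./ df.gpu_time
show(df[:, [:process_name, :cpu_time, :gpu_time, :speedup]], allrows = true)

In the rows above this puts the ABC processes far ahead on the GPU, while the larger QED processes still favor the CPU.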
diff --git a/data/evaluate_cpu_gpu_exec.jl b/data/evaluate_cpu_gpu_exec.jl
new file mode 100644
index 0000000..b7a3007
--- /dev/null
+++ b/data/evaluate_cpu_gpu_exec.jl
@@ -0,0 +1,143 @@
+using CSV
+using DataFrames
+using Plots
+using StatsPlots
+using LaTeXStrings
+
+if (length(ARGS) < 1)
+    println("Please use with \"input_file.csv\"")
+    exit(1)
+end
+
+processes = [
+    "QED Process: 'ke->ke'",
+    "QED Process: 'ke->kke'",
+    "QED Process: 'ke->kkke'",
+    "QED Process: 'ke->kkkke'",
+    "QED Process: 'ke->kkkkke'",
+    #"QED Process: 'ke->kkkkkke'",
+    #"QED Process: 'ke->kkkkkkke'",
+    "ABC Process: 'AB->AB'",
+    "ABC Process: 'AB->ABBB'",
+    "ABC Process: 'AB->ABBBBB'",
+]
+
+# number of photons in the final state of a QED process string
+function proc_to_n(str::AbstractString)
+    parts = split(str, "'")
+    parts = split(parts[2], "->")
+    k_count = count(c -> c == 'k', parts[2])
+    return k_count
+end
+
+# number of B particles in the final state of an ABC process string
+function abc_proc_to_n(str::AbstractString)
+    parts = split(str, "'")
+    parts = split(parts[2], "->")
+    b_count = count(c -> c == 'B', parts[2])
+    return b_count
+end
+
+function beautify_title(str::AbstractString)
+    parts = split(str, "'")
+
+    preprefix = parts[1]
+    infix = parts[2]
+    sufsuffix = parts[3]
+
+    parts = split(infix, "->")
+
+    prefix = parts[1]
+    suffix = parts[2]
+
+    k_count = count(c -> c == 'k', suffix)
+    B_count = count(c -> c == 'B', suffix)
+
+    if k_count == 1 || B_count == 1
+        new_suffix = suffix
+    elseif k_count >= 1
+        new_suffix = replace(suffix, r"k+" => "k^$k_count")
+    elseif B_count >= 1
+        new_suffix = replace(suffix, r"B+" => "B^$B_count")
+    end
+
+    return preprefix * L"%$prefix \rightarrow %$new_suffix" * sufsuffix
+end
+
+input_file = ARGS[1]
+df = CSV.read(input_file, DataFrame)
+n_inputs = df[:, "n_inputs"][1]
+
+title_string = "QED N-Photon Compton Scattering\nCalculate 10,000,000 Matrix Elements"
+
+df_filt = filter(:process_name => x -> proc_to_n(x) >= 1, df)
+
+df_filt.process_size = @. proc_to_n(df_filt.process_name)
+
+df_red = filter(:process_name => x -> match(r" reduced$", x) !== nothing, df_filt)
+
+@df df_red scatter(
+    :process_size,
+    :cpu_time,
+    yerror = :cpu_std,
+    label = "CPU execution time, 32 threads (s)",
+    markersize = 6,
+)
+@df df_red scatter!(
+    :process_size,
+    :gpu_time,
+    yerror = :gpu_std,
+    label = "GPU execution time, A100 80GB (s)",
+    markersize = 6,
+)
+
+plot!(
+    title = title_string,
+    yscale = :log10,
+    legend = :outerbottom,
+    legendcolumns = 2,
+    legend_font_pointsize = 10,
+    size = (800, 600),
+    ylabel = "time (s)",
+    xlabel = "process size (#)",
+)
+
+savefig("cpu_vs_gpu_qed.pdf")
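
As a quick sanity check of the title helper above: beautify_title collapses a run of identical outgoing particles into a power via the k+/B+ regex replacement, so a call like the following should yield something close to

julia> beautify_title("QED Process: 'ke->kkke' reduced")
L"QED Process: $ke \rightarrow k^3e$ reduced"

(the exact display depends on how LaTeXStrings concatenates with plain strings).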
+
+
+title_string = "\$AB\\rightarrow AB^n\$ ABC Processes\nCalculate 10,000,000 Matrix Elements"
+
+df_filt = filter(:process_name => x -> abc_proc_to_n(x) >= 1, df)
+
+df_filt.process_size = @. abc_proc_to_n(df_filt.process_name)
+
+df_red = filter(:process_name => x -> match(r" reduced$", x) !== nothing, df_filt)
+
+@df df_red scatter(
+    :process_size,
+    :cpu_time,
+    yerror = :cpu_std,
+    label = "CPU execution time, 32 threads (s)",
+    markersize = 6,
+)
+@df df_red scatter!(
+    :process_size,
+    :gpu_time,
+    yerror = :gpu_std,
+    label = "GPU execution time, A100 80GB (s)",
+    markersize = 6,
+)
+
+plot!(
+    title = title_string,
+    yscale = :log10,
+    legend = :outerbottom,
+    legendcolumns = 2,
+    legend_font_pointsize = 10,
+    size = (800, 600),
+    ylabel = "time (s)",
+    xlabel = "process size (#)",
+)
+
+savefig("cpu_vs_gpu_abc.pdf")
diff --git a/examples/full_node_bench.jl b/examples/full_node_bench.jl
index 351096a..e4b2efa 100644
--- a/examples/full_node_bench.jl
+++ b/examples/full_node_bench.jl
@@ -80,10 +80,13 @@ function cpu_worker(compute_func, inputs, chunk_size)
 end
 
 # called with a specific device selected
-function gpu_worker(compute_func, inputs, chunk_size)
+function gpu_worker(kernel!, inputs, chunk_size)
     global progress
     global gpu_chunks
     global lck
+    cuOutputs = CuVector{ComplexF64}()
+    resize!(cuOutputs, chunk_size)
+
     quit = false
     work_start = 0
     work_end = 0
@@ -104,7 +107,9 @@
         end
 
         cuInputs = CuVector(inputs[work_start:work_end])
-        compute_func.(cuInputs)
+        ts = 32
+        bs = Int(chunk_size / ts)
+        CUDA.@sync @cuda threads = ts blocks = bs always_inline = true kernel!(cuInputs, cuOutputs, chunk_size)
     end
 
     #log("GPU Worker on Device $(CUDA.device()) finished!")
@@ -114,7 +119,7 @@ end
 
 cpu_gpu_ratio = Vector{Tuple{Int, Int}}()
 
-function full_compute(compute_func, inputs, chunk_size)
+function full_compute(compute_func, kernel!, inputs, chunk_size)
     global progress
     progress = 1
     global cpu_chunks
@@ -126,7 +131,7 @@
     for dev in CUDA.devices()
         t = Threads.@spawn device!(dev) do
-            gpu_worker(compute_func, inputs, chunk_size)
+            gpu_worker(kernel!, inputs, chunk_size)
             return nothing
         end
         push!(tasks, t)
@@ -145,12 +150,12 @@
     return nothing
 end
 
-function bench(compute_function, inputs, chunk_size)
+function bench(compute_function, kernel!, inputs, chunk_size)
     global cpu_gpu_ratio
     empty!(cpu_gpu_ratio)
 
     bench = @benchmark begin
-        full_compute($compute_function, $inputs, $chunk_size)
+        full_compute($compute_function, $kernel!, $inputs, $chunk_size)
     end gcsample = true seconds = 30
     time = median(bench.times) / 1e9
@@ -165,7 +170,7 @@
     return (time, rate, s, med_cpu_chunks, med_gpu_chunks)
 end
 
-function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, chunk_size, inputs)
+function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, kernel!, chunk_size, inputs)
     process_name = string(process)
     log("\n--- Benchmarking $(process_name) on $(nInputs) with chunk size $(chunk_size) ---")
@@ -173,7 +178,7 @@
     display.(CUDA.devices())
 
     log("Benchmarking full node...")
-    (time, rate, s, med_cpu_chunks, med_gpu_chunks) = bench(func, inputs, chunk_size)
+    (time, rate, s, med_cpu_chunks, med_gpu_chunks) = bench(func, kernel!, inputs, chunk_size)
     log(
         "Benchmarking complete with median time $(time), $(med_cpu_chunks) cpu chunks, and $(med_gpu_chunks) gpu chunks.",
     )
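
The launch pattern introduced in gpu_worker above, shown in isolation: the generated kernels take (inputs, outputs, n) and compute one matrix element per thread. The toy kernel below is a stand-in, not the generated code; its index arithmetic is the standard CUDA.jl scheme and is an assumption here, not taken from this repository:

using CUDA

# stand-in for a kernel! produced by get_cuda_kernel(graph, process, machine)
function toy_kernel!(inputs, outputs, n)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= n
        @inbounds outputs[i] = inputs[i] * inputs[i]  # placeholder for the real matrix element
    end
    return nothing
end

n = 1024
cuInputs = CuVector(rand(ComplexF64, n))
cuOutputs = CuVector{ComplexF64}(undef, n)
ts = 32          # threads per block, as in gpu_worker
bs = cld(n, ts)  # enough blocks to cover all n inputs
CUDA.@sync @cuda threads = ts blocks = bs toy_kernel!(cuInputs, cuOutputs, n)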
@@ -212,14 +217,14 @@ machine = Machine(
 )
 
 optimizer = ReductionOptimizer()
-processes = [#="ke->ke", "ke->kke", "ke->kkke", =#"ke->kkkke", "ke->kkkkke"]
+processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]
 
 for proc in processes
     process = parse_process(proc, QEDModel())
     graph = gen_graph(process)
     optimize_to_fixpoint!(optimizer, graph)
-    func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-
+    compute_func = get_compute_function(graph, process, machine)
+    kernel! = get_cuda_kernel(graph, process, machine)
 
     log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
     inputs = Vector{typeof(gen_process_input(process))}()
@@ -234,7 +239,7 @@
     end
 
     for chunk_size in chunkSizes
-        full_node_bench(process, compute_func, chunk_size, inputs)
+        full_node_bench(process, compute_func, kernel!, chunk_size, inputs)
         CSV.write(results_filename, df)
     end
 end;
diff --git a/examples/qed_bench.jl b/examples/qed_bench.jl
index 07fc7dd..fbbf1d2 100644
--- a/examples/qed_bench.jl
+++ b/examples/qed_bench.jl
@@ -44,7 +44,7 @@ if isfile(results_filename)
     df = CSV.read(results_filename, DataFrame)
 end
 
-nInputs = 10_000_000
+nInputs = 2^24
 
 function cpu_bench(compute_function, inputs)
     bench = @benchmark begin
@@ -60,9 +60,15 @@
     return (time, rate, s)
 end
 
-function gpu_bench(compute_function, inputs)
+function gpu_bench(kernel!, inputs)
+    n = length(inputs)
+    outputs = CuVector{ComplexF64}()
+    resize!(outputs, n)
+    ts = 32
+    bs = Int(n / ts)
     bench = @benchmark begin
-        CUDA.@sync $compute_function.($inputs)
+        @cuda threads = $ts blocks = $bs always_inline = true $kernel!($inputs, $outputs, $n)
+        CUDA.device_synchronize()
     end gcsample = true seconds = 300
     time = median(bench.times) / 1e9
@@ -77,6 +83,7 @@ function bench_process(
     process_name::String,
     graph::DAG,
     func,
+    kernel!,
     gen_time::Float64,
     opt_time::Float64,
     func_time::Float64;
@@ -131,7 +138,7 @@
         log("Benchmarking GPU...")
        gpu_name = "$(name(first(CUDA.devices())))"
         cuInputs = CuArray(inputs)
-        (time_gpu, rate_gpu, std_gpu) = gpu_bench(func, cuInputs)
+        (time_gpu, rate_gpu, std_gpu) = gpu_bench(kernel!, cuInputs)
         flops_gpu = (rate_gpu * NFLOPs) / 10^9
     else
         log("Skipping GPU...")
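
One caveat in gpu_bench above: bs = Int(n / ts) throws an InexactError whenever n is not a multiple of 32. The new nInputs = 2^24 is divisible, so the script is safe as written, but a shape-agnostic variant would round up, assuming (as in the toy kernel sketched earlier) that the kernel bounds-checks its index:

ts = 32
n = 10_000_001   # deliberately not a multiple of 32
bs = cld(n, ts)  # 312_501 blocks; the last block is partially idle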
@@ -211,7 +218,8 @@ process = parse_process("ke->kke", QEDModel())
 gen_time = @elapsed graph = gen_graph(process)
 opt_time = @elapsed optimize!(optimizer, graph, 200)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "warmup", graph, compute_func, gen_time, opt_time, func_gen_time)
+kernel! = get_cuda_kernel(graph, process, machine)
+bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
 
 optimizer = ReductionOptimizer()
 
@@ -220,104 +228,45 @@ process = parse_process("AB->ABBB", ABCModel())
 gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
 opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
 func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "warmup", graph, compute_func, gen_time, opt_time, func_gen_time)
+kernel! = get_cuda_kernel(graph, process, machine)
+bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
 
 ## -- WARMUP END
 
 optimizer = ReductionOptimizer()
 
-# compton
-process = parse_process("ke->ke", QEDModel())
-gen_time = @elapsed graph = gen_graph(process)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
+processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke", "ke->kkkkkke"]
 
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
+for process_str in processes
+    # n-photon compton
+    process = parse_process(process_str, QEDModel())
+    gen_time = @elapsed graph = gen_graph(process)
+    func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+    kernel! = get_cuda_kernel(graph, process, machine)
+    bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
 
-CSV.write(results_filename, df)
+    opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
+    func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+    kernel! = get_cuda_kernel(graph, process, machine)
+    bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
 
-# 2-photon compton
-process = parse_process("ke->kke", QEDModel())
-gen_time = @elapsed graph = gen_graph(process)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
+    CSV.write(results_filename, df)
+end
 
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
+processes = ["AB->AB", "AB->ABBB", "AB->ABBBBB"]
 
-CSV.write(results_filename, df)
+for process_str in processes
+    # ABC process
+    process = parse_process(process_str, ABCModel())
+    gen_time = @elapsed graph = parse_dag("input/$(process_str).txt", ABCModel())
+    func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+    kernel! = get_cuda_kernel(graph, process, machine)
+    bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
 
-# 3-photon compton
-process = parse_process("ke->kkke", QEDModel())
-gen_time = @elapsed graph = gen_graph(process)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
+    opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
+    func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
+    kernel! = get_cuda_kernel(graph, process, machine)
+    bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
 
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
-
-CSV.write(results_filename, df)
-
-# 4-photon compton
-process = parse_process("ke->kkkke", QEDModel())
-gen_time = @elapsed graph = gen_graph(process)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time, use_gpu = false)
-
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
-
-CSV.write(results_filename, df)
-
-# 5-photon compton
-process = parse_process("ke->kkkkke", QEDModel())
-gen_time = @elapsed graph = gen_graph(process)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
-
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
-
-CSV.write(results_filename, df)
-
-# AB->AB
-process = parse_process("AB->AB", ABCModel())
-gen_time = @elapsed graph = parse_dag("input/AB->AB.txt", ABCModel())
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
-
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
-
-CSV.write(results_filename, df)
-
-# AB->AB^3
-process = parse_process("AB->ABBB", ABCModel())
-gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
-
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
-
-CSV.write(results_filename, df)
-
-# AB->AB^5
-process = parse_process("AB->ABBBBB", ABCModel())
-gen_time = @elapsed graph = parse_dag("input/AB->ABBBBB.txt", ABCModel())
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
-
-opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
-func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
-bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
-
-CSV.write(results_filename, df)
+    CSV.write(results_filename, df)
+end
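
The $ signs inside the @benchmark blocks above are load-bearing: BenchmarkTools evaluates the expression in its own scope, so locals such as ts, bs, and kernel! must be spliced in with $ or they are looked up as (undefined) globals. A minimal illustration, independent of this repository:

using BenchmarkTools, Statistics

x = rand(1000)
b = @benchmark sum($x)   # $x splices the local; a bare x would be a global lookup
median(b.times) / 1e9    # median run time in seconds, the same estimate cpu_bench and gpu_bench report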
diff --git a/examples/qed_bench_reduction_steps_gpu.jl b/examples/qed_bench_reduction_steps_gpu.jl
index 49c3a71..39e1239 100644
--- a/examples/qed_bench_reduction_steps_gpu.jl
+++ b/examples/qed_bench_reduction_steps_gpu.jl
@@ -37,6 +37,7 @@ function log(x...)
 end
 
 function bench(func, inputs)
+    # todo: use gpu kernel instead of broadcasting
     gpu_compile_time = @elapsed func.(inputs[1:2])
 
     gpu_time = @benchmark $func.($inputs)
diff --git a/experiments/full_node.sh b/experiments/full_node.sh
index c046bfe..1e9462e 100755
--- a/experiments/full_node.sh
+++ b/experiments/full_node.sh
@@ -15,10 +15,11 @@ nvidia-smi > results/cuda_gpu_full_node.txt
 lsblk > results/storage_full_node.txt
 lspci > results/pci_full_node.txt
 
-#echo "Initiating julia..."
-#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
-#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
+echo "Initiating julia..."
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl", rev="fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
+julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
 julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
 
 echo "Benchmarking Full Node 128 Threads + *GPUs*"
-julia --project --threads=128 examples/full_node_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
+julia --project -O3 --threads=128 examples/full_node_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
diff --git a/experiments/run_gen_diagram.sh b/experiments/run_gen_diagram.sh
index 7b04f55..6d5fd83 100755
--- a/experiments/run_gen_diagram.sh
+++ b/experiments/run_gen_diagram.sh
@@ -19,8 +19,9 @@ lspci > results/pci.txt
 
 echo "Initiating julia..."
 julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl", rev="fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
 julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("BenchmarkTools"); Pkg.add("StatsBase")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
 
 echo "Benchmarking with $i threads..."
-julia --project --threads=$i examples/qed_gen_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
+julia --project -O3 --threads=$i examples/qed_gen_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
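
The branch installs added to these scripts rely on Pkg's rev keyword, since a GitHub .../tree/<branch> URL is not itself a clonable repository. The equivalent call from a Julia REPL:

using Pkg
Pkg.add(url = "https://github.com/AntonReinhard/QEDbase.jl", rev = "fix_bs_multiplication")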
diff --git a/experiments/run_qed_exec.sh b/experiments/run_qed_exec.sh
index d735dc3..21fed28 100755
--- a/experiments/run_qed_exec.sh
+++ b/experiments/run_qed_exec.sh
@@ -18,13 +18,14 @@ nvidia-smi > results/cuda_gpu_$i.txt
 lsblk > results/storage_$i.txt
 lspci > results/pci_$i.txt
 
-#echo "Initiating julia..."
-#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
-#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
+echo "Initiating julia..."
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl", rev="fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
+julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
 julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
 
 echo "Benchmarking $i Threads"
-julia --project --threads=$i examples/qed_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
+julia --project -O3 --threads=$i examples/qed_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
 
 echo "Benchmarking Tape variant $i Threads"
-julia --project --threads=$i examples/qed_bench_tape.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
+julia --project -O3 --threads=$i examples/qed_bench_tape.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
diff --git a/experiments/run_reduce_bench.sh b/experiments/run_reduce_bench.sh
index 48684b4..0135e1c 100755
--- a/experiments/run_reduce_bench.sh
+++ b/experiments/run_reduce_bench.sh
@@ -15,9 +15,10 @@ nvidia-smi > results/cuda_gpu_bench_reduce.txt
 lsblk > results/storage_bench_reduce.txt
 lspci > results/pci_bench_reduce.txt
 
-#echo "Initiating julia..."
-#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
-#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
+echo "Initiating julia..."
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl", rev="fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
+julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
 
 echo "Benchmarking Reduction 32 Threads"
-julia --project --threads=32 examples/qed_bench_reduction_steps.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
+julia --project -O3 --threads=32 examples/qed_bench_reduction_steps.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
diff --git a/experiments/run_reduce_bench_gpu.sh b/experiments/run_reduce_bench_gpu.sh
index a980c4b..989bdc0 100755
--- a/experiments/run_reduce_bench_gpu.sh
+++ b/experiments/run_reduce_bench_gpu.sh
@@ -15,9 +15,10 @@ nvidia-smi > results/cuda_gpu_bench_reduce_gpu.txt
 lsblk > results/storage_bench_reduce_gpu.txt
 lspci > results/pci_bench_reduce_gpu.txt
 
-#echo "Initiating julia..."
-#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
-#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
+echo "Initiating julia..."
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
+julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl", rev="fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
+julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
 julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
 
 echo "Benchmarking Reduction 32 Threads, *GPU*"
diff --git a/src/MetagraphOptimization.jl b/src/MetagraphOptimization.jl
index f4ea95f..f17c8b1 100644
--- a/src/MetagraphOptimization.jl
+++ b/src/MetagraphOptimization.jl
@@ -78,7 +78,7 @@ export gen_graph
 export execute
 export parse_dag, parse_process
 export gen_process_input
-export get_compute_function
+export get_compute_function, get_cuda_kernel
 export gen_tape, execute_tape
 
 # estimator
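
With get_cuda_kernel exported, the GPU path of the bench scripts reduces to the following sketch. The Machine construction is elided (it is built the same way as in examples/full_node_bench.jl), and the kernel signature (inputs, outputs, n) is the one used throughout this patch:

using MetagraphOptimization, CUDA

process = parse_process("ke->kke", QEDModel())
graph = gen_graph(process)
optimize_to_fixpoint!(ReductionOptimizer(), graph)

machine = Machine(...)  # construction elided; see examples/full_node_bench.jl
kernel! = get_cuda_kernel(graph, process, machine)

n = 2^10
cuInputs = CuVector([gen_process_input(process) for _ in 1:n])
cuOutputs = CuVector{ComplexF64}(undef, n)
CUDA.@sync @cuda threads = 32 blocks = cld(n, 32) always_inline = true kernel!(cuInputs, cuOutputs, n)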