experiments #1
9
data/bench_results_32.csv
Normal file
9
data/bench_results_32.csv
Normal file
@ -0,0 +1,9 @@
|
||||
process_name,graph_gen_time,optimization_time,function_generation_time,graph_nodes,graph_edges,graph_mem,cpu_threads,n_inputs,nflops_likwid,cpu_time,cpu_std,cpu_rate,cpu_gflops,gpu_name,gpu_time,gpu_std,gpu_rate,gpu_gflops
|
||||
QED Process: 'ke->ke' reduced,0.004851193,0.001290078,0.006093401,26,29,6948.0,32,10000000,0,0.2810178885,0.00909457898005121,3.5584923270818755e7,1.886000933353394,NVIDIA A100-SXM4-80GB,0.4060797745,0.0013320688448668838,2.462570319418851e7,1.305162269291991
|
||||
QED Process: 'ke->kke' reduced,0.001065397,0.010432606,0.014287271,59,77,16383.0,32,10000000,0,0.823029796,0.01692859562197734,1.2150228398292398e7,2.223491796887509,NVIDIA A100-SXM4-80GB,2.3333098275,0.0014037935241043983,4.285757460128814e6,0.784293615203573
|
||||
QED Process: 'ke->kkke' reduced,0.001348518,0.005210738,0.034243651,188,273,54426.0,32,10000000,0,2.9432864705,0.031053960614444084,3.397562588700793e6,2.497208502695083,NVIDIA A100-SXM4-80GB,10.340032588,0.0028660606476431714,967114.9403924877,0.7108294811884784
|
||||
QED Process: 'ke->kkkke' reduced,0.004413783,0.039469525,0.15704043,853,1295,243781.0,32,10000000,0,14.980394603,0.5162977440607073,667539.1580137269,2.4318451526440072,NVIDIA A100-SXM4-80GB,54.2063089555,0.006347197107681703,184480.371246258,0.672061992450118
|
||||
QED Process: 'ke->kkkkke' reduced,0.021871728,0.716956567,1.121625045,4982,7655,1.800816e6,32,10000000,0,82.035650126,0.3421310894344223,121898.22332901397,2.6545776094359375,NVIDIA A100-SXM4-80GB,321.789538108,NaN,31076.212293277757,0.6767466751107096
|
||||
ABC Process: 'AB->AB' reduced,0.000867035,0.002263493,0.007340721,34,37,9296.0,32,10000000,0,0.1877912925,0.0029540808349122686,5.325060532292784e7,2.8222820821151755,NVIDIA A100-SXM4-80GB,0.0016617045,1.5729813606955104e-5,6.01791714471496e9,318.9496086698929
|
||||
ABC Process: 'AB->ABBB' reduced,0.000547175,0.004720326,0.035918118,200,285,57156.0,32,10000000,0,0.257040364,0.007250633041861087,3.8904395575785905e7,28.59473074820264,NVIDIA A100-SXM4-80GB,0.003641165,3.2217340292524716e-5,2.74637375675093e9,2018.5847112119334
|
||||
ABC Process: 'AB->ABBBBB' reduced,0.019826198,0.258674017,1.136386232,4998,7671,1.507432e6,32,10000000,0,1.818710381,0.03353568966350073,5.498401562156146e6,119.7386908190744,NVIDIA A100-SXM4-80GB,0.492263776,0.0031065569742746986,2.031431213821429e7,442.38477543389257
|
|
143
data/evaluate_cpu_gpu_exec.jl
Normal file
143
data/evaluate_cpu_gpu_exec.jl
Normal file
@ -0,0 +1,143 @@
|
||||
using CSV
|
||||
using DataFrames
|
||||
using Plots
|
||||
using StatsPlots
|
||||
using LaTeXStrings
|
||||
|
||||
# The script needs the path of the benchmark results CSV as its first argument.
# Without it, print the usage hint and stop right away instead of failing later
# with a BoundsError when ARGS[1] is read.
if isempty(ARGS)
    println("Please use with \"input_file.csv\"")
    exit(1)
end
|
||||
|
||||
# Names of the benchmarked processes, generated from the number of outgoing
# photons (QED) and outgoing B particles (ABC).
# Extend the QED range to 1:7 to also include 'ke->kkkkkke' and 'ke->kkkkkkke'.
processes = vcat(
    ["QED Process: 'ke->" * "k"^n * "e'" for n in 1:5],
    ["ABC Process: 'AB->A" * "B"^n * "'" for n in (1, 3, 5)],
)
|
||||
|
||||
"""
    _outgoing_count(str::AbstractString, particle::AbstractChar)

Count how often `particle` occurs on the outgoing side of the process that is
quoted inside `str`, e.g. `"QED Process: 'ke->kkke' reduced"` with `'k'` gives 3.

Return 0 if `str` contains no quoted part or the quoted part has no `->`, so
that unrelated row names can simply be filtered out by the callers.
"""
function _outgoing_count(str::AbstractString, particle::AbstractChar)
    quoted = split(str, "'")
    length(quoted) >= 2 || return 0

    # quoted[2] is the text between the first pair of single quotes: "in->out"
    sides = split(quoted[2], "->")
    length(sides) >= 2 || return 0

    return count(==(particle), sides[2])
end

"""
    proc_to_n(str::AbstractString)

Return the number of outgoing photons (`k`) of the process named in `str`.
Names without a quoted `in->out` process yield 0.
"""
function proc_to_n(str::AbstractString)
    return _outgoing_count(str, 'k')
end

"""
    abc_proc_to_n(str::AbstractString)

Return the number of outgoing `B` particles of the process named in `str`.
Names without a quoted `in->out` process yield 0.
"""
function abc_proc_to_n(str::AbstractString)
    return _outgoing_count(str, 'B')
end
|
||||
|
||||
"""
    beautify_title(str::AbstractString)

Turn a process name such as `"QED Process: 'ke->kkke' reduced"` into a title in
which the quoted process is typeset as a LaTeX string: the arrow becomes
`\\rightarrow` and a repeated outgoing `k` (or `B`) is collapsed into a power,
e.g. `k^3e`. The text before and after the quoted process is kept unchanged.

If the outgoing side contains exactly one `k` or exactly one `B`, or neither
particle at all, it is used as is.

Throws a `BoundsError` if `str` does not contain a process enclosed in single
quotes or the quoted part has no `->`.
"""
function beautify_title(str::AbstractString)
    parts = split(str, "'")

    preprefix = parts[1]
    infix = parts[2]
    sufsuffix = parts[3]

    parts = split(infix, "->")

    prefix = parts[1]
    suffix = parts[2]

    k_count = count(==('k'), suffix)
    B_count = count(==('B'), suffix)

    # Default to the unchanged suffix so that processes without any k or B in
    # the final state no longer hit an undefined `new_suffix`.
    new_suffix = suffix
    if k_count == 1 || B_count == 1
        new_suffix = suffix
    elseif k_count >= 1
        new_suffix = replace(suffix, r"k+" => "k^$k_count")
    elseif B_count >= 1
        new_suffix = replace(suffix, r"B+" => "B^$B_count")
    end

    return preprefix * L"%$prefix \rightarrow %$new_suffix" * sufsuffix
end
|
||||
|
||||
# Load the benchmark results given on the command line.
input_file = ARGS[1]
df = CSV.read(input_file, DataFrame)

# Number of inputs of the benchmark run, taken from the first row.
n_inputs = df[1, "n_inputs"]
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# QED plot: CPU vs. GPU execution time over the number of outgoing photons.
# ---------------------------------------------------------------------------

# Format the input count read from the results file with thousands separators
# (10000000 -> "10,000,000") so the title always matches the plotted data
# instead of a hard-coded number.
n_inputs_str = replace(string(n_inputs), r"(?<=\d)(?=(\d{3})+$)" => ",")
title_string =
    "QED N-Photon Compton Scattering\n" * "Calculate $n_inputs_str Matrix Elements"

# Keep only rows whose process has at least one outgoing photon.
df_filt = filter(:process_name => x -> proc_to_n(x) >= 1, df)
df_filt.process_size = proc_to_n.(df_filt.process_name)

# Only rows whose name ends in " reduced" are plotted.
df_red = filter(:process_name => x -> endswith(x, " reduced"), df_filt)

@df df_red scatter(
    :process_size,
    :cpu_time,
    yerror = :cpu_std,
    label = "CPU execution time, 32 threads (s)",
    markersize = 6,
)
@df df_red scatter!(
    :process_size,
    :gpu_time,
    yerror = :gpu_std,
    label = "GPU execution time, A100 80GB (s)",
    markersize = 6,
)

plot!(
    title = title_string,
    yscale = :log10,
    legend = :outerbottom,
    legendcolumns = 2,
    legend_font_pointsize = 10,
    size = (800, 600),
    ylabel = "time (s)",
    xlabel = "process size (#)",
)

savefig("cpu_vs_gpu_qed.pdf")
|
||||
|
||||
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# ABC plot: CPU vs. GPU execution time over the number of outgoing B particles.
# ---------------------------------------------------------------------------

# Format the input count read from the results file with thousands separators
# (10000000 -> "10,000,000") so the title always matches the plotted data
# instead of a hard-coded number.
n_inputs_str = replace(string(n_inputs), r"(?<=\d)(?=(\d{3})+$)" => ",")
title_string =
    "\$AB\\rightarrow AB^n\$ ABC Processes\n" * "Calculate $n_inputs_str Matrix Elements"

# Keep only rows whose process has at least one outgoing B particle.
df_filt = filter(:process_name => x -> abc_proc_to_n(x) >= 1, df)
df_filt.process_size = abc_proc_to_n.(df_filt.process_name)

# Only rows whose name ends in " reduced" are plotted.
df_red = filter(:process_name => x -> endswith(x, " reduced"), df_filt)

@df df_red scatter(
    :process_size,
    :cpu_time,
    yerror = :cpu_std,
    label = "CPU execution time, 32 threads (s)",
    markersize = 6,
)
@df df_red scatter!(
    :process_size,
    :gpu_time,
    yerror = :gpu_std,
    label = "GPU execution time, A100 80GB (s)",
    markersize = 6,
)

plot!(
    title = title_string,
    yscale = :log10,
    legend = :outerbottom,
    legendcolumns = 2,
    legend_font_pointsize = 10,
    size = (800, 600),
    ylabel = "time (s)",
    xlabel = "process size (#)",
)

savefig("cpu_vs_gpu_abc.pdf")
|
@ -80,10 +80,13 @@ function cpu_worker(compute_func, inputs, chunk_size)
|
||||
end
|
||||
|
||||
# called with a specific device selected
|
||||
function gpu_worker(compute_func, inputs, chunk_size)
|
||||
function gpu_worker(kernel!, inputs, chunk_size)
|
||||
global progress
|
||||
global gpu_chunks
|
||||
global lck
|
||||
cuOutputs = CuVector{ComplexF64}()
|
||||
resize!(cuOutputs, chunk_size)
|
||||
|
||||
quit = false
|
||||
work_start = 0
|
||||
work_end = 0
|
||||
@ -104,7 +107,9 @@ function gpu_worker(compute_func, inputs, chunk_size)
|
||||
end
|
||||
|
||||
cuInputs = CuVector(inputs[work_start:work_end])
|
||||
compute_func.(cuInputs)
|
||||
ts = 32
|
||||
bs = Int(chunk_size / 32)
|
||||
CUDA.@sync threads = ts blocks = bs always_inline = true kernel!(cuInputs, cuOutputs, chunk_size)
|
||||
end
|
||||
|
||||
#log("GPU Worker on Device $(CUDA.device()) finished!")
|
||||
@ -114,7 +119,7 @@ end
|
||||
|
||||
cpu_gpu_ratio = Vector{Tuple{Int, Int}}()
|
||||
|
||||
function full_compute(compute_func, inputs, chunk_size)
|
||||
function full_compute(compute_func, kernel!, inputs, chunk_size)
|
||||
global progress
|
||||
progress = 1
|
||||
global cpu_chunks
|
||||
@ -126,7 +131,7 @@ function full_compute(compute_func, inputs, chunk_size)
|
||||
|
||||
for dev in CUDA.devices()
|
||||
t = Threads.@spawn device!(dev) do
|
||||
gpu_worker(compute_func, inputs, chunk_size)
|
||||
gpu_worker(kernel!, inputs, chunk_size)
|
||||
return nothing
|
||||
end
|
||||
push!(tasks, t)
|
||||
@ -145,12 +150,12 @@ function full_compute(compute_func, inputs, chunk_size)
|
||||
return nothing
|
||||
end
|
||||
|
||||
function bench(compute_function, inputs, chunk_size)
|
||||
function bench(compute_function, kernel!, inputs, chunk_size)
|
||||
global cpu_gpu_ratio
|
||||
empty!(cpu_gpu_ratio)
|
||||
|
||||
bench = @benchmark begin
|
||||
full_compute($compute_function, $inputs, $chunk_size)
|
||||
full_compute($compute_function, $kernel!, $inputs, $chunk_size)
|
||||
end gcsample = true seconds = 30
|
||||
|
||||
time = median(bench.times) / 1e9
|
||||
@ -165,7 +170,7 @@ function bench(compute_function, inputs, chunk_size)
|
||||
return (time, rate, s, med_cpu_chunks, med_gpu_chunks)
|
||||
end
|
||||
|
||||
function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, chunk_size, inputs)
|
||||
function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, kernel!, chunk_size, inputs)
|
||||
process_name = string(process)
|
||||
log("\n--- Benchmarking $(process_name) on $(nInputs) with chunk size $(chunk_size) ---")
|
||||
|
||||
@ -173,7 +178,7 @@ function full_node_bench(process::MetagraphOptimization.AbstractProcessDescripti
|
||||
display.(CUDA.devices())
|
||||
|
||||
log("Benchmarking full node...")
|
||||
(time, rate, s, med_cpu_chunks, med_gpu_chunks) = bench(func, inputs, chunk_size)
|
||||
(time, rate, s, med_cpu_chunks, med_gpu_chunks) = bench(func, kernel!, inputs, chunk_size)
|
||||
log(
|
||||
"Benchmarking complete with median time $(time), $(med_cpu_chunks) cpu chunks, and $(med_gpu_chunks) gpu chunks.",
|
||||
)
|
||||
@ -212,14 +217,14 @@ machine = Machine(
|
||||
)
|
||||
|
||||
optimizer = ReductionOptimizer()
|
||||
processes = [#="ke->ke", "ke->kke", "ke->kkke", =#"ke->kkkke", "ke->kkkkke"]
|
||||
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]
|
||||
|
||||
for proc in processes
|
||||
process = parse_process(proc, QEDModel())
|
||||
graph = gen_graph(process)
|
||||
optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
|
||||
compute_func = get_compute_function(graph, process, machine)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
|
||||
log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
|
||||
inputs = Vector{typeof(gen_process_input(process))}()
|
||||
@ -234,7 +239,7 @@ for proc in processes
|
||||
end
|
||||
|
||||
for chunk_size in chunkSizes
|
||||
full_node_bench(process, compute_func, chunk_size, inputs)
|
||||
full_node_bench(process, compute_func, kernel!, chunk_size, inputs)
|
||||
CSV.write(results_filename, df)
|
||||
end
|
||||
end;
|
||||
|
@ -44,7 +44,7 @@ if isfile(results_filename)
|
||||
df = CSV.read(results_filename, DataFrame)
|
||||
end
|
||||
|
||||
nInputs = 10_000_000
|
||||
nInputs = 2^24
|
||||
|
||||
function cpu_bench(compute_function, inputs)
|
||||
bench = @benchmark begin
|
||||
@ -60,9 +60,15 @@ function cpu_bench(compute_function, inputs)
|
||||
return (time, rate, s)
|
||||
end
|
||||
|
||||
function gpu_bench(compute_function, inputs)
|
||||
function gpu_bench(kernel!, inputs)
|
||||
n = length(inputs)
|
||||
outputs = CuVector{ComplexF64}()
|
||||
resize!(outputs, n)
|
||||
ts = 32
|
||||
bs = Int(n / ts)
|
||||
bench = @benchmark begin
|
||||
CUDA.@sync $compute_function.($inputs)
|
||||
@cuda threads = ts blocks = bs always_inline = true kernel!.($inputs, $outputs, $n)
|
||||
CUDA.device_synchronize()
|
||||
end gcsample = true seconds = 300
|
||||
|
||||
time = median(bench.times) / 1e9
|
||||
@ -77,6 +83,7 @@ function bench_process(
|
||||
process_name::String,
|
||||
graph::DAG,
|
||||
func,
|
||||
kernel!,
|
||||
gen_time::Float64,
|
||||
opt_time::Float64,
|
||||
func_time::Float64;
|
||||
@ -131,7 +138,7 @@ function bench_process(
|
||||
log("Benchmarking GPU...")
|
||||
gpu_name = "$(name(first(CUDA.devices())))"
|
||||
cuInputs = CuArray(inputs)
|
||||
(time_gpu, rate_gpu, std_gpu) = gpu_bench(func, cuInputs)
|
||||
(time_gpu, rate_gpu, std_gpu) = gpu_bench(kernel!, cuInputs)
|
||||
flops_gpu = (rate_gpu * NFLOPs) / 10^9
|
||||
else
|
||||
log("Skipping GPU...")
|
||||
@ -211,7 +218,8 @@ process = parse_process("ke->kke", QEDModel())
|
||||
gen_time = @elapsed graph = gen_graph(process)
|
||||
opt_time = @elapsed optimize!(optimizer, graph, 200)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "warmup", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
|
||||
|
||||
optimizer = ReductionOptimizer()
|
||||
|
||||
@ -220,104 +228,45 @@ process = parse_process("AB->ABBB", ABCModel())
|
||||
gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "warmup", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
|
||||
|
||||
## -- WARMUP END
|
||||
|
||||
optimizer = ReductionOptimizer()
|
||||
|
||||
# compton
|
||||
process = parse_process("ke->ke", QEDModel())
|
||||
gen_time = @elapsed graph = gen_graph(process)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
|
||||
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke", "ke->kkkkkke"]
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
for process_str in processes
|
||||
# compton
|
||||
process = parse_process(process_str, QEDModel())
|
||||
gen_time = @elapsed graph = gen_graph(process)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
|
||||
|
||||
# 2-photon compton
|
||||
process = parse_process("ke->kke", QEDModel())
|
||||
gen_time = @elapsed graph = gen_graph(process)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
|
||||
CSV.write(results_filename, df)
|
||||
end
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
processes = ["AB->AB", "AB->ABBB", "AB->ABBBBB"]
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
for process_str in processes
|
||||
# AB->AB
|
||||
process = parse_process(process_str, ABCModel())
|
||||
gen_time = @elapsed graph = parse_dag("input/$(process_str).txt", ABCModel())
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
|
||||
|
||||
# 3-photon compton
|
||||
process = parse_process("ke->kkke", QEDModel())
|
||||
gen_time = @elapsed graph = gen_graph(process)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
kernel! = get_cuda_kernel(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
|
||||
# 4-photon compton
|
||||
process = parse_process("ke->kkkke", QEDModel())
|
||||
gen_time = @elapsed graph = gen_graph(process)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time, use_gpu = false)
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
|
||||
# 5-photon compton
|
||||
process = parse_process("ke->kkkkke", QEDModel())
|
||||
gen_time = @elapsed graph = gen_graph(process)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
|
||||
# AB->AB
|
||||
process = parse_process("AB->AB", ABCModel())
|
||||
gen_time = @elapsed graph = parse_dag("input/AB->AB.txt", ABCModel())
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
|
||||
# AB->AB^3
|
||||
process = parse_process("AB->ABBB", ABCModel())
|
||||
gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
|
||||
# AB->AB^5
|
||||
process = parse_process("AB->ABBBBB", ABCModel())
|
||||
gen_time = @elapsed graph = parse_dag("input/AB->ABBBBB.txt", ABCModel())
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
|
||||
|
||||
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
|
||||
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
|
||||
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
|
||||
|
||||
CSV.write(results_filename, df)
|
||||
CSV.write(results_filename, df)
|
||||
end
|
||||
|
@ -37,6 +37,7 @@ function log(x...)
|
||||
end
|
||||
|
||||
function bench(func, inputs)
|
||||
# todo: use gpu kernel instead of broadcasting
|
||||
gpu_compile_time = @elapsed func.(inputs[1:2])
|
||||
|
||||
gpu_time = @benchmark $func.($inputs)
|
||||
|
@ -15,10 +15,11 @@ nvidia-smi > results/cuda_gpu_full_node.txt
|
||||
lsblk > results/storage_full_node.txt
|
||||
lspci > results/pci_full_node.txt
|
||||
|
||||
#echo "Initiating julia..."
|
||||
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
|
||||
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
|
||||
echo "Initiating julia..."
|
||||
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
|
||||
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
|
||||
julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
|
||||
julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
|
||||
|
||||
echo "Benchmarking Full Node 128 Threads + *GPUs*"
|
||||
julia --project --threads=128 examples/full_node_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
|
||||
julia --project -O3 --threads=128 examples/full_node_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
|
||||
|
@ -19,8 +19,9 @@ lspci > results/pci.txt
|
||||
|
||||
echo "Initiating julia..."
|
||||
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
|
||||
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
|
||||
julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("BenchmarkTools"); Pkg.add("StatsBase")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
|
||||
|
||||
echo "Benchmarking with $i threads..."
|
||||
|
||||
julia --project --threads=$i examples/qed_gen_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
|
||||
julia --project -O3 --threads=$i examples/qed_gen_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
|
||||
|
@ -18,13 +18,14 @@ nvidia-smi > results/cuda_gpu_$i.txt
|
||||
lsblk > results/storage_$i.txt
|
||||
lspci > results/pci_$i.txt
|
||||
|
||||
#echo "Initiating julia..."
|
||||
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
|
||||
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
|
||||
echo "Initiating julia..."
|
||||
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
|
||||
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
|
||||
julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
|
||||
julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
|
||||
|
||||
echo "Benchmarking $i Threads"
|
||||
julia --project --threads=$i examples/qed_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
|
||||
julia --project -O3 --threads=$i examples/qed_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
|
||||
|
||||
echo "Benchmarking Tape variant $i Threads"
|
||||
julia --project --threads=$i examples/qed_bench_tape.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
|
||||
julia --project -O3 --threads=$i examples/qed_bench_tape.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
|
||||
|
@ -15,9 +15,10 @@ nvidia-smi > results/cuda_gpu_bench_reduce.txt
|
||||
lsblk > results/storage_bench_reduce.txt
|
||||
lspci > results/pci_bench_reduce.txt
|
||||
|
||||
#echo "Initiating julia..."
|
||||
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
|
||||
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
|
||||
echo "Initiating julia..."
|
||||
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
|
||||
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
|
||||
julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
|
||||
|
||||
echo "Benchmarking Reduction 32 Threads"
|
||||
julia --project --threads=32 examples/qed_bench_reduction_steps.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
|
||||
julia --project -O3 --threads=32 examples/qed_bench_reduction_steps.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
|
||||
|
@ -15,9 +15,10 @@ nvidia-smi > results/cuda_gpu_bench_reduce_gpu.txt
|
||||
lsblk > results/storage_bench_reduce_gpu.txt
|
||||
lspci > results/pci_bench_reduce_gpu.txt
|
||||
|
||||
#echo "Initiating julia..."
|
||||
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
|
||||
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
|
||||
echo "Initiating julia..."
|
||||
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
|
||||
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
|
||||
julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
|
||||
julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
|
||||
|
||||
echo "Benchmarking Reduction 32 Threads, *GPU*"
|
||||
|
@ -78,7 +78,7 @@ export gen_graph
|
||||
export execute
|
||||
export parse_dag, parse_process
|
||||
export gen_process_input
|
||||
export get_compute_function
|
||||
export get_compute_function, get_cuda_kernel
|
||||
export gen_tape, execute_tape
|
||||
|
||||
# estimator
|
||||
|
Loading…
x
Reference in New Issue
Block a user