experiments #1

Merged
rubydragon merged 39 commits from experiments into main 2024-05-08 12:03:28 +02:00
11 changed files with 236 additions and 124 deletions
Showing only changes of commit d036f21862 - Show all commits

View File

@ -0,0 +1,9 @@
process_name,graph_gen_time,optimization_time,function_generation_time,graph_nodes,graph_edges,graph_mem,cpu_threads,n_inputs,nflops_likwid,cpu_time,cpu_std,cpu_rate,cpu_gflops,gpu_name,gpu_time,gpu_std,gpu_rate,gpu_gflops
QED Process: 'ke->ke' reduced,0.004851193,0.001290078,0.006093401,26,29,6948.0,32,10000000,0,0.2810178885,0.00909457898005121,3.5584923270818755e7,1.886000933353394,NVIDIA A100-SXM4-80GB,0.4060797745,0.0013320688448668838,2.462570319418851e7,1.305162269291991
QED Process: 'ke->kke' reduced,0.001065397,0.010432606,0.014287271,59,77,16383.0,32,10000000,0,0.823029796,0.01692859562197734,1.2150228398292398e7,2.223491796887509,NVIDIA A100-SXM4-80GB,2.3333098275,0.0014037935241043983,4.285757460128814e6,0.784293615203573
QED Process: 'ke->kkke' reduced,0.001348518,0.005210738,0.034243651,188,273,54426.0,32,10000000,0,2.9432864705,0.031053960614444084,3.397562588700793e6,2.497208502695083,NVIDIA A100-SXM4-80GB,10.340032588,0.0028660606476431714,967114.9403924877,0.7108294811884784
QED Process: 'ke->kkkke' reduced,0.004413783,0.039469525,0.15704043,853,1295,243781.0,32,10000000,0,14.980394603,0.5162977440607073,667539.1580137269,2.4318451526440072,NVIDIA A100-SXM4-80GB,54.2063089555,0.006347197107681703,184480.371246258,0.672061992450118
QED Process: 'ke->kkkkke' reduced,0.021871728,0.716956567,1.121625045,4982,7655,1.800816e6,32,10000000,0,82.035650126,0.3421310894344223,121898.22332901397,2.6545776094359375,NVIDIA A100-SXM4-80GB,321.789538108,NaN,31076.212293277757,0.6767466751107096
ABC Process: 'AB->AB' reduced,0.000867035,0.002263493,0.007340721,34,37,9296.0,32,10000000,0,0.1877912925,0.0029540808349122686,5.325060532292784e7,2.8222820821151755,NVIDIA A100-SXM4-80GB,0.0016617045,1.5729813606955104e-5,6.01791714471496e9,318.9496086698929
ABC Process: 'AB->ABBB' reduced,0.000547175,0.004720326,0.035918118,200,285,57156.0,32,10000000,0,0.257040364,0.007250633041861087,3.8904395575785905e7,28.59473074820264,NVIDIA A100-SXM4-80GB,0.003641165,3.2217340292524716e-5,2.74637375675093e9,2018.5847112119334
ABC Process: 'AB->ABBBBB' reduced,0.019826198,0.258674017,1.136386232,4998,7671,1.507432e6,32,10000000,0,1.818710381,0.03353568966350073,5.498401562156146e6,119.7386908190744,NVIDIA A100-SXM4-80GB,0.492263776,0.0031065569742746986,2.031431213821429e7,442.38477543389257
1 process_name graph_gen_time optimization_time function_generation_time graph_nodes graph_edges graph_mem cpu_threads n_inputs nflops_likwid cpu_time cpu_std cpu_rate cpu_gflops gpu_name gpu_time gpu_std gpu_rate gpu_gflops
2 QED Process: 'ke->ke' reduced 0.004851193 0.001290078 0.006093401 26 29 6948.0 32 10000000 0 0.2810178885 0.00909457898005121 3.5584923270818755e7 1.886000933353394 NVIDIA A100-SXM4-80GB 0.4060797745 0.0013320688448668838 2.462570319418851e7 1.305162269291991
3 QED Process: 'ke->kke' reduced 0.001065397 0.010432606 0.014287271 59 77 16383.0 32 10000000 0 0.823029796 0.01692859562197734 1.2150228398292398e7 2.223491796887509 NVIDIA A100-SXM4-80GB 2.3333098275 0.0014037935241043983 4.285757460128814e6 0.784293615203573
4 QED Process: 'ke->kkke' reduced 0.001348518 0.005210738 0.034243651 188 273 54426.0 32 10000000 0 2.9432864705 0.031053960614444084 3.397562588700793e6 2.497208502695083 NVIDIA A100-SXM4-80GB 10.340032588 0.0028660606476431714 967114.9403924877 0.7108294811884784
5 QED Process: 'ke->kkkke' reduced 0.004413783 0.039469525 0.15704043 853 1295 243781.0 32 10000000 0 14.980394603 0.5162977440607073 667539.1580137269 2.4318451526440072 NVIDIA A100-SXM4-80GB 54.2063089555 0.006347197107681703 184480.371246258 0.672061992450118
6 QED Process: 'ke->kkkkke' reduced 0.021871728 0.716956567 1.121625045 4982 7655 1.800816e6 32 10000000 0 82.035650126 0.3421310894344223 121898.22332901397 2.6545776094359375 NVIDIA A100-SXM4-80GB 321.789538108 NaN 31076.212293277757 0.6767466751107096
7 ABC Process: 'AB->AB' reduced 0.000867035 0.002263493 0.007340721 34 37 9296.0 32 10000000 0 0.1877912925 0.0029540808349122686 5.325060532292784e7 2.8222820821151755 NVIDIA A100-SXM4-80GB 0.0016617045 1.5729813606955104e-5 6.01791714471496e9 318.9496086698929
8 ABC Process: 'AB->ABBB' reduced 0.000547175 0.004720326 0.035918118 200 285 57156.0 32 10000000 0 0.257040364 0.007250633041861087 3.8904395575785905e7 28.59473074820264 NVIDIA A100-SXM4-80GB 0.003641165 3.2217340292524716e-5 2.74637375675093e9 2018.5847112119334
9 ABC Process: 'AB->ABBBBB' reduced 0.019826198 0.258674017 1.136386232 4998 7671 1.507432e6 32 10000000 0 1.818710381 0.03353568966350073 5.498401562156146e6 119.7386908190744 NVIDIA A100-SXM4-80GB 0.492263776 0.0031065569742746986 2.031431213821429e7 442.38477543389257

View File

@ -0,0 +1,143 @@
using CSV
using DataFrames
using Plots
using StatsPlots
using LaTeXStrings
# Abort with a usage hint when no input CSV was passed on the command line.
# Without the early exit the script would continue and fail on `ARGS[1]` further down.
if isempty(ARGS)
    println("Please use with \"input_file.csv\"")
    exit(1)
end
# Names of the processes covered by the benchmarks: QED Compton scattering with one to
# five outgoing photons, followed by the ABC processes with one, three and five outgoing
# B particles.
# The 6- and 7-photon QED processes ('ke->kkkkkke', 'ke->kkkkkkke') are left out for now.
processes = vcat(
    ["QED Process: 'ke->$("k"^n)e'" for n in 1:5],
    ["ABC Process: 'AB->A$("B"^n)'" for n in (1, 3, 5)],
)
"""
    proc_to_n(str::AbstractString)

Return how many `k` characters appear on the right-hand side of the process that is
enclosed in single quotes inside `str`.

The quoted part is split at `->` and only the part after the arrow is counted, e.g.
`"QED Process: 'ke->kke' reduced"` gives `2`. A string without a quoted section or
without `->` raises a `BoundsError`.
"""
function proc_to_n(str::AbstractString)
    quoted = split(str, "'")[2]
    outgoing = split(quoted, "->")[2]
    return count(==('k'), outgoing)
end
"""
    abc_proc_to_n(str::AbstractString)

Return how many `B` characters appear on the right-hand side of the process that is
enclosed in single quotes inside `str`.

The quoted part is split at `->` and only the part after the arrow is counted, e.g.
`"ABC Process: 'AB->ABBB' reduced"` gives `3`. A string without a quoted section or
without `->` raises a `BoundsError`.
"""
function abc_proc_to_n(str::AbstractString)
    quoted = split(str, "'")[2]
    outgoing = split(quoted, "->")[2]
    return count(==('B'), outgoing)
end
"""
    beautify_title(str::AbstractString)

Build a plot title from a process name of the form `prefix 'in->out' suffix`.

The text before and after the single-quoted process is kept as is. The quoted process
is split at `->` and re-assembled inside an `L"..."` literal with `\\rightarrow` between
both sides. In the outgoing side, runs of `k` (or, failing that, runs of `B`) are
replaced by `k^n` / `B^n`, where `n` is the total number of that character. If either
character occurs exactly once, or neither occurs at all, the outgoing side is left
unchanged.

A string without a quoted section or without `->` raises a `BoundsError`.
"""
function beautify_title(str::AbstractString)
    parts = split(str, "'")
    preprefix = parts[1]
    infix = parts[2]
    sufsuffix = parts[3]

    parts = split(infix, "->")
    prefix = parts[1]
    suffix = parts[2]

    k_count = count(==('k'), suffix)
    B_count = count(==('B'), suffix)

    new_suffix = if k_count == 1 || B_count == 1
        suffix
    elseif k_count >= 1
        replace(suffix, r"k+" => "k^$k_count")
    elseif B_count >= 1
        replace(suffix, r"B+" => "B^$B_count")
    else
        # Neither `k` nor `B` in the final state: nothing to abbreviate. Without this
        # branch `new_suffix` would be undefined below.
        suffix
    end

    return preprefix * L"%$prefix \rightarrow %$new_suffix" * sufsuffix
end
"""
    format_count(n)

Return `n` as a string. Integers get `,` inserted as thousands separator
(`10000000` becomes `"10,000,000"`); any other value is passed through `string`.
"""
format_count(n::Integer) = replace(string(n), r"(?<=\d)(?=(\d{3})+$)" => ",")
format_count(n) = string(n)

"""
    plot_cpu_vs_gpu(df, size_of, title_string, output_file)

Create a scatter plot of CPU and GPU execution times over the process size and save it
to `output_file`.

# Arguments
- `df`: benchmark results; the columns `process_name`, `cpu_time`, `cpu_std`,
  `gpu_time` and `gpu_std` are read.
- `size_of`: function mapping a process name to its size. Only rows with a size of at
  least 1 are kept, which is also how the rows of one process family are selected.
- `title_string`: title of the plot.
- `output_file`: path handed to `savefig`.

Only rows whose process name ends in `" reduced"` are plotted. `df` itself is not
modified; the `process_size` column is added to a filtered copy.
"""
function plot_cpu_vs_gpu(df, size_of, title_string, output_file)
    selected = filter(:process_name => name -> size_of(name) >= 1, df)
    selected.process_size = size_of.(selected.process_name)
    reduced = filter(:process_name => name -> occursin(r" reduced$", name), selected)

    @df reduced scatter(
        :process_size,
        :cpu_time,
        yerror = :cpu_std,
        label = "CPU execution time, 32 threads (s)",
        markersize = 6,
    )
    @df reduced scatter!(
        :process_size,
        :gpu_time,
        yerror = :gpu_std,
        label = "GPU execution time, A100 80GB (s)",
        markersize = 6,
    )
    plot!(
        title = title_string,
        yscale = :log10,
        legend = :outerbottom,
        legendcolumns = 2,
        legend_font_pointsize = 10,
        size = (800, 600),
        ylabel = "time (s)",
        xlabel = "process size (#)",
    )
    savefig(output_file)
    return nothing
end

input_file = ARGS[1]
df = CSV.read(input_file, DataFrame)

# The number of evaluated inputs is taken from the first row and shown in the titles.
n_inputs = first(df[!, "n_inputs"])
count_label = format_count(n_inputs)

# QED: process size is the number of outgoing photons.
plot_cpu_vs_gpu(
    df,
    proc_to_n,
    "QED N-Photon Compton Scattering\nCalculate $(count_label) Matrix Elements",
    "cpu_vs_gpu_qed.pdf",
)

# ABC: process size is the number of outgoing B particles.
plot_cpu_vs_gpu(
    df,
    abc_proc_to_n,
    "\$AB\\rightarrow AB^n\$ ABC Processes\nCalculate $(count_label) Matrix Elements",
    "cpu_vs_gpu_abc.pdf",
)

View File

@ -80,10 +80,13 @@ function cpu_worker(compute_func, inputs, chunk_size)
end
# called with a specific device selected
function gpu_worker(compute_func, inputs, chunk_size)
function gpu_worker(kernel!, inputs, chunk_size)
global progress
global gpu_chunks
global lck
cuOutputs = CuVector{ComplexF64}()
resize!(cuOutputs, chunk_size)
quit = false
work_start = 0
work_end = 0
@ -104,7 +107,9 @@ function gpu_worker(compute_func, inputs, chunk_size)
end
cuInputs = CuVector(inputs[work_start:work_end])
compute_func.(cuInputs)
ts = 32
bs = Int(chunk_size / 32)
CUDA.@sync threads = ts blocks = bs always_inline = true kernel!(cuInputs, cuOutputs, chunk_size)
end
#log("GPU Worker on Device $(CUDA.device()) finished!")
@ -114,7 +119,7 @@ end
cpu_gpu_ratio = Vector{Tuple{Int, Int}}()
function full_compute(compute_func, inputs, chunk_size)
function full_compute(compute_func, kernel!, inputs, chunk_size)
global progress
progress = 1
global cpu_chunks
@ -126,7 +131,7 @@ function full_compute(compute_func, inputs, chunk_size)
for dev in CUDA.devices()
t = Threads.@spawn device!(dev) do
gpu_worker(compute_func, inputs, chunk_size)
gpu_worker(kernel!, inputs, chunk_size)
return nothing
end
push!(tasks, t)
@ -145,12 +150,12 @@ function full_compute(compute_func, inputs, chunk_size)
return nothing
end
function bench(compute_function, inputs, chunk_size)
function bench(compute_function, kernel!, inputs, chunk_size)
global cpu_gpu_ratio
empty!(cpu_gpu_ratio)
bench = @benchmark begin
full_compute($compute_function, $inputs, $chunk_size)
full_compute($compute_function, $kernel!, $inputs, $chunk_size)
end gcsample = true seconds = 30
time = median(bench.times) / 1e9
@ -165,7 +170,7 @@ function bench(compute_function, inputs, chunk_size)
return (time, rate, s, med_cpu_chunks, med_gpu_chunks)
end
function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, chunk_size, inputs)
function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, kernel!, chunk_size, inputs)
process_name = string(process)
log("\n--- Benchmarking $(process_name) on $(nInputs) with chunk size $(chunk_size) ---")
@ -173,7 +178,7 @@ function full_node_bench(process::MetagraphOptimization.AbstractProcessDescripti
display.(CUDA.devices())
log("Benchmarking full node...")
(time, rate, s, med_cpu_chunks, med_gpu_chunks) = bench(func, inputs, chunk_size)
(time, rate, s, med_cpu_chunks, med_gpu_chunks) = bench(func, kernel!, inputs, chunk_size)
log(
"Benchmarking complete with median time $(time), $(med_cpu_chunks) cpu chunks, and $(med_gpu_chunks) gpu chunks.",
)
@ -212,14 +217,14 @@ machine = Machine(
)
optimizer = ReductionOptimizer()
processes = [#="ke->ke", "ke->kke", "ke->kkke", =#"ke->kkkke", "ke->kkkkke"]
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]
for proc in processes
process = parse_process(proc, QEDModel())
graph = gen_graph(process)
optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
inputs = Vector{typeof(gen_process_input(process))}()
@ -234,7 +239,7 @@ for proc in processes
end
for chunk_size in chunkSizes
full_node_bench(process, compute_func, chunk_size, inputs)
full_node_bench(process, compute_func, kernel!, chunk_size, inputs)
CSV.write(results_filename, df)
end
end;

View File

@ -44,7 +44,7 @@ if isfile(results_filename)
df = CSV.read(results_filename, DataFrame)
end
nInputs = 10_000_000
nInputs = 2^24
function cpu_bench(compute_function, inputs)
bench = @benchmark begin
@ -60,9 +60,15 @@ function cpu_bench(compute_function, inputs)
return (time, rate, s)
end
function gpu_bench(compute_function, inputs)
function gpu_bench(kernel!, inputs)
n = length(inputs)
outputs = CuVector{ComplexF64}()
resize!(outputs, n)
ts = 32
bs = Int(n / ts)
bench = @benchmark begin
CUDA.@sync $compute_function.($inputs)
@cuda threads = ts blocks = bs always_inline = true kernel!.($inputs, $outputs, $n)
CUDA.device_synchronize()
end gcsample = true seconds = 300
time = median(bench.times) / 1e9
@ -77,6 +83,7 @@ function bench_process(
process_name::String,
graph::DAG,
func,
kernel!,
gen_time::Float64,
opt_time::Float64,
func_time::Float64;
@ -131,7 +138,7 @@ function bench_process(
log("Benchmarking GPU...")
gpu_name = "$(name(first(CUDA.devices())))"
cuInputs = CuArray(inputs)
(time_gpu, rate_gpu, std_gpu) = gpu_bench(func, cuInputs)
(time_gpu, rate_gpu, std_gpu) = gpu_bench(kernel!, cuInputs)
flops_gpu = (rate_gpu * NFLOPs) / 10^9
else
log("Skipping GPU...")
@ -211,7 +218,8 @@ process = parse_process("ke->kke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
opt_time = @elapsed optimize!(optimizer, graph, 200)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "warmup", graph, compute_func, gen_time, opt_time, func_gen_time)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
optimizer = ReductionOptimizer()
@ -220,104 +228,45 @@ process = parse_process("AB->ABBB", ABCModel())
gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "warmup", graph, compute_func, gen_time, opt_time, func_gen_time)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
## -- WARMUP END
optimizer = ReductionOptimizer()
# compton
process = parse_process("ke->ke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke", "ke->kkkkkke"]
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
for process_str in processes
# compton
process = parse_process(process_str, QEDModel())
gen_time = @elapsed graph = gen_graph(process)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
CSV.write(results_filename, df)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
# 2-photon compton
process = parse_process("ke->kke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
CSV.write(results_filename, df)
end
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
processes = ["AB->AB", "AB->ABBB", "AB->ABBBBB"]
CSV.write(results_filename, df)
for process_str in processes
# AB->AB
process = parse_process(process_str, ABCModel())
gen_time = @elapsed graph = parse_dag("input/$(process_str).txt", ABCModel())
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "$process not optimized", graph, compute_func, kernel!, gen_time, 0.0, func_gen_time)
# 3-photon compton
process = parse_process("ke->kkke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
kernel! = get_cuda_kernel(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, kernel!, gen_time, opt_time, func_gen_time)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
CSV.write(results_filename, df)
# 4-photon compton
process = parse_process("ke->kkkke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time, use_gpu = false)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
CSV.write(results_filename, df)
# 5-photon compton
process = parse_process("ke->kkkkke", QEDModel())
gen_time = @elapsed graph = gen_graph(process)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
CSV.write(results_filename, df)
# AB->AB
process = parse_process("AB->AB", ABCModel())
gen_time = @elapsed graph = parse_dag("input/AB->AB.txt", ABCModel())
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
CSV.write(results_filename, df)
# AB->AB^3
process = parse_process("AB->ABBB", ABCModel())
gen_time = @elapsed graph = parse_dag("input/AB->ABBB.txt", ABCModel())
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
CSV.write(results_filename, df)
# AB->AB^5
process = parse_process("AB->ABBBBB", ABCModel())
gen_time = @elapsed graph = parse_dag("input/AB->ABBBBB.txt", ABCModel())
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
#bench_process(process, "$process not optimized", graph, compute_func, gen_time, 0.0, func_gen_time)
opt_time = @elapsed optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
bench_process(process, "$process reduced", graph, compute_func, gen_time, opt_time, func_gen_time)
CSV.write(results_filename, df)
CSV.write(results_filename, df)
end

View File

@ -37,6 +37,7 @@ function log(x...)
end
function bench(func, inputs)
# todo: use gpu kernel instead of broadcasting
gpu_compile_time = @elapsed func.(inputs[1:2])
gpu_time = @benchmark $func.($inputs)

View File

@ -15,10 +15,11 @@ nvidia-smi > results/cuda_gpu_full_node.txt
lsblk > results/storage_full_node.txt
lspci > results/pci_full_node.txt
#echo "Initiating julia..."
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
echo "Initiating julia..."
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
echo "Benchmarking Full Node 128 Threads + *GPUs*"
julia --project --threads=128 examples/full_node_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
julia --project -O3 --threads=128 examples/full_node_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"

View File

@ -19,8 +19,9 @@ lspci > results/pci.txt
echo "Initiating julia..."
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("BenchmarkTools"); Pkg.add("StatsBase")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
echo "Benchmarking with $i threads..."
julia --project --threads=$i examples/qed_gen_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
julia --project -O3 --threads=$i examples/qed_gen_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"

View File

@ -18,13 +18,14 @@ nvidia-smi > results/cuda_gpu_$i.txt
lsblk > results/storage_$i.txt
lspci > results/pci_$i.txt
#echo "Initiating julia..."
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
echo "Initiating julia..."
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
echo "Benchmarking $i Threads"
julia --project --threads=$i examples/qed_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
julia --project -O3 --threads=$i examples/qed_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
echo "Benchmarking Tape variant $i Threads"
julia --project --threads=$i examples/qed_bench_tape.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
julia --project -O3 --threads=$i examples/qed_bench_tape.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"

View File

@ -15,9 +15,10 @@ nvidia-smi > results/cuda_gpu_bench_reduce.txt
lsblk > results/storage_bench_reduce.txt
lspci > results/pci_bench_reduce.txt
#echo "Initiating julia..."
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
echo "Initiating julia..."
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
echo "Benchmarking Reduction 32 Threads"
julia --project --threads=32 examples/qed_bench_reduction_steps.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
julia --project -O3 --threads=32 examples/qed_bench_reduction_steps.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"

View File

@ -15,9 +15,10 @@ nvidia-smi > results/cuda_gpu_bench_reduce_gpu.txt
lsblk > results/storage_bench_reduce_gpu.txt
lspci > results/pci_bench_reduce_gpu.txt
#echo "Initiating julia..."
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
echo "Initiating julia..."
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("LIKWID"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
echo "Benchmarking Reduction 32 Threads, *GPU*"

View File

@ -78,7 +78,7 @@ export gen_graph
export execute
export parse_dag, parse_process
export gen_process_input
export get_compute_function
export get_compute_function, get_cuda_kernel
export gen_tape, execute_tape
# estimator