full node bench testing

Anton Reinhard 2024-03-05 18:24:13 +01:00
parent b7f8e4a6b3
commit b39bc480a1
7 changed files with 68 additions and 41 deletions

View File

@@ -100,7 +100,7 @@ for process in processes
plot!(
title = ("$(beautify_title(process)) Reduction Progression ($(n_inputs) Inputs)"),
xscale = :linear,
yscale = :log10,
yscale = :linear,
#ylim = (0, ymax),
legend = :outerbottom,
minorgrid = true,
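For context, a minimal Plots.jl sketch with hypothetical data showing the attributes touched in this hunk; the only functional change is the y-axis scale moving from :log10 to :linear:

using Plots

xs = 1:20
ys = 10.0 .^ range(0, 3; length = 20)   # hypothetical stand-in for the reduction-progression series

# :linear keeps absolute differences readable; :log10 would compress the upper decades
plot(xs, ys; xscale = :linear, yscale = :linear, minorgrid = true, legend = :outerbottom)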

View File

@@ -12,7 +12,8 @@ using Base.Threads
function log(x...)
println(now(), " ", join(x, " ")...)
return flush(stdout)
#flush(stdout)
return nothing
end
results_filename = "full_node_bench.csv"
@@ -26,8 +27,8 @@ df = DataFrame(
time = Float64[],
std = Float64[],
rate = Float64[],
ratio_cpu = Float64[],
ratio_gpu = Float64[],
cpu_chunks = Float64[],
gpu_chunks = Float64[],
)
# if they exist, read existing results and append new ones
@@ -35,19 +36,20 @@ if isfile(results_filename)
df = CSV.read(results_filename, DataFrame)
end
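A compact sketch of the results bookkeeping this hunk changes (CSV.jl and DataFrames.jl as in the script; the file name, the pushed values, and the column types of the leading fields are assumptions): the ratio_cpu/ratio_gpu columns are replaced by raw cpu_chunks/gpu_chunks counts, existing results are reused if the file is present, and each run appends one row.

using CSV, DataFrames

results_file = "full_node_bench_example.csv"   # hypothetical name for this sketch

df = DataFrame(
    process_name = String[],
    cpu_threads = Int[],
    gpu_devices = Int[],
    n_inputs = Int[],
    chunk_size = Int[],
    time = Float64[],
    std = Float64[],
    rate = Float64[],
    cpu_chunks = Float64[],   # median number of chunks claimed by CPU workers
    gpu_chunks = Float64[],   # median number of chunks claimed by GPU workers
)

# resume from earlier runs if a results file already exists
if isfile(results_file)
    df = CSV.read(results_file, DataFrame)
end

# after each benchmark, append one row and persist immediately (hypothetical values)
push!(
    df,
    Dict(
        :process_name => "ke->ke",
        :cpu_threads => 124,
        :gpu_devices => 4,
        :n_inputs => 2^30,
        :chunk_size => 65536,
        :time => 1.0,
        :std => 0.1,
        :rate => 2^30 / 1.0,
        :cpu_chunks => 100.0,
        :gpu_chunks => 900.0,
    ),
)
CSV.write(results_file, df)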
nInputs = 100_000_000
nInputs = 1_073_741_824 # 2^30
lck = SpinLock()
lck = ReentrantLock()
progress = 1
cpu_chunks = 0
gpu_chunks = 0
chunkSizes = [100, 1_000, 10_000, 50_000, 100_000]
chunkSizes = [1024, 4096, 16384, 65536, 262144, 1048576] # 2^10 to 2^20
function cpu_worker(compute_func, inputs, chunk_size)
global progress
global cpu_chunks
global lck
quit = false
work_start = 0
work_end = 0
@@ -58,8 +60,9 @@ function cpu_worker(compute_func, inputs, chunk_size)
else
work_start = progress
progress = progress + chunk_size
work_end = min(progress, nInputs)
work_end = min(progress - 1, nInputs)
cpu_chunks = cpu_chunks + 1
#log("CPU Worker $(Threads.threadid()) computing $(cpu_chunks)th cpu chunk ($work_start, $work_end)")
end
end
if quit
@@ -71,6 +74,8 @@ function cpu_worker(compute_func, inputs, chunk_size)
end
end
#log("CPU Worker on $(Threads.threadid()) finished!")
return nothing
end
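The CPU and GPU workers share one pattern: claim a chunk of the input range under the lock, then process it outside the lock. A self-contained sketch of that pattern under the commit's assumptions (a shared progress counter, chunk counters, and a ReentrantLock); WorkQueue and claim_chunk! are hypothetical names introduced for the sketch:

# Shared state for the work-splitting: a next-index counter and a chunk counter,
# guarded by a ReentrantLock (the commit switches from SpinLock to ReentrantLock).
mutable struct WorkQueue
    next_index::Int
    chunks_claimed::Int
    lock::ReentrantLock
end
WorkQueue() = WorkQueue(1, 0, ReentrantLock())

# Claim the next chunk as an inclusive range, or return nothing when the work is exhausted.
function claim_chunk!(queue::WorkQueue, n_total::Int, chunk_size::Int)
    lock(queue.lock)
    try
        queue.next_index > n_total && return nothing
        work_start = queue.next_index
        queue.next_index += chunk_size
        work_end = min(queue.next_index - 1, n_total)   # inclusive bound, matching the `progress - 1` fix above
        queue.chunks_claimed += 1
        return (work_start, work_end)
    finally
        unlock(queue.lock)
    end
end

# A CPU worker: keep claiming chunks and broadcasting `compute_func` until the queue drains.
function cpu_worker_sketch(compute_func, inputs, queue::WorkQueue, chunk_size::Int)
    while true
        chunk = claim_chunk!(queue, length(inputs), chunk_size)
        chunk === nothing && break
        (a, b) = chunk
        compute_func.(view(inputs, a:b))
    end
    return nothing
end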
@@ -78,6 +83,7 @@ end
function gpu_worker(compute_func, inputs, chunk_size)
global progress
global gpu_chunks
global lck
quit = false
work_start = 0
work_end = 0
@@ -88,8 +94,9 @@ function gpu_worker(compute_func, inputs, chunk_size)
else
work_start = progress
progress = progress + chunk_size
work_end = min(progress, nInputs)
work_end = min(progress - 1, nInputs)
gpu_chunks = gpu_chunks + 1
#log("GPU Worker $(CUDA.device()) computing $(gpu_chunks)th gpu chunk ($work_start, $work_end)")
end
end
if quit
@@ -100,30 +107,33 @@ function gpu_worker(compute_func, inputs, chunk_size)
compute_func.(cuInputs)
end
#log("GPU Worker on Device $(CUDA.device()) finished!")
return nothing
end
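The GPU worker differs from the CPU one only in where the chunk is evaluated: the claimed slice is copied to the task's active device and compute_func is broadcast over it. A sketch assuming CUDA.jl and the hypothetical claim_chunk!/WorkQueue helpers from the previous sketch:

using CUDA

function gpu_worker_sketch(compute_func, inputs, queue, chunk_size)
    while true
        chunk = claim_chunk!(queue, length(inputs), chunk_size)
        chunk === nothing && break
        (a, b) = chunk
        cu_inputs = CuArray(view(inputs, a:b))   # host -> device copy of just this chunk
        compute_func.(cu_inputs)                 # broadcast runs as a kernel on the current device
    end
    return nothing
end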
cpu_gpu_ratio = Vector{Tuple{Int, Int}}()
function full_compute(compute_func, inputs, chunk_size)
global progress = 1
global cpu_chunks = 0
global gpu_chunks = 0
global progress
progress = 1
global cpu_chunks
cpu_chunks = 0
global gpu_chunks
gpu_chunks = 0
tasks = Vector()
for dev in CUDA.devices()
t = @task device!(dev) do
t = Threads.@spawn device!(dev) do
gpu_worker(compute_func, inputs, chunk_size)
return nothing
end
schedule(t)
push!(tasks, t)
end
for i in 1:Threads.nthreads()
t = @task cpu_worker(compute_func, inputs, chunk_size)
schedule(t)
for i in 1:(Threads.nthreads() - length(CUDA.devices()))
t = Threads.@spawn cpu_worker(compute_func, inputs, chunk_size)
push!(tasks, t)
end
@@ -136,6 +146,9 @@ function full_compute(compute_func, inputs, chunk_size)
end
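How the workers get launched, sketched with the hypothetical helpers above: full_compute now uses Threads.@spawn instead of @task/schedule, pins one task to each CUDA device, and starts one CPU worker per remaining Julia thread:

using CUDA

function full_compute_sketch(compute_func, inputs, chunk_size)
    queue = WorkQueue()
    tasks = Task[]

    # one worker task per GPU, each binding its device before doing any work
    for dev in CUDA.devices()
        t = Threads.@spawn begin
            device!(dev)   # make `dev` the current device for this task
            gpu_worker_sketch(compute_func, inputs, queue, chunk_size)
        end
        push!(tasks, t)
    end

    # the remaining threads run CPU workers (one thread is reserved per GPU)
    for _ in 1:(Threads.nthreads() - length(CUDA.devices()))
        push!(tasks, Threads.@spawn cpu_worker_sketch(compute_func, inputs, queue, chunk_size))
    end

    wait.(tasks)   # block until every chunk has been processed
    return queue.chunks_claimed
end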
function bench(compute_function, inputs, chunk_size)
global cpu_gpu_ratio
empty!(cpu_gpu_ratio)
bench = @benchmark begin
full_compute($compute_function, $inputs, $chunk_size)
end gcsample = true seconds = 600
@@ -147,44 +160,37 @@ function bench(compute_function, inputs, chunk_size)
med_cpu_chunks = median(getindex.(cpu_gpu_ratio, 1))
med_gpu_chunks = median(getindex.(cpu_gpu_ratio, 2))
log("CPU/GPU ratios: $(cpu_gpu_ratio)")
return (time, rate, s, med_cpu_chunks, med_gpu_chunks)
end
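The timing itself is delegated to BenchmarkTools. A sketch of how median time, standard deviation, and a throughput rate can be extracted from a trial (hypothetical workload; BenchmarkTools reports times in nanoseconds):

using BenchmarkTools, Statistics

xs = rand(10_000)
trial = @benchmark sum($xs) gcsample = true seconds = 10   # same parameters as above, shorter time budget

t_median = median(trial.times) / 1e9   # seconds
t_std    = std(trial.times) / 1e9      # seconds
rate     = length(xs) / t_median       # inputs processed per second

println("median $(t_median) s, std $(t_std) s, rate $(rate) inputs/s")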
function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, chunk_size)
function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, chunk_size, inputs)
process_name = string(process)
log("\n--- Benchmarking $(process_name) on $(nInputs) with chunk size $(chunk_size) ---")
log("Available Cuda Devices:")
display.(CUDA.devices())
log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
inputs = Vector{typeof(gen_process_input(process))}()
resize!(inputs, nInputs)
processes = Vector{typeof(process)}()
for i in 1:Threads.nthreads()
push!(processes, copy(process))
end
@inbounds Threads.@threads for i in eachindex(inputs)
inputs[i] = gen_process_input(processes[Threads.nthreads()])
end
log("Benchmarking full node...")
(time, rate, s, med_cpu_chunks, med_gpu_chunks) = bench(func, inputs, chunk_size)
log(
"Benchmarking complete with median time $(time), $(med_cpu_chunks) cpu chunks, and $(med_gpu_chunks) gpu chunks.",
)
push!(
df,
Dict(
:process_name => process_name,
:cpu_threads => Threads.nthreads(),
:cpu_threads => Threads.nthreads() - length(CUDA.devices()),
:gpu_devices => length(CUDA.devices()),
:n_inputs => nInputs,
:chunk_size => chunk_size,
:time => time,
:std => s,
:rate => rate,
:ratio_cpu => med_cpu_chunks / (med_cpu_chunks + med_gpu_chunks),
:ratio_gpu => med_gpu_chunks / (med_cpu_chunks + med_gpu_chunks),
:cpu_chunks => med_cpu_chunks,
:gpu_chunks => med_gpu_chunks,
),
)
@@ -206,7 +212,7 @@ machine = Machine(
)
optimizer = ReductionOptimizer()
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke"]
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]
for proc in processes
process = parse_process(proc, QEDModel())
@@ -214,8 +220,21 @@ for proc in processes
optimize_to_fixpoint!(optimizer, graph)
func_gen_time = @elapsed compute_func = get_compute_function(graph, process, machine)
log("Generating $nInputs inputs with $(Threads.nthreads()) threads...")
inputs = Vector{typeof(gen_process_input(process))}()
resize!(inputs, nInputs)
procs = Vector{typeof(process)}()
for i in 1:Threads.nthreads()
push!(procs, copy(process))
end
@inbounds Threads.@threads for i in eachindex(inputs)
inputs[i] = gen_process_input(procs[Threads.nthreads()])
end
for chunk_size in chunkSizes
full_node_bench(process, compute_func, chunk_size)
full_node_bench(process, compute_func, chunk_size, inputs)
CSV.write(results_filename, df)
end
end;
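Input generation now happens once per process, before the chunk-size sweep, and is parallelised over all threads. A self-contained sketch of that fill pattern, with a hypothetical generator standing in for gen_process_input (the script additionally keeps one copy of the process per thread so the generator does not share mutable state):

# hypothetical stand-in for gen_process_input(process)
gen_input() = rand(4)

n = 1_000_000
inputs = Vector{typeof(gen_input())}(undef, n)   # allocate once, fill in parallel

@inbounds Threads.@threads for i in eachindex(inputs)
    inputs[i] = gen_input()
end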

View File

@@ -20,5 +20,5 @@ lspci > results/pci_full_node.txt
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
echo "Benchmarking Reduction 128 Threads, *GPU*"
echo "Benchmarking Full Node 128 Threads + *GPUs*"
julia --project --threads=128 examples/full_node_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"

View File

@@ -2,12 +2,12 @@
#SBATCH --job-name=qed_bench
#SBATCH --partition=casus_a100
#SBATCH --account=casus
#SBATCH --time=10:00:00
#SBATCH --time=8:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=128
#SBATCH --gres=gpu:4
#SBATCH --mem=256GB
#SBATCH --mem=2048GB
#SBATCH --output=simulation-%A-%a.out
#SBATCH --error=simulation-%A-%a.err

Binary file not shown.

View File

@@ -115,10 +115,18 @@ Linearly many FLOP with growing data.
"""
function compute(::ComputeTaskQED_Sum, data...)::ComplexF64
# TODO: want to use sum_kbn here but it doesn't seem to support ComplexF64, do it element-wise?
return sum(data)
s = 0.0im
for d in data
s += d
end
return s
end
function compute(::ComputeTaskQED_Sum, data::AbstractArray)::ComplexF64
# TODO: want to use sum_kbn here but it doesn't seem to support ComplexF64, do it element-wise?
return sum(data)
s = 0.0im
for d in data
s += d
end
return s
end
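The TODO above asks for sum_kbn; a compensated (Kahan) accumulation needs only + and -, so it also works for ComplexF64. A sketch of what an element-wise compensated sum could look like (not the commit's implementation, which uses a plain accumulation loop):

# Kahan-compensated sum: the correction term recovers low-order bits lost in each addition.
function kahan_sum(data)::ComplexF64
    s = 0.0im   # running sum
    c = 0.0im   # running compensation
    for d in data
        y = ComplexF64(d) - c
        t = s + y
        c = (t - s) - y   # what was lost when adding y to s
        s = t
    end
    return s
end

total = kahan_sum(rand(ComplexF64, 1_000))   # drop-in replacement for sum() with reduced rounding error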

View File

@@ -39,7 +39,7 @@ function get_function_call(node::ComputeTaskNode)
@assert length(children(node)) <= children(task(node)) "Node $(node) has too many children for its task: node has $(length(node.children)) versus task has $(children(task(node)))\nNode's children: $(getfield.(node.children, :children))"
@assert !ismissing(node.device) "Trying to get expression for an unscheduled ComputeTaskNode\nNode: $(node)"
if (length(node.children) <= 50)
if (length(node.children) <= 800)
#only use an SVector when there are few children
return get_function_call(
node.task,
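The threshold raised in this last hunk controls when the generated function call packs a node's children into an SVector. A hypothetical helper (StaticArrays assumed) illustrating the same choice:

using StaticArrays

# Small argument lists become a fixed-size SVector; past the threshold the compile-time cost
# of a huge static type outweighs the benefit, so a plain Vector is kept.
function pack_arguments(args::Vector, threshold::Int = 800)
    length(args) <= threshold && return SVector{length(args)}(args)
    return args
end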