experiments #1
.gitignoreProject.toml
data
bench_results_32.csvbench_results_hemera.csvbench_results_reduction_steps.csvbench_results_reduction_steps_gpu.csvevaluate.jlevaluate_cpu_gpu_exec.jlevaluate_gen.jlevaluate_gen_one_sided_comparison.jlevaluate_gpu.jlevaluate_reduce_bench.jlfwk8999_gpu_results_a198f37f8e02a3dfdc31679d0c7126633832ea84.csvfwk8999results.csvqed_gen_results.csvqed_gen_results_hemera_6a02f3bee6f84c044413deb3dfedddb62d60289b.csvqed_gen_results_hemera_a198f37f8e02a3dfdc31679d0c7126633832ea84.csvqed_gen_results_onesided.csvresults_home.csvresults_home_julia.logsingularity_homepc_results.csv
docs
examples
Project.tomlfull_node_bench.jlimport_bench.jlqed_bench.jlqed_bench_reduction_steps.jlqed_bench_reduction_steps_gpu.jlqed_bench_tape.jlqed_gen_bench.jlreduction.ipynb
experiments
CUDA_container.defdiagram_bench_hemera.shdiagram_bench_hemera_a100.shfull_node.shfull_node_hemera.shgen_diagram_hemera.shreduce_bench_hemera.shreduce_bench_hemera_gpu.shrun_gen_diagram.shrun_qed_exec.shrun_reduce_bench.shrun_reduce_bench_gpu.sh
images
AB->ABBBBB_reduction_bench.pdfAB->ABBB_reduction_bench.pdfcompton_diagram_gen_comparison.pdfcompton_graph_size_reduced.pdfcompton_graph_size_unreduced.pdfcompton_graph_size_versus.pdfcpu_vs_gpu_abc.pdfcpu_vs_gpu_qed.pdfgen_memory.pdfgen_times.pdfgpu_perf_NVIDIA A30.pdfgpu_rate_NVIDIA A30.pdfgpu_times_NVIDIA A30.pdfke->kke_reduction_bench.pdfke->kkke_reduction_bench.pdfke->kkke_reduction_bench_lin.pdfke->kkkke_reduction_bench.pdfke->kkkkke_reduction_bench.pdfqed_ke-kke_exec_10000_inputs.pdfqed_ke-kke_graph_properties.pdfqed_ke-kkke_exec_10000_inputs.pdfqed_ke-kkke_graph_properties.pdfreduction_bench_relative.pdfreduction_bench_relative_gpu.pdf
threaded_execution_data_home
ABC Process: 'AB->AB'_performance.pdfABC Process: 'AB->AB'_rate.pdfABC Process: 'AB->AB'_time.pdfABC Process: 'AB->ABBB'_performance.pdfABC Process: 'AB->ABBB'_rate.pdfABC Process: 'AB->ABBB'_time.pdfABC Process: 'AB->ABBBBB'_performance.pdfABC Process: 'AB->ABBBBB'_rate.pdfABC Process: 'AB->ABBBBB'_time.pdfQED Process: 'ke->ke'_performance.pdfQED Process: 'ke->ke'_rate.pdfQED Process: 'ke->ke'_time.pdfQED Process: 'ke->kke'_performance.pdfQED Process: 'ke->kke'_rate.pdfQED Process: 'ke->kke'_time.pdfQED Process: 'ke->kkke'_performance.pdfQED Process: 'ke->kkke'_rate.pdfQED Process: 'ke->kkke'_time.pdfQED Process: 'ke->kkkke'_performance.pdfQED Process: 'ke->kkkke'_rate.pdfQED Process: 'ke->kkkke'_time.pdfQED Process: 'ke->kkkkke'_performance.pdfQED Process: 'ke->kkkkke'_rate.pdfQED Process: 'ke->kkkkke'_time.pdfQED Process: 'ke->kkkkkke'_performance.pdfQED Process: 'ke->kkkkkke'_rate.pdfQED Process: 'ke->kkkkkke'_time.pdfQED Process: 'ke->kkkkkkke'_performance.pdfQED Process: 'ke->kkkkkkke'_rate.pdfQED Process: 'ke->kkkkkkke'_time.pdfgen_times_16_threads.pdfgen_times_1_threads.pdfgen_times_2_threads.pdfgen_times_4_threads.pdfgen_times_8_threads.pdf
notebooks
src
test
@ -12,7 +12,7 @@ using Base.Threads
|
||||
|
||||
"""
    log(x...)

Print the current timestamp (from `now()`) followed by all arguments, separated
by single spaces, as one line on `stdout`, then flush `stdout`.

Returns `nothing`.
"""
function log(x...)
    # `join` already produces a single string; splatting it would hand every
    # character to `println` as a separate argument for no benefit.
    println(now(), " ", join(x, " "))
    # Flush so the line is not left sitting in the output buffer.
    flush(stdout)
    return nothing
end
|
||||
|
||||
@ -29,6 +29,7 @@ df = DataFrame(
|
||||
rate = Float64[],
|
||||
cpu_chunks = Float64[],
|
||||
gpu_chunks = Float64[],
|
||||
memory_est = Float64[],
|
||||
)
|
||||
|
||||
# if they exist, read existing results and append new ones
|
||||
@ -36,7 +37,7 @@ if isfile(results_filename)
|
||||
df = CSV.read(results_filename, DataFrame)
|
||||
end
|
||||
|
||||
nInputs = 16_777_216 # 2^24
|
||||
nInputs = 2^26
|
||||
|
||||
lck = ReentrantLock()
|
||||
|
||||
@ -109,7 +110,8 @@ function gpu_worker(kernel!, inputs, chunk_size)
|
||||
cuInputs = CuVector(inputs[work_start:work_end])
|
||||
ts = 32
|
||||
bs = Int(chunk_size / 32)
|
||||
CUDA.@sync threads = ts blocks = bs always_inline = true kernel!(cuInputs, cuOutputs, chunk_size)
|
||||
@cuda threads = ts blocks = bs always_inline = true kernel!(cuInputs, cuOutputs, chunk_size)
|
||||
CUDA.device_synchronize()
|
||||
end
|
||||
|
||||
#log("GPU Worker on Device $(CUDA.device()) finished!")
|
||||
@ -156,7 +158,7 @@ function bench(compute_function, kernel!, inputs, chunk_size)
|
||||
|
||||
bench = @benchmark begin
|
||||
full_compute($compute_function, $kernel!, $inputs, $chunk_size)
|
||||
end gcsample = true seconds = 30
|
||||
end gcsample = true seconds = 60
|
||||
|
||||
time = median(bench.times) / 1e9
|
||||
s = std(bench.times) / 1e9
|
||||
@ -164,10 +166,11 @@ function bench(compute_function, kernel!, inputs, chunk_size)
|
||||
|
||||
med_cpu_chunks = median(getindex.(cpu_gpu_ratio, 1))
|
||||
med_gpu_chunks = median(getindex.(cpu_gpu_ratio, 2))
|
||||
mem_estimate = bench.memory
|
||||
|
||||
log("CPU/GPU ratios: $(cpu_gpu_ratio)")
|
||||
|
||||
return (time, rate, s, med_cpu_chunks, med_gpu_chunks)
|
||||
return (time, rate, s, med_cpu_chunks, med_gpu_chunks, mem_estimate)
|
||||
end
|
||||
|
||||
function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, kernel!, chunk_size, inputs)
|
||||
@ -178,7 +181,7 @@ function full_node_bench(process::MetagraphOptimization.AbstractProcessDescripti
|
||||
display.(CUDA.devices())
|
||||
|
||||
log("Benchmarking full node...")
|
||||
(time, rate, s, med_cpu_chunks, med_gpu_chunks) = bench(func, kernel!, inputs, chunk_size)
|
||||
(time, rate, s, med_cpu_chunks, med_gpu_chunks, mem_estimate) = bench(func, kernel!, inputs, chunk_size)
|
||||
log(
|
||||
"Benchmarking complete with median time $(time), $(med_cpu_chunks) cpu chunks, and $(med_gpu_chunks) gpu chunks.",
|
||||
)
|
||||
@ -196,6 +199,7 @@ function full_node_bench(process::MetagraphOptimization.AbstractProcessDescripti
|
||||
:rate => rate,
|
||||
:cpu_chunks => med_cpu_chunks,
|
||||
:gpu_chunks => med_gpu_chunks,
|
||||
:memory_est => mem_estimate,
|
||||
),
|
||||
)
|
||||
|
||||
|
@ -235,7 +235,7 @@ bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_tim
|
||||
|
||||
optimizer = ReductionOptimizer()
|
||||
|
||||
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke", "ke->kkkkkke"]
|
||||
processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]
|
||||
|
||||
for process_str in processes
|
||||
# compton
|
||||
@ -253,7 +253,7 @@ for process_str in processes
|
||||
CSV.write(results_filename, df)
|
||||
end
|
||||
|
||||
processes = ["AB->AB", "AB->ABBB", "AB->ABBBBB"]
|
||||
processes = ["AB->AB", "AB->ABBB", "AB->ABBBBB", "AB->ABBBBBBB"]
|
||||
|
||||
for process_str in processes
|
||||
# AB->AB
|
||||
|
@ -15,11 +15,11 @@ nvidia-smi > results/cuda_gpu_full_node.txt
|
||||
lsblk > results/storage_full_node.txt
|
||||
lspci > results/pci_full_node.txt
|
||||
|
||||
echo "Initiating julia..."
|
||||
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
|
||||
julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
|
||||
julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
|
||||
julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
|
||||
#echo "Initiating julia..."
|
||||
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1 # need current dev version of QEDprocesses
|
||||
#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
|
||||
#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1 # add requirements for the bench script
|
||||
#julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
|
||||
|
||||
echo "Benchmarking Full Node 128 Threads + *GPUs*"
|
||||
julia --project -O3 --threads=128 examples/full_node_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
|
||||
|
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user