2024-05-08 12:03:28 +02:00
123 changed files with 15321 additions and 655 deletions
--- a/examples/full_node_bench.jl
+++ b/examples/full_node_bench.jl
@ -12,7 +12,7 @@ using Base.Threads

 function log(x...)
    println(now(), " ", join(x, " ")...)
-    #flush(stdout)
+    flush(stdout)
    return nothing
 end

@ -29,6 +29,7 @@ df = DataFrame(
    rate = Float64[],
    cpu_chunks = Float64[],
    gpu_chunks = Float64[],
+    memory_est = Float64[],
 )

 # if they exist, read existing results and append new ones
@ -36,7 +37,7 @@ if isfile(results_filename)
    df = CSV.read(results_filename, DataFrame)
 end

-nInputs = 16_777_216 # 2^30
+nInputs = 2^26

 lck = ReentrantLock()

@ -109,7 +110,8 @@ function gpu_worker(kernel!, inputs, chunk_size)
        cuInputs = CuVector(inputs[work_start:work_end])
        ts = 32
        bs = Int(chunk_size / 32)
-        CUDA.@sync threads = ts blocks = bs always_inline = true kernel!(cuInputs, cuOutputs, chunk_size)
+        @cuda threads = ts blocks = bs always_inline = true kernel!(cuInputs, cuOutputs, chunk_size)
+        CUDA.device_synchronize()
    end

    #log("GPU Worker on Device $(CUDA.device()) finished!")
@ -156,7 +158,7 @@ function bench(compute_function, kernel!, inputs, chunk_size)

    bench = @benchmark begin
        full_compute($compute_function, $kernel!, $inputs, $chunk_size)
-    end gcsample = true seconds = 30
+    end gcsample = true seconds = 60

    time = median(bench.times) / 1e9
    s = std(bench.times) / 1e9
@ -164,10 +166,11 @@ function bench(compute_function, kernel!, inputs, chunk_size)

    med_cpu_chunks = median(getindex.(cpu_gpu_ratio, 1))
    med_gpu_chunks = median(getindex.(cpu_gpu_ratio, 2))
+    mem_estimate = bench.memory

    log("CPU/GPU ratios: $(cpu_gpu_ratio)")

-    return (time, rate, s, med_cpu_chunks, med_gpu_chunks)
+    return (time, rate, s, med_cpu_chunks, med_gpu_chunks, mem_estimate)
 end

 function full_node_bench(process::MetagraphOptimization.AbstractProcessDescription, func, kernel!, chunk_size, inputs)
@ -178,7 +181,7 @@ function full_node_bench(process::MetagraphOptimization.AbstractProcessDescripti
    display.(CUDA.devices())

    log("Benchmarking full node...")
-    (time, rate, s, med_cpu_chunks, med_gpu_chunks) = bench(func, kernel!, inputs, chunk_size)
+    (time, rate, s, med_cpu_chunks, med_gpu_chunks, mem_estimate) = bench(func, kernel!, inputs, chunk_size)
    log(
        "Benchmarking complete with median time $(time), $(med_cpu_chunks) cpu chunks, and $(med_gpu_chunks) gpu chunks.",
    )
@ -196,6 +199,7 @@ function full_node_bench(process::MetagraphOptimization.AbstractProcessDescripti
            :rate => rate,
            :cpu_chunks => med_cpu_chunks,
            :gpu_chunks => med_gpu_chunks,
+            :memory_est => mem_estimate,
        ),
    )

--- a/examples/qed_bench.jl
+++ b/examples/qed_bench.jl
@ -235,7 +235,7 @@ bench_process(process, "warmup", graph, compute_func, kernel!, gen_time, opt_tim

 optimizer = ReductionOptimizer()

-processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke", "ke->kkkkkke"]
+processes = ["ke->ke", "ke->kke", "ke->kkke", "ke->kkkke", "ke->kkkkke"]

 for process_str in processes
    # compton
@ -253,7 +253,7 @@ for process_str in processes
    CSV.write(results_filename, df)
 end

-processes = ["AB->AB", "AB->ABBB", "AB->ABBBBB"]
+processes = ["AB->AB", "AB->ABBB", "AB->ABBBBB", "AB->ABBBBBBB"]

 for process_str in processes
    # AB->AB
--- a/experiments/full_node.sh
+++ b/experiments/full_node.sh
@ -15,11 +15,11 @@ nvidia-smi > results/cuda_gpu_full_node.txt
 lsblk > results/storage_full_node.txt
 lspci > results/pci_full_node.txt

-echo "Initiating julia..."
-julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1    # need current dev version of QEDprocesses
-julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
-julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1        # add requirements for the bench script
-julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"
+#echo "Initiating julia..."
+#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/QEDjl-project/QEDprocesses.jl/")' >> $LOG_FILE 2>&1 || exit 1    # need current dev version of QEDprocesses
+#julia --threads=8 --project=./ -e 'using Pkg; Pkg.instantiate(); Pkg.add(url="https://github.com/AntonReinhard/QEDbase.jl/tree/fix_bs_multiplication")' >> $LOG_FILE 2>&1 || exit 1 # need a specific fix for abs*bs multiplication for gpu
+#julia --threads=8 -e 'using Pkg; Pkg.add("CSV"); Pkg.add("DataFrames"); Pkg.add("CUDA"); Pkg.add("Random"); Pkg.add("BenchmarkTools"); Pkg.add("Dates")' >> $LOG_FILE 2>&1 || exit 1        # add requirements for the bench script
+#julia --project -e 'using CUDA; CUDA.set_runtime_version!(VersionNumber("12.1"))' >> $LOG_FILE 2>&1 || echo "Failed to set CUDA version number"

 echo "Benchmarking Full Node 128 Threads + *GPUs*"
 julia --project -O3 --threads=128 examples/full_node_bench.jl >> $LOG_FILE 2>&1 || echo "-- Something went wrong, check logs --"
--- a/notebooks/reduction.ipynb
+++ b/notebooks/reduction.ipynb