Add cache strategy information to devices
commit 314330f00f (parent dd01a5e691)
@@ -79,7 +79,7 @@ function gen_input_assignment_code(
             # TODO generate correct access expression
             # TODO how to define cahce strategies?
             device = machine.devices[1]
-            evalExpr = eval(gen_access_expr(device, default_strategy(device), symbol))
+            evalExpr = eval(gen_access_expr(device, cache_strategy(device), symbol))
             push!(assignInputs, Meta.parse("$(evalExpr) = ParticleValue($p, 1.0)"))
         end
     end
@@ -102,7 +102,7 @@ function get_compute_function(graph::DAG, process::AbstractProcessDescription, m
     device = machine.devices[1]

     functionId = to_var_name(UUIDs.uuid1(rng[1]))
-    resSym = eval(gen_access_expr(device, default_strategy(device), outputSymbol))
+    resSym = eval(gen_access_expr(device, cache_strategy(device), outputSymbol))
     expr = Meta.parse(
         "function compute_$(functionId)(input::AbstractProcessInput) $assignInputs; $code; return $resSym; end",
     )
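With these two hunks, the generated code resolves cached values through the device's configured cache strategy instead of its type's default. A minimal caller-side sketch of the intended effect (illustrative only: it assumes `graph`, `process`, and `machine` already exist, that the machine is the remaining argument of `get_compute_function`, whose signature is truncated above, and it uses the `set_cache_strategy` function added further down in this commit):

    device = machine.devices[1]
    set_cache_strategy(device, LocalVariables())              # any strategy from strategies(typeof(device))
    compute = get_compute_function(graph, process, machine)   # generated body now uses cache_strategy(device)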
@@ -2,6 +2,7 @@ using CUDA

 mutable struct CUDAGPU <: AbstractGPU
     device::Any # TODO: what's the cuda device type?
+    cacheStrategy::CacheStrategy
     FLOPS::Float64
 end

@@ -9,7 +10,7 @@ push!(DEVICE_TYPES, CUDAGPU)

 CACHE_STRATEGIES[CUDAGPU] = [LocalVariables()]

-default_strategy(::CUDAGPU) = LocalVariables()
+default_strategy(::Type{T}) where {T <: CUDAGPU} = LocalVariables()

 function measure_device!(device::CUDAGPU; verbose::Bool)
     if verbose
@@ -40,7 +41,7 @@ function get_devices(deviceType::Type{T}; verbose::Bool = false) where {T <: CUD
         println("Found $(length(CUDADevices)) CUDA devices")
     end
     for device in CUDADevices
-        push!(devices, CUDAGPU(device, -1))
+        push!(devices, CUDAGPU(device, default_strategy(CUDAGPU), -1))
     end

     return devices
@@ -23,3 +23,22 @@ function strategies(t::Type{T}) where {T <: AbstractDevice}

     return CACHE_STRATEGIES[t]
 end
+
+"""
+    cache_strategy(device::AbstractDevice)
+
+Returns the cache strategy set for this device.
+"""
+function cache_strategy(device::AbstractDevice)
+    return device.cacheStrategy
+end
+
+"""
+    set_cache_strategy(device::AbstractDevice, cacheStrategy::CacheStrategy)
+
+Sets the device's cache strategy. After this call, [`cache_strategy`](@ref) should return `cacheStrategy` on the given device.
+"""
+function set_cache_strategy(device::AbstractDevice, cacheStrategy::CacheStrategy)
+    device.cacheStrategy = cacheStrategy
+    return nothing
+end
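For illustration, a minimal usage sketch of this getter/setter pair. It assumes `NumaNode` devices are available and that `LocalVariables()` appears in `strategies(NumaNode)`, as registered elsewhere in this commit:

    dev = get_devices(NumaNode)[1]              # constructed with default_strategy(NumaNode)
    cache_strategy(dev)                         # returns LocalVariables(), the constructor default
    set_cache_strategy(dev, LocalVariables())   # overwrite with any supported strategy
    cache_strategy(dev)                         # now returns the strategy that was just set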
@@ -1,4 +1,10 @@

+"""
+    AbstractDevice
+
+Abstract base type for every device, like GPUs, CPUs or any other compute devices.
+Every implementation needs to implement various functions and needs a member `cacheStrategy`.
+"""
 abstract type AbstractDevice end

 abstract type AbstractCPU <: AbstractDevice end
@@ -49,9 +55,10 @@ See also: [`strategies`](@ref)
 CACHE_STRATEGIES = Dict{Type, Vector{CacheStrategy}}()

 """
-    default_strategy(device::AbstractDevice)
+    default_strategy(deviceType::Type{T}) where {T <: AbstractDevice}

-Interface function that must be implmented for every subtype of [`AbstractDevice`](@ref). Returns the default [`CacheStrategy`](@ref) to use on the given device.
+Interface function that must be implemented for every subtype of [`AbstractDevice`](@ref). Returns the default [`CacheStrategy`](@ref) to use on the given device type.
+See also: [`cache_strategy`](@ref), [`set_cache_strategy`](@ref)
 """
 function default_strategy end

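Taken together, the interface now expects every device type to carry a `cacheStrategy` member, register its supported strategies, and provide a type-based `default_strategy` method. A hypothetical sketch of a conforming type, mirroring the CUDAGPU, NumaNode, oneAPIGPU and ROCmGPU implementations in this commit (`MyAccelerator` is not part of the commit):

    mutable struct MyAccelerator <: AbstractGPU   # hypothetical example type
        device::Any
        cacheStrategy::CacheStrategy
        FLOPS::Float64
    end

    push!(DEVICE_TYPES, MyAccelerator)
    CACHE_STRATEGIES[MyAccelerator] = [LocalVariables()]
    default_strategy(::Type{T}) where {T <: MyAccelerator} = LocalVariables()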
@@ -3,6 +3,7 @@ using NumaAllocators
 mutable struct NumaNode <: AbstractCPU
     numaId::UInt16
     threads::UInt16
+    cacheStrategy::CacheStrategy
     FLOPS::Float64
 end

@@ -10,7 +11,7 @@ push!(DEVICE_TYPES, NumaNode)

 CACHE_STRATEGIES[NumaNode] = [LocalVariables()]

-default_strategy(::NumaNode) = LocalVariables()
+default_strategy(::Type{T}) where {T <: NumaNode} = LocalVariables()

 function measure_device!(device::NumaNode; verbose::Bool)
     if verbose
@@ -34,7 +35,7 @@ function get_devices(deviceType::Type{T}; verbose::Bool = false) where {T <: Num
         println("Found $(noNumaNodes + 1) NUMA nodes")
     end
     for i in 0:noNumaNodes
-        push!(devices, NumaNode(i, 1, -1))
+        push!(devices, NumaNode(i, 1, default_strategy(NumaNode), -1))
     end

     return devices
@@ -2,6 +2,7 @@ using oneAPI

 mutable struct oneAPIGPU <: AbstractGPU
     device::Any
+    cacheStrategy::CacheStrategy
     FLOPS::Float64
 end

@@ -9,7 +10,7 @@ push!(DEVICE_TYPES, oneAPIGPU)

 CACHE_STRATEGIES[oneAPIGPU] = [LocalVariables()]

-default_strategy(::oneAPIGPU) = LocalVariables()
+default_strategy(::Type{T}) where {T <: oneAPIGPU} = LocalVariables()

 function measure_device!(device::oneAPIGPU; verbose::Bool)
     if verbose
@@ -40,7 +41,7 @@ function get_devices(deviceType::Type{T}; verbose::Bool = false) where {T <: one
         println("Found $(length(oneAPIDevices)) oneAPI devices")
     end
     for device in oneAPIDevices
-        push!(devices, oneAPIGPU(device, -1))
+        push!(devices, oneAPIGPU(device, default_strategy(oneAPIGPU), -1))
     end

     return devices
@@ -2,6 +2,7 @@ using AMDGPU

 mutable struct ROCmGPU <: AbstractGPU
     device::Any
+    cacheStrategy::CacheStrategy
     FLOPS::Float64
 end

@@ -9,7 +10,7 @@ push!(DEVICE_TYPES, ROCmGPU)

 CACHE_STRATEGIES[ROCmGPU] = [LocalVariables()]

-default_strategy(::ROCmGPU) = LocalVariables()
+default_strategy(::Type{T}) where {T <: ROCmGPU} = LocalVariables()

 function measure_device!(device::ROCmGPU; verbose::Bool)
     if verbose
@@ -40,7 +41,7 @@ function get_devices(deviceType::Type{T}; verbose::Bool = false) where {T <: ROC
         println("Found $(length(AMDDevices)) AMD devices")
     end
     for device in AMDDevices
-        push!(devices, ROCmGPU(device, -1))
+        push!(devices, ROCmGPU(device, default_strategy(ROCmGPU), -1))
     end

     return devices
@@ -20,7 +20,7 @@ function get_expression(t::FusedComputeTask, device::AbstractDevice, inExprs::Ve
     expr1 = nothing
     expr2 = nothing

-    cacheStrategy = default_strategy(device)
+    cacheStrategy = cache_strategy(device)

     inExprs1 = Vector()
     for sym in t.t1_inputs
@@ -52,7 +52,7 @@ function get_expression(node::ComputeTaskNode, device::AbstractDevice)
     @assert length(node.children) >= children(node.task) "Node $(node) has too few children for its task: node has $(length(node.children)) versus task has $(children(node.task))\nNode's children: $(getfield.(node.children, :children))"

     # TODO get device from the node
-    cacheStrategy = default_strategy(device)
+    cacheStrategy = cache_strategy(device)

     inExprs = Vector()
     for id in getfield.(node.children, :id)
@@ -74,7 +74,7 @@ function get_expression(node::DataTaskNode, device::AbstractDevice)
     # TODO: do things to transport data from/to gpu, between numa nodes, etc.
     # TODO get device from the node

-    cacheStrategy = default_strategy(device)
+    cacheStrategy = cache_strategy(device)
     inExpr = nothing
     if (length(node.children) == 1)
         inExpr = eval(gen_access_expr(device, cacheStrategy, Symbol(to_var_name(node.children[1].id))))