Start adding device and machine info
This commit is contained in:
130
src/devices/detect.jl
Normal file
130
src/devices/detect.jl
Normal file
@@ -0,0 +1,130 @@
|
||||
using NumaAllocators
|
||||
using CUDA
|
||||
using ROCm
|
||||
using oneAPI
|
||||
|
||||
"""
    get_machine_info(verbose::Bool)

Return the [`Machine`](@ref) currently running on.

Devices are collected in a fixed order: NUMA nodes, CUDA GPUs, ROCm GPUs and oneAPI GPUs.
The `transferRates` of the returned machine is a square matrix with one row and one column
per detected device, with every entry set to `-1`.

The parameter `verbose` is forwarded to the individual detection functions and defaults to
`isinteractive()`.
"""
function get_machine_info(verbose::Bool = Base.isinteractive())
    devices = Vector{Device}()

    # Every detector returns a vector of devices, so its elements are appended;
    # `push!` would try to store the whole vector as one `Device` and fail.
    append!(devices, get_numa_devices(verbose))
    append!(devices, get_cuda_devices(verbose))
    append!(devices, get_rocm_devices(verbose))
    append!(devices, get_oneapi_devices(verbose))

    noDevices = length(devices)
    @assert noDevices > 0 "No devices were found, but at least one NUMA node should always be available!"

    # No rates are known at detection time; -1 is presumably the "not measured yet"
    # placeholder, matching the -1 passed for FLOPS by the detectors.
    transferRates = fill(-1.0, noDevices, noDevices)

    return Machine(devices, transferRates)
end
|
||||
|
||||
"""
    get_numa_devices(verbose::Bool)

Build one [`NumaNode`](@ref) per NUMA node index from `0` up to `highest_numa_node()` and
return them as a `Vector{Device}`. With `verbose` set, the number of nodes is printed first.
"""
function get_numa_devices(verbose::Bool)
    noNumaNodes = highest_numa_node()

    verbose && println("Found $(noNumaNodes + 1) NUMA nodes")

    # Each node is created with thread count 1 and FLOPS -1.
    return Device[NumaNode(nodeId, 1, -1) for nodeId in 0:noNumaNodes]
end
|
||||
|
||||
"""
    get_cuda_devices(verbose::Bool)

Collect the devices reported by `CUDA.devices()` as [`CUDAGPU`](@ref)s and return them in a
`Vector{Device}`. When `CUDA.functional()` is false, the returned vector is empty.
With `verbose` set, a short status line is printed.
"""
function get_cuda_devices(verbose::Bool)
    found = Vector{Device}()

    if CUDA.functional()
        CUDADevices = CUDA.devices()
        verbose && println("Found $(length(CUDADevices)) CUDA devices")

        # Each device is wrapped with FLOPS set to -1.
        foreach(dev -> push!(found, CUDAGPU(dev, -1)), CUDADevices)
    else
        verbose && println("CUDA is non-functional")
    end

    return found
end
|
||||
|
||||
"""
    get_rocm_devices(verbose::Bool)

Collect the devices reported by `AMDGPU.devices()` as [`ROCmGPU`](@ref)s and return them in
a `Vector{Device}`. When `AMDGPU.functional()` is false, the returned vector is empty.
With `verbose` set, a short status line is printed.
"""
function get_rocm_devices(verbose::Bool)
    # NOTE(review): this function uses the `AMDGPU` module, while the import visible at the
    # top of the file is `using ROCm` - confirm that `AMDGPU` is actually in scope.
    found = Vector{Device}()

    if AMDGPU.functional()
        AMDDevices = AMDGPU.devices()
        verbose && println("Found $(length(AMDDevices)) AMD devices")

        # Each device is wrapped with FLOPS set to -1.
        foreach(dev -> push!(found, ROCmGPU(dev, -1)), AMDDevices)
    else
        verbose && println("AMDGPU is non-functional")
    end

    return found
end
|
||||
|
||||
"""
    get_oneapi_devices(verbose::Bool)

Collect the devices reported by `oneAPI.devices()` as [`oneAPIGPU`](@ref)s and return them
in a `Vector{Device}`. When `oneAPI.functional()` is false, the returned vector is empty.
With `verbose` set, a short status line is printed.
"""
function get_oneapi_devices(verbose::Bool)
    found = Vector{Device}()

    if oneAPI.functional()
        oneAPIDevices = oneAPI.devices()
        verbose && println("Found $(length(oneAPIDevices)) oneAPI devices")

        # Each device is wrapped with FLOPS set to -1.
        foreach(dev -> push!(found, oneAPIGPU(dev, -1)), oneAPIDevices)
    else
        verbose && println("oneAPI is non-functional")
    end

    return found
end
|
56
src/devices/measure.jl
Normal file
56
src/devices/measure.jl
Normal file
@@ -0,0 +1,56 @@
|
||||
"""
    measure_devices!(machine::Machine; verbose::Bool)

Measure FLOPS, RAM, cache sizes and what other properties can be extracted for the devices
in the given machine.

This calls `measure_device!` once for every entry of `machine.devices`, forwarding
`verbose`; the actual measuring is left to the device-specific methods. `verbose` defaults
to `isinteractive()`. Returns `nothing`.
"""
function measure_devices!(machine::Machine; verbose::Bool = Base.isinteractive())
    for device in machine.devices
        measure_device!(device; verbose = verbose)
    end

    return nothing
end
|
||||
|
||||
"""
    measure_transfer_rates!(machine::Machine; verbose::Bool)

Measure the transfer rates between devices in the machine.

Currently a placeholder: nothing is measured and `machine` is left untouched. `verbose`
defaults to `isinteractive()`. Returns `nothing`.
"""
function measure_transfer_rates!(machine::Machine; verbose::Bool = Base.isinteractive())
    return nothing
end
|
||||
|
||||
"""
    measure_device!(device::NumaNode; verbose::Bool)

Measurement method for a [`NumaNode`](@ref). For now it only prints the node's `numaId`
when `verbose` is true; `device` is not modified. Returns `nothing`.
"""
function measure_device!(device::NumaNode; verbose::Bool)
    verbose && println("Measuring Numa Node $(device.numaId)")
    return nothing
end
|
||||
|
||||
"""
    measure_device!(device::CUDAGPU; verbose::Bool)

Measurement method for a [`CUDAGPU`](@ref). For now it only prints the wrapped `device`
field when `verbose` is true; `device` is not modified. Returns `nothing`.
"""
function measure_device!(device::CUDAGPU; verbose::Bool)
    verbose && println("Measuring CUDA GPU $(device.device)")
    return nothing
end
|
||||
|
||||
"""
    measure_device!(device::ROCmGPU; verbose::Bool)

Measurement method for a [`ROCmGPU`](@ref). For now it only prints the wrapped `device`
field when `verbose` is true; `device` is not modified. Returns `nothing`.
"""
function measure_device!(device::ROCmGPU; verbose::Bool)
    verbose && println("Measuring ROCm GPU $(device.device)")
    return nothing
end
|
||||
|
||||
"""
    measure_device!(device::oneAPIGPU; verbose::Bool)

Measurement method for a [`oneAPIGPU`](@ref). For now it only prints the wrapped `device`
field when `verbose` is true; `device` is not modified. Returns `nothing`.
"""
function measure_device!(device::oneAPIGPU; verbose::Bool)
    verbose && println("Measuring oneAPI GPU $(device.device)")
    return nothing
end
|
40
src/devices/type.jl
Normal file
40
src/devices/type.jl
Normal file
@@ -0,0 +1,40 @@
|
||||
|
||||
# Root of the device type hierarchy; the abstract types `CPU` and `GPU` subtype it and
# `Machine` stores its devices as a `Vector{Device}`.
abstract type Device end
|
||||
|
||||
# Abstract supertype for CPU-side devices; `NumaNode` is its concrete subtype in this file.
abstract type CPU <: Device end
|
||||
|
||||
"""
    NumaNode <: CPU

A single NUMA node of the machine, treated as a CPU device.

# Fields
- `numaId`: index of the NUMA node.
- `threads`: thread count associated with the node.
- `FLOPS`: floating point performance of the node; presumably a negative value means
  "not measured yet" (TODO confirm).
"""
mutable struct NumaNode <: CPU
    numaId::UInt16
    threads::UInt16
    FLOPS::Float64
end
|
||||
|
||||
# Abstract supertype for GPU devices; `CUDAGPU`, `ROCmGPU` and `oneAPIGPU` subtype it.
abstract type GPU <: Device end
|
||||
|
||||
"""
    CUDAGPU <: GPU

A GPU device handled through CUDA.

# Fields
- `device`: the backend's device handle, stored untyped.
- `FLOPS`: floating point performance of the device; presumably a negative value means
  "not measured yet" (TODO confirm).
"""
mutable struct CUDAGPU <: GPU
    # TODO: what's the cuda device type?
    # NOTE(review): an `Any` field prevents specialisation; narrow it once the type is known.
    device::Any
    FLOPS::Float64
end
|
||||
|
||||
"""
    ROCmGPU <: GPU

A GPU device handled through ROCm.

# Fields
- `device`: the backend's device handle, stored untyped.
- `FLOPS`: floating point performance of the device; presumably a negative value means
  "not measured yet" (TODO confirm).
"""
mutable struct ROCmGPU <: GPU
    # NOTE(review): an `Any` field prevents specialisation; narrow it once the type is known.
    device::Any
    FLOPS::Float64
end
|
||||
|
||||
"""
    oneAPIGPU <: GPU

A GPU device handled through oneAPI.

# Fields
- `device`: the backend's device handle, stored untyped.
- `FLOPS`: floating point performance of the device; presumably a negative value means
  "not measured yet" (TODO confirm).
"""
mutable struct oneAPIGPU <: GPU
    # NOTE(review): an `Any` field prevents specialisation; narrow it once the type is known.
    device::Any
    FLOPS::Float64
end
|
||||
|
||||
"""
    Machine

Description of the machine that code is executed on. It records the machine's architecture
(CPUs, GPUs, possibly more) so that the cost of a [`DAG`](@ref) state can be predicted more
accurately.

# Fields
- `devices`: all devices of the machine.
- `transferRates`: matrix of transfer rates between devices; presumably indexed by the
  positions of the two devices in `devices` (TODO confirm).

See also: [`Scheduler`](@ref)
"""
struct Machine
    devices::Vector{Device}
    transferRates::Matrix{Float64}
end
|
Reference in New Issue
Block a user