Start adding device and machine info

This commit is contained in:
2023-09-17 23:06:14 +02:00
parent bd6c54c1ae
commit f8a591991c
5 changed files with 306 additions and 0 deletions

130
src/devices/detect.jl Normal file
View File

@@ -0,0 +1,130 @@
using NumaAllocators
using CUDA
using ROCm
using oneAPI
"""
get_machine_info(verbose::Bool)
Return the [`Machine`](@ref) currently running on. The parameter `verbose` defaults to true when interactive.
"""
function get_machine_info(verbose::Bool = Base.is_interactive())
devices = Vector{Device}()
numaDevices = get_numa_devices(verbose)
push!(devices, numaDevices)
cudaDevices = get_cuda_devices(verbose)
push!(devices, cudaDevices)
rocmDevices = get_rocm_devices(verbose)
push!(devices, rocmDevices)
oneapiDevices = get_oneapi_devices(verbose)
push!(devices, oneapiDevices)
noDevices = length(devices)
@assert noDevices > 0 "No devices were found, but at least one NUMA node should always be available!"
return Machine(
devices,
transferRates::Matrix{Float64}(-1, noDevices, noDevices),
)
end
"""
get_numa_devices(verbose::Bool)
Return a Vector of [`NumaNode`](@ref)s available on the current machine. If `verbose` is true, print some additional information.
"""
function get_numa_devices(verbose::Bool)
devices = Vector{Device}()
noNumaNodes = highest_numa_node()
if (verbose)
println("Found $(noNumaNodes + 1) NUMA nodes")
end
for i in 0:noNumaNodes
push!(devices, NumaNode(i, 1, -1))
end
return devices
end
"""
get_cuda_devices(verbose::Bool)
Return a Vector of [`CUDAGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information.
"""
function get_cuda_devices(verbose::Bool)
devices = Vector{Device}()
if !CUDA.functional()
if verbose
println("CUDA is non-functional")
end
return devices
end
CUDADevices = CUDA.devices()
if verbose
println("Found $(length(CUDADevices)) CUDA devices")
end
for device in CUDADevices
push!(devices, CUDAGPU(device, -1))
end
return devices
end
"""
get_rocm_devices(verbose::Bool)
Return a Vector of [`ROCmGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information.
"""
function get_rocm_devices(verbose::Bool)
devices = Vector{Device}()
if !AMDGPU.functional()
if verbose
println("AMDGPU is non-functional")
end
return devices
end
AMDDevices = AMDGPU.devices()
if verbose
println("Found $(length(AMDDevices)) AMD devices")
end
for device in AMDDevices
push!(devices, ROCmGPU(device, -1))
end
return devices
end
"""
get_oneapi_devices(verbose::Bool)
Return a Vector of [`oneAPIGPU`](@ref)s available on the current machine. If `verbose` is true, print some additional information.
"""
function get_oneapi_devices(verbose::Bool)
devices = Vector{Device}()
if !oneAPI.functional()
if verbose
println("oneAPI is non-functional")
end
return devices
end
oneAPIDevices = oneAPI.devices()
if verbose
println("Found $(length(oneAPIDevices)) oneAPI devices")
end
for device in oneAPIDevices
push!(devices, oneAPIGPU(device, -1))
end
return devices
end

56
src/devices/measure.jl Normal file
View File

@@ -0,0 +1,56 @@
"""
measure_devices(machine::Machine; verbose::Bool)
Measure FLOPS, RAM, cache sizes and what other properties can be extracted for the devices in the given machine.
"""
function measure_devices!(
machine::Machine;
verbose::Bool = Base.is_interactive(),
)
for device in machine.devices
measure_device!(device; verbose = verbose)
end
return nothing
end
"""
measure_transfer_rates(machine::Machine; verbose::Bool)
Measure the transfer rates between devices in the machine.
"""
function measure_transfer_rates!(
machine::Machine;
verbose::Bool = Base.is_interactive(),
)
return nothing
end
# Measurement hook for a NUMA node. For now it only announces the node id when `verbose`
# is set; no property of `device` is changed.
function measure_device!(device::NumaNode; verbose::Bool)
    verbose && println("Measuring Numa Node $(device.numaId)")
    return nothing
end
# Measurement hook for a CUDA GPU. For now it only announces the wrapped device when
# `verbose` is set; no property of `device` is changed.
function measure_device!(device::CUDAGPU; verbose::Bool)
    verbose && println("Measuring CUDA GPU $(device.device)")
    return nothing
end
# Measurement hook for a ROCm GPU. For now it only announces the wrapped device when
# `verbose` is set; no property of `device` is changed.
function measure_device!(device::ROCmGPU; verbose::Bool)
    verbose && println("Measuring ROCm GPU $(device.device)")
    return nothing
end
# Measurement hook for a oneAPI GPU. For now it only announces the wrapped device when
# `verbose` is set; no property of `device` is changed.
function measure_device!(device::oneAPIGPU; verbose::Bool)
    verbose && println("Measuring oneAPI GPU $(device.device)")
    return nothing
end

40
src/devices/type.jl Normal file
View File

@@ -0,0 +1,40 @@
"""
    Device

Abstract supertype of everything that can be stored in the `devices` vector of a
[`Machine`](@ref).
"""
abstract type Device end
"""
    CPU <: Device

Abstract supertype for CPU devices. [`NumaNode`](@ref) is the subtype defined in this file.
"""
abstract type CPU <: Device end
"""
    NumaNode <: CPU

A single NUMA node of the machine.

# Fields
- `numaId`: id of the node; the detection code assigns ids starting at `0`.
- `threads`: presumably the number of threads on this node -- TODO confirm; the detection
  code currently always passes `1`.
- `FLOPS`: floating point performance of the node; the detection code passes `-1` as the
  initial value.
"""
mutable struct NumaNode <: CPU
numaId::UInt16
threads::UInt16
FLOPS::Float64
end
"""
    GPU <: Device

Abstract supertype for GPU devices. [`CUDAGPU`](@ref), [`ROCmGPU`](@ref) and
[`oneAPIGPU`](@ref) are the subtypes defined in this file.
"""
abstract type GPU <: Device end
"""
    CUDAGPU <: GPU

A CUDA GPU of the machine.

# Fields
- `device`: the device object as yielded by `CUDA.devices()`; left untyped for now.
- `FLOPS`: floating point performance of the GPU; the detection code passes `-1` as the
  initial value.
"""
mutable struct CUDAGPU <: GPU
device::Any # TODO: what's the cuda device type?
FLOPS::Float64
end
"""
    ROCmGPU <: GPU

An AMD GPU of the machine.

# Fields
- `device`: the device object as yielded by `AMDGPU.devices()`; left untyped for now.
- `FLOPS`: floating point performance of the GPU; the detection code passes `-1` as the
  initial value.
"""
mutable struct ROCmGPU <: GPU
device::Any
FLOPS::Float64
end
"""
    oneAPIGPU <: GPU

A oneAPI GPU of the machine.

# Fields
- `device`: the device object as yielded by `oneAPI.devices()`; left untyped for now.
- `FLOPS`: floating point performance of the GPU; the detection code passes `-1` as the
  initial value.
"""
mutable struct oneAPIGPU <: GPU
device::Any
FLOPS::Float64
end
"""
Machine
A representation of a machine to execute on. Contains information about its architecture (CPUs, GPUs, maybe more). This representation can be used to make a more accurate cost prediction of a [`DAG`](@ref) state.
See also: [`Scheduler`](@ref)
"""
struct Machine
devices::Vector{Device}
transferRates::Matrix{Float64}
end