1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
use core::hash::{BuildHasher, Hash, Hasher};
use crate::{
AddressType, SemanticType, StorageType, Type, TypeHash, VectorSize,
features::{AtomicUsage, Features, TypeUsage},
};
use cubecl_common::profile::TimingMethod;
use enumset::EnumSet;
/// Properties of the device related to the accelerator hardware.
///
/// # Plane size min/max
///
/// This is a range of possible values for the plane size.
///
/// For Nvidia GPUs and HIP, this is a single fixed value.
///
/// For wgpu with AMD GPUs this is a range of possible values, but the actual configured value
/// is undefined and can only be queried at runtime. Should usually be 32, but not guaranteed.
///
/// For Intel GPUs, this is variable based on the number of registers used in the kernel. No way to
/// query this at compile time is currently available. As a result, the minimum value should usually
/// be assumed.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct HardwareProperties {
    /// The maximum size of a single load instruction, in bits. Used for optimized vector sizes.
    pub load_width: u32,
    /// The minimum size of a plane on this device.
    pub plane_size_min: u32,
    /// The maximum size of a plane on this device.
    pub plane_size_max: u32,
    /// Maximum number of bindings for a kernel that can be used at once.
    pub max_bindings: u32,
    /// Maximum amount of shared memory, in bytes.
    pub max_shared_memory_size: usize,
    /// Maximum `CubeCount` in x, y and z dimensions.
    pub max_cube_count: (u32, u32, u32),
    /// Maximum number of total units in a cube.
    pub max_units_per_cube: u32,
    /// Maximum `CubeDim` in x, y, and z dimensions.
    pub max_cube_dim: (u32, u32, u32),
    /// Number of streaming multiprocessors (SM), if available.
    pub num_streaming_multiprocessors: Option<u32>,
    /// Number of available parallel cpu units, if the runtime is CPU.
    pub num_cpu_cores: Option<u32>,
    /// Number of tensor cores per SM, if any.
    pub num_tensor_cores: Option<u32>,
    /// The minimum tiling dimension for a single axis in tensor cores.
    ///
    /// For a backend that only supports 16x16x16, the value would be 16.
    /// For a backend that also supports 32x8x16, the value would be 8.
    pub min_tensor_cores_dim: Option<u32>,
    /// Maximum vector size supported by the device.
    pub max_vector_size: VectorSize,
}
/// Properties of the device related to allocation.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct MemoryDeviceProperties {
    /// The maximum number of bytes that can be allocated in one go.
    pub max_page_size: u64,
    /// The required memory offset alignment in bytes.
    pub alignment: u64,
}
/// Properties of what the device can do, like what `Feature`s are
/// supported by it and what its memory properties are.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DeviceProperties {
    /// The features supported by the runtime.
    pub features: Features,
    /// The memory properties of this client.
    pub memory: MemoryDeviceProperties,
    /// The topology properties of this client.
    pub hardware: HardwareProperties,
    /// The method used for profiling on the device.
    pub timing_method: TimingMethod,
}
impl TypeHash for DeviceProperties {
    fn write_hash(_hasher: &mut impl core::hash::Hasher) {
        // Intentionally a no-op: device properties contribute nothing to the
        // type hash. NOTE(review): presumably because they are runtime device
        // state rather than type-level information — confirm against the
        // `TypeHash` trait's contract.
    }
}
impl DeviceProperties {
    /// Create a new set of device properties from the given features, memory
    /// properties, hardware properties, and profiling timing method.
    pub fn new(
        features: Features,
        memory_props: MemoryDeviceProperties,
        hardware: HardwareProperties,
        timing_method: TimingMethod,
    ) -> Self {
        DeviceProperties {
            features,
            memory: memory_props,
            hardware,
            timing_method,
        }
    }
    /// Get the supported usages for a storage type.
    pub fn type_usage(&self, ty: StorageType) -> EnumSet<TypeUsage> {
        self.features.type_usage(ty)
    }
    /// Get the supported atomic usages for a type.
    pub fn atomic_type_usage(&self, ty: Type) -> EnumSet<AtomicUsage> {
        self.features.atomic_type_usage(ty)
    }
    /// Whether the type is supported in any way.
    pub fn supports_type(&self, ty: impl Into<Type>) -> bool {
        self.features.supports_type(ty)
    }
    /// Whether the address type is supported in any way.
    pub fn supports_address(&self, ty: impl Into<AddressType>) -> bool {
        self.features.supports_address(ty)
    }
    /// Register an address type to the features.
    pub fn register_address_type(&mut self, ty: impl Into<AddressType>) {
        self.features.types.address.insert(ty.into());
    }
    /// Register atomic usages for a type to the features.
    ///
    /// Usages accumulate: registering the same type again ORs the new usages
    /// into the existing set.
    pub fn register_atomic_type_usage(&mut self, ty: Type, uses: impl Into<EnumSet<AtomicUsage>>) {
        *self.features.types.atomic.entry(ty).or_default() |= uses.into();
    }
    /// Register usages for a storage type to the features.
    ///
    /// Usages accumulate: registering the same type again ORs the new usages
    /// into the existing set.
    pub fn register_type_usage(
        &mut self,
        ty: impl Into<StorageType>,
        uses: impl Into<EnumSet<TypeUsage>>,
    ) {
        *self.features.types.storage.entry(ty.into()).or_default() |= uses.into();
    }
    /// Register a semantic type to the features.
    pub fn register_semantic_type(&mut self, ty: SemanticType) {
        self.features.types.semantic.insert(ty);
    }
    /// Create a stable hash of all device properties relevant to kernel compilation. Can be used
    /// as a stable checksum for a compilation cache.
    ///
    /// Only `features` and `hardware` are hashed; `memory` and `timing_method`
    /// are deliberately excluded since they do not affect kernel compilation.
    /// Uses `foldhash`'s `FixedState` (fixed seed) so the result is stable
    /// across runs, unlike the default randomly-seeded hasher.
    pub fn checksum(&self) -> u64 {
        let state = foldhash::fast::FixedState::default();
        let mut hasher = state.build_hasher();
        self.features.hash(&mut hasher);
        self.hardware.hash(&mut hasher);
        hasher.finish()
    }
}