Skip to main content

entrenar/gpu/
placement.rs

1//! Job placement algorithm for multi-node adapter training (GPU-SHARE Phase 3, §3.3).
2//!
3//! Scores each node for each adapter job and assigns greedily:
4//!
5//! ```text
//! score = (free_vram / adapter_budget) × gpu_flops_factor × (1 / (1 + active_adapters))
7//! ```
8//!
9//! Where `gpu_flops_factor` normalizes different GPU types:
10//! - RTX 4090: 1.0 (reference)
11//! - Jetson Orin: 0.06 (8 SMs vs 128)
12//! - CPU (Intel): 0.01
13
14use super::cluster::{ClusterConfig, NodeConfig};
15
/// Relative compute throughput of a GPU model, using the RTX 4090 as the 1.0 baseline.
///
/// The lookup is case-insensitive: `gpu_type` is lowercased and compared against a
/// fixed list of aliases. Any name that is not in the list yields `0.5`.
fn gpu_flops_factor(gpu_type: &str) -> f64 {
    /// Alias groups paired with their throughput factor.
    const KNOWN_GPUS: &[(&[&str], f64)] = &[
        (&["rtx-4090", "rtx4090", "geforce-rtx-4090"], 1.0),
        (&["rtx-4080", "rtx4080"], 0.72),
        (&["rtx-3090", "rtx3090"], 0.55),
        (&["rtx-3080", "rtx3080"], 0.45),
        (&["a100", "a100-80gb", "a100-40gb"], 1.2),
        (&["h100", "h100-80gb"], 2.0),
        (&["jetson-orin", "orin"], 0.06),
        (&["jetson-nano", "nano"], 0.02),
    ];
    /// Conservative estimate used for GPU names that are not in the table.
    const UNKNOWN_GPU_FACTOR: f64 = 0.5;

    let wanted = gpu_type.to_lowercase();
    KNOWN_GPUS
        .iter()
        .find(|(aliases, _)| aliases.iter().any(|alias| *alias == wanted.as_str()))
        .map_or(UNKNOWN_GPU_FACTOR, |&(_, factor)| factor)
}
30
/// One adapter training job waiting to be assigned to a cluster node.
#[derive(Debug, Clone)]
pub struct AdapterJob {
    /// Position of this adapter within the adapters configuration.
    pub adapter_idx: usize,
    /// VRAM the job is estimated to need, in megabytes.
    pub budget_mb: u64,
    /// Name for the job intended for human readers.
    pub label: String,
}
41
/// Outcome of assigning a single adapter to a node.
#[derive(Debug, Clone)]
pub struct PlacementDecision {
    /// Index of the adapter that was placed.
    pub adapter_idx: usize,
    /// Name of the node the adapter was assigned to.
    pub node_name: String,
    /// Score the chosen node received for this adapter; larger means a better fit.
    pub score: f64,
}
52
/// Snapshot of how busy a node is, used as input to placement scoring.
///
/// The `Default` value describes an idle node: no adapters and no reserved VRAM.
#[derive(Debug, Clone, Default)]
pub struct NodeLoad {
    /// How many adapters are currently assigned to the node.
    pub active_adapters: usize,
    /// VRAM already set aside on the node, in megabytes.
    pub reserved_vram_mb: u64,
}
61
62/// Place adapter jobs across cluster nodes greedily.
63///
64/// Each adapter is assigned to the highest-scoring eligible node.
65/// Nodes are updated with load after each assignment.
66///
67/// # Returns
68/// A vector of placement decisions. Unplaceable adapters are omitted.
69pub fn place_adapters(
70    cluster: &ClusterConfig,
71    jobs: &[AdapterJob],
72    initial_load: &[NodeLoad],
73) -> Vec<PlacementDecision> {
74    let mut loads: Vec<NodeLoad> = cluster
75        .nodes
76        .iter()
77        .enumerate()
78        .map(|(i, _)| initial_load.get(i).cloned().unwrap_or_default())
79        .collect();
80
81    let mut placements = Vec::new();
82
83    for job in jobs {
84        let best = find_best_node(cluster, job, &loads);
85        if let Some((node_idx, score)) = best {
86            let node = &cluster.nodes[node_idx];
87            placements.push(PlacementDecision {
88                adapter_idx: job.adapter_idx,
89                node_name: node.name.clone(),
90                score,
91            });
92            loads[node_idx].active_adapters += 1;
93            loads[node_idx].reserved_vram_mb += job.budget_mb;
94        }
95    }
96
97    placements
98}
99
100fn find_best_node(
101    cluster: &ClusterConfig,
102    job: &AdapterJob,
103    loads: &[NodeLoad],
104) -> Option<(usize, f64)> {
105    let mut best: Option<(usize, f64)> = None;
106
107    for (i, node) in cluster.nodes.iter().enumerate() {
108        let load = &loads[i];
109
110        // Check adapter capacity
111        if load.active_adapters >= node.max_adapters {
112            continue;
113        }
114
115        let score = score_node(node, job.budget_mb, load);
116        if score <= 0.0 {
117            continue;
118        }
119
120        match best {
121            None => best = Some((i, score)),
122            Some((_, best_score)) if score > best_score => best = Some((i, score)),
123            _ => {}
124        }
125    }
126
127    best
128}
129
130/// Score a node for a given adapter budget.
131///
132/// `score = (free_vram / adapter_budget) × gpu_flops_factor × (1 / current_load)`
133///
134/// Returns 0.0 if the node cannot fit the adapter.
135pub fn score_node(node: &NodeConfig, budget_mb: u64, load: &NodeLoad) -> f64 {
136    if budget_mb == 0 {
137        return 0.0;
138    }
139
140    let free_vram = free_vram_mb(node, load);
141    if free_vram < budget_mb {
142        return 0.0;
143    }
144
145    let vram_ratio = free_vram as f64 / budget_mb as f64;
146    let flops = node_flops_factor(node);
147    let load_factor = 1.0 / (1.0 + load.active_adapters as f64);
148
149    vram_ratio * flops * load_factor
150}
151
152/// Available VRAM on a node after accounting for reserves and current load.
153fn free_vram_mb(node: &NodeConfig, load: &NodeLoad) -> u64 {
154    let usable = node.usable_vram_mb();
155    usable.saturating_sub(load.reserved_vram_mb)
156}
157
158/// Aggregate FLOPS factor for a node (max GPU if multi-GPU).
159fn node_flops_factor(node: &NodeConfig) -> f64 {
160    if node.gpus.is_empty() {
161        return 0.01; // CPU-only
162    }
163    node.gpus.iter().map(|g| gpu_flops_factor(&g.gpu_type)).fold(0.0_f64, f64::max)
164}
165
#[cfg(test)]
mod tests {
    // Tests unwrap freely: a failed unwrap is simply a failed test.
    #![allow(clippy::unwrap_used)]
    use super::*;
    use crate::gpu::cluster::ClusterConfig;

    /// Build the three-node fixture shared by the tests below:
    /// `desktop` (one rtx-4090, up to 3 adapters), `jetson` (one jetson-orin,
    /// up to 1 adapter) and `intel-box` (no GPUs, up to 1 adapter).
    fn test_cluster() -> ClusterConfig {
        ClusterConfig::from_yaml(
            r"
nodes:
  - name: desktop
    host: localhost
    gpus:
      - uuid: GPU-abcd-1234
        type: rtx-4090
        vram_mb: 24564
        memory_type: discrete
    max_adapters: 3
  - name: jetson
    host: jetson.local
    transport: ssh
    gpus:
      - uuid: GPU-efgh-5678
        type: jetson-orin
        vram_mb: 8192
        memory_type: unified
    max_adapters: 1
  - name: intel-box
    host: 10.0.0.5
    transport: ssh
    user: noah
    gpus: []
    cpu_cores: 16
    ram_mb: 65536
    max_adapters: 1
",
        )
        .unwrap()
    }

    /// Known GPU names map to their table values; unknown names fall back to 0.5.
    #[test]
    fn test_gpu_flops_known_types() {
        assert!((gpu_flops_factor("rtx-4090") - 1.0).abs() < f64::EPSILON);
        assert!((gpu_flops_factor("jetson-orin") - 0.06).abs() < f64::EPSILON);
        assert!((gpu_flops_factor("h100") - 2.0).abs() < f64::EPSILON);
        assert!((gpu_flops_factor("unknown-gpu") - 0.5).abs() < f64::EPSILON);
    }

    /// An idle desktop node scored for an 8000 MB adapter lands near 2.6.
    #[test]
    fn test_score_node_desktop() {
        let cluster = test_cluster();
        let desktop = &cluster.nodes[0];
        let load = NodeLoad::default();
        // usable = 24564 * 0.85 = 20879
        // score = (20879 / 8000) * 1.0 * (1 / 1) = 2.609
        let score = score_node(desktop, 8000, &load);
        assert!(score > 2.5);
        assert!(score < 2.7);
    }

    /// A budget larger than the node's usable VRAM scores exactly zero.
    #[test]
    fn test_score_node_insufficient_vram() {
        let cluster = test_cluster();
        let desktop = &cluster.nodes[0];
        let load = NodeLoad::default();
        // Request more VRAM than usable
        let score = score_node(desktop, 25000, &load);
        assert!((score - 0.0).abs() < f64::EPSILON);
    }

    /// Existing load lowers the score through both reduced free VRAM and the
    /// `1 / (1 + active_adapters)` factor.
    #[test]
    fn test_score_node_with_load() {
        let cluster = test_cluster();
        let desktop = &cluster.nodes[0];
        let load = NodeLoad { active_adapters: 1, reserved_vram_mb: 8000 };
        // free = 20879 - 8000 = 12879
        // score = (12879 / 8000) * 1.0 * (1 / 2) = 0.804
        let score = score_node(desktop, 8000, &load);
        assert!(score > 0.7);
        assert!(score < 0.9);
    }

    /// The GPU-less node cannot host an adapter with a non-zero VRAM budget.
    #[test]
    fn test_score_cpu_only_node() {
        let cluster = test_cluster();
        let intel = &cluster.nodes[2];
        let load = NodeLoad::default();
        // CPU-only: 0 VRAM, budget > 0 → score = 0
        let score = score_node(intel, 8000, &load);
        assert!((score - 0.0).abs() < f64::EPSILON);
    }

    /// A single job with no initial load goes to the desktop node.
    #[test]
    fn test_place_single_adapter() {
        let cluster = test_cluster();
        let jobs =
            vec![AdapterJob { adapter_idx: 0, budget_mb: 8000, label: "adapter-0".to_string() }];
        let placements = place_adapters(&cluster, &jobs, &[]);
        assert_eq!(placements.len(), 1);
        assert_eq!(placements[0].node_name, "desktop"); // highest score
        assert_eq!(placements[0].adapter_idx, 0);
    }

    /// Four 6000 MB jobs: three fit on the desktop, the fourth is dropped.
    #[test]
    fn test_place_multiple_adapters_greedy() {
        let cluster = test_cluster();
        let jobs: Vec<AdapterJob> = (0..4)
            .map(|i| AdapterJob { adapter_idx: i, budget_mb: 6000, label: format!("adapter-{i}") })
            .collect();
        let placements = place_adapters(&cluster, &jobs, &[]);

        // Desktop has 3 slots and enough VRAM for 3 × 6 GB
        // Jetson has 1 slot with ~4915 MB usable — too small for 6 GB budget
        // So only 3 should be placed on desktop
        assert_eq!(placements.len(), 3);
        for p in &placements {
            assert_eq!(p.node_name, "desktop");
        }
    }

    /// Four 2000 MB jobs spread over the desktop (3) and the Jetson (1).
    #[test]
    fn test_place_small_adapters_across_nodes() {
        let cluster = test_cluster();
        let jobs: Vec<AdapterJob> = (0..4)
            .map(|i| AdapterJob {
                adapter_idx: i,
                budget_mb: 2000, // small enough for Jetson
                label: format!("adapter-{i}"),
            })
            .collect();
        let placements = place_adapters(&cluster, &jobs, &[]);

        // Desktop: 3 slots, Jetson: 1 slot → all 4 placed
        assert_eq!(placements.len(), 4);
        let desktop_count = placements.iter().filter(|p| p.node_name == "desktop").count();
        let jetson_count = placements.iter().filter(|p| p.node_name == "jetson").count();
        assert_eq!(desktop_count, 3);
        assert_eq!(jetson_count, 1);
    }

    /// A job that exceeds every node's VRAM yields no placements.
    #[test]
    fn test_place_no_capacity() {
        let cluster = test_cluster();
        let jobs = vec![AdapterJob {
            adapter_idx: 0,
            budget_mb: 30000, // too large for any node
            label: "too-big".to_string(),
        }];
        let placements = place_adapters(&cluster, &jobs, &[]);
        assert!(placements.is_empty());
    }

    /// Pre-existing load is honored: a full desktop cannot take another job.
    #[test]
    fn test_place_with_initial_load() {
        let cluster = test_cluster();
        let jobs =
            vec![AdapterJob { adapter_idx: 0, budget_mb: 6000, label: "adapter-0".to_string() }];
        // Desktop already has 3 adapters (full)
        let load = vec![
            NodeLoad { active_adapters: 3, reserved_vram_mb: 18000 },
            NodeLoad::default(),
            NodeLoad::default(),
        ];
        let placements = place_adapters(&cluster, &jobs, &load);
        // Desktop full, Jetson too small for 6 GB → nothing placed
        assert!(placements.is_empty());
    }

    /// With mixed GPUs, the node factor is the maximum of the per-GPU factors.
    #[test]
    fn test_node_flops_factor_multi_gpu() {
        // A hypothetical node with two different GPUs should use the max
        let node = NodeConfig {
            name: "multi".to_string(),
            host: "localhost".to_string(),
            transport: super::super::cluster::Transport::Local,
            user: None,
            gpus: vec![
                super::super::cluster::GpuConfig {
                    uuid: "GPU-1".to_string(),
                    gpu_type: "rtx-3080".to_string(),
                    vram_mb: 10240,
                    memory_type: super::super::cluster::MemoryType::Discrete,
                },
                super::super::cluster::GpuConfig {
                    uuid: "GPU-2".to_string(),
                    gpu_type: "rtx-4090".to_string(),
                    vram_mb: 24564,
                    memory_type: super::super::cluster::MemoryType::Discrete,
                },
            ],
            max_adapters: 4,
            cpu_cores: None,
            ram_mb: None,
        };
        let flops = node_flops_factor(&node);
        assert!((flops - 1.0).abs() < f64::EPSILON); // max of 0.45 and 1.0
    }
}
363}