gflow 0.4.14

A lightweight, single-node job scheduler written in Rust.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
use super::*;

impl Scheduler {
    /// Compute the scheduling bonus derived from a job's time limit.
    ///
    /// * `None` (no time limit) yields a flat bonus of 100, below every bounded job.
    /// * `Some(limit)` yields a value in `200..=300`: the limit is expressed as a
    ///   fraction of 24 hours (capped at 1.0) and the bonus is
    ///   `200 + (1 - fraction) * 100`, truncated to an integer. Shorter limits
    ///   therefore score higher; limits of 24 hours or more score exactly 200.
    pub fn calculate_time_bonus(time_limit: &Option<Duration>) -> u32 {
        // Reference window used for normalisation: 24 hours, in seconds.
        const DAY_SECS: f64 = 24.0 * 3600.0;

        let Some(limit) = time_limit else {
            return 100;
        };

        // Anything at or beyond the 24-hour window is treated as a full day.
        let fraction_of_day = (limit.as_secs_f64() / DAY_SECS).min(1.0);
        let short_job_bonus = ((1.0 - fraction_of_day) * 100.0) as u32;

        200 + short_job_bonus
    }

    /// Recompute `available_memory_mb` from scratch.
    ///
    /// Adds up `memory_limit_mb` over every job runtime whose state is `Running`
    /// (jobs without a memory limit contribute nothing) and stores
    /// `total_memory_mb` minus that sum, saturating at zero.
    pub fn refresh_available_memory(&mut self) {
        let mut reserved_mb: u64 = 0;

        for runtime in &self.job_runtimes {
            if runtime.state != JobState::Running {
                continue;
            }
            if let Some(limit_mb) = runtime.memory_limit_mb {
                reserved_mb += limit_mb;
            }
        }

        self.available_memory_mb = self.total_memory_mb.saturating_sub(reserved_mb);
    }

    /// Tally GPU usage across all job runtimes currently in the `Running` state.
    ///
    /// Returns a tuple of:
    /// 1. the number of running shared-mode jobs per GPU id,
    /// 2. the set of GPU ids held by running exclusive-mode jobs,
    /// 3. the sum of `gpu_memory_limit_mb` declared by running shared-mode jobs per GPU id.
    ///
    /// Running jobs with no assigned `gpu_ids` are ignored. Shared-mode jobs without a
    /// GPU memory limit count toward occupancy but add nothing to the memory tally.
    fn current_gpu_occupancy(&self) -> (HashMap<u32, usize>, HashSet<u32>, HashMap<u32, u64>) {
        let mut sharers_per_gpu: HashMap<u32, usize> = HashMap::new();
        let mut exclusively_held: HashSet<u32> = HashSet::new();
        let mut shared_mem_mb_per_gpu: HashMap<u32, u64> = HashMap::new();

        for runtime in &self.job_runtimes {
            if runtime.state != JobState::Running {
                continue;
            }
            let Some(assigned) = runtime.gpu_ids.as_ref() else {
                continue;
            };

            match runtime.gpu_sharing_mode {
                GpuSharingMode::Exclusive => {
                    for &gpu in assigned {
                        exclusively_held.insert(gpu);
                    }
                }
                GpuSharingMode::Shared => {
                    let declared_mb = runtime.gpu_memory_limit_mb;
                    for &gpu in assigned {
                        *sharers_per_gpu.entry(gpu).or_default() += 1;
                        if let Some(limit_mb) = declared_mb {
                            *shared_mem_mb_per_gpu.entry(gpu).or_default() += limit_mb;
                        }
                    }
                }
            }
        }

        (sharers_per_gpu, exclusively_held, shared_mem_mb_per_gpu)
    }

    /// Prepare jobs for execution by allocating resources and marking them as Running
    ///
    /// Selection proceeds in four steps:
    /// 1. Queued jobs whose dependencies are satisfied (per
    ///    `are_dependencies_satisfied_split`) are collected.
    /// 2. Candidates are ordered by descending `priority`, then descending time bonus
    ///    (see `calculate_time_bonus`), then ascending job ID.
    /// 3. Each candidate is admitted only if it fits in the remaining host memory, is
    ///    within its group's `max_concurrent` limit, respects active reservations, and
    ///    enough sharing-compatible GPUs remain.
    /// 4. Admitted jobs get `gpu_ids` assigned and are transitioned to Running; if the
    ///    transition is refused, the provisional GPU assignment is rolled back.
    ///
    /// # Warning
    /// This method **mutates scheduler state** by:
    /// - Transitioning jobs from Queued to Running
    /// - Allocating GPU and memory resources
    /// - Setting started_at timestamps (NOTE(review): presumably done inside
    ///   `transition_job_state`; not visible in this function)
    ///
    /// **IMPORTANT**: You MUST either:
    /// 1. Execute the returned jobs (via executor or `execute_jobs_no_lock`)
    /// 2. Handle failures (via `handle_execution_failures`) if execution fails
    ///
    /// Failure to execute will leave jobs stuck in Running state with resources allocated.
    ///
    /// # Returns
    /// Vector of jobs ready to execute with resources already allocated
    ///
    /// # Example
    /// ```ignore
    /// let jobs = scheduler.prepare_jobs_for_execution();
    /// let results = scheduler.execute_jobs_no_lock(&jobs);
    /// scheduler.handle_execution_failures(&results);
    /// ```
    pub fn prepare_jobs_for_execution(&mut self) -> Vec<Job> {
        // Update reservation statuses first
        self.update_reservation_statuses();

        let mut job_ids_to_execute = Vec::new();
        // Snapshot of candidate GPUs taken once; it is not refreshed inside the loop.
        // Allocations made during this pass are tracked in the occupancy maps below.
        let available_gpus = self.get_available_gpu_slots();
        let (mut shared_gpu_occupancy, mut exclusive_gpu_occupancy, mut shared_gpu_memory_usage_mb) =
            self.current_gpu_occupancy();
        // Total memory per GPU index, only for slots that report a total.
        let gpu_total_memory_mb: HashMap<u32, u64> = self
            .gpu_slots
            .values()
            .filter_map(|slot| slot.total_memory_mb.map(|total_mb| (slot.index, total_mb)))
            .collect();

        // Build finished jobs set by iterating only runtimes (hot data)
        let finished_jobs: std::collections::HashSet<u32> = self
            .job_runtimes
            .iter()
            .filter(|rt| rt.state == JobState::Finished)
            .map(|rt| rt.id)
            .collect();

        // Collect and sort runnable jobs - iterate only runtimes (hot path)
        let mut runnable_jobs: Vec<_> = self
            .job_runtimes
            .iter()
            .enumerate()
            .filter(|(_, rt)| rt.state == JobState::Queued)
            .filter(|(idx, _rt)| {
                // Access spec only when needed for dependency check
                let spec = &self.job_specs[*idx];
                Self::are_dependencies_satisfied_split(spec, &finished_jobs)
            })
            .map(|(_idx, rt)| rt.id)
            .collect();

        // Sort by priority - only access runtime fields (hot data)
        // The outer `Reverse` makes higher priority and higher time bonus sort first;
        // the inner `Reverse` on the ID flips that back so lower IDs win ties.
        runnable_jobs.sort_by_key(|job_id| {
            // NOTE(review): assumes job IDs are 1-based and that ID `n` lives at index
            // `n - 1` in `job_runtimes`; an ID of 0 would underflow here — confirm.
            let idx = (*job_id - 1) as usize;
            if let Some(rt) = self.job_runtimes.get(idx) {
                let time_bonus = Self::calculate_time_bonus(&rt.time_limit);
                std::cmp::Reverse((rt.priority, time_bonus, std::cmp::Reverse(rt.id)))
            } else {
                std::cmp::Reverse((0, 0, std::cmp::Reverse(*job_id)))
            }
        });

        // Allocate resources for runnable jobs
        // Local running total so memory granted earlier in this pass is seen by later jobs.
        let mut available_memory = self.available_memory_mb;
        for job_id in runnable_jobs {
            // Same ID-to-index mapping assumption as in the sort key above.
            let idx = (job_id - 1) as usize;

            // First, do immutable checks using only runtime (hot data)
            let (
                has_enough_memory,
                within_group_limit,
                respects_reservations,
                required_memory,
                job_user,
                requested_gpu_count,
                gpu_sharing_mode,
                requested_gpu_memory_mb,
            ) = if let Some(rt) = self.job_runtimes.get(idx) {
                // A job without a memory limit is treated as needing 0 MB.
                let required_memory = rt.memory_limit_mb.unwrap_or(0);
                let has_enough_memory = required_memory <= available_memory;

                // Access spec only for submitted_by (needed for reservation check)
                let job_user = self
                    .job_specs
                    .get(idx)
                    .map(|s| s.submitted_by.clone())
                    .unwrap_or_default();

                // Check if job respects active reservations
                let respects_reservations =
                    self.check_job_respects_reservations(&job_user, rt.gpus, &available_gpus);

                // Check group concurrency limit using runtime data only
                let within_group_limit = if let Some(ref group_id) = rt.group_id {
                    if let Some(max_concurrent) = rt.max_concurrent {
                        // Use O(1) index lookup
                        let running_in_group =
                            self.group_running_count.get(group_id).copied().unwrap_or(0);

                        if running_in_group >= max_concurrent {
                            tracing::debug!(
                                "Job {} waiting: group {} has {}/{} running jobs",
                                rt.id,
                                group_id,
                                running_in_group,
                                max_concurrent
                            );
                            false
                        } else {
                            true
                        }
                    } else {
                        true // No limit specified
                    }
                } else {
                    true // Not part of a group
                };

                (
                    has_enough_memory,
                    within_group_limit,
                    respects_reservations,
                    required_memory,
                    job_user,
                    rt.gpus,
                    rt.gpu_sharing_mode,
                    rt.gpu_memory_limit_mb,
                )
            } else {
                // No runtime at the computed index: skip this ID.
                continue;
            };

            // Now allocate resources if all checks pass
            if has_enough_memory && within_group_limit && respects_reservations {
                // Filter out GPUs that are reserved by other users
                let mut usable_gpus = self.filter_usable_gpus(&job_user, &available_gpus);
                self.reorder_usable_gpus(job_id, &mut usable_gpus);

                // Enforce sharing compatibility:
                // - Shared jobs can use idle or shared-occupied GPUs, but never exclusive-occupied GPUs.
                // - Exclusive jobs can only use fully idle GPUs.
                let compatible_gpus: Vec<u32> = usable_gpus
                    .into_iter()
                    .filter(|gpu| match gpu_sharing_mode {
                        GpuSharingMode::Shared => {
                            if exclusive_gpu_occupancy.contains(gpu) {
                                false
                            } else if let Some(requested_gpu_memory_mb) = requested_gpu_memory_mb {
                                if let Some(total_gpu_memory_mb) = gpu_total_memory_mb.get(gpu) {
                                    // Admit only if already-declared shared usage plus this
                                    // request fits within the GPU's total memory.
                                    let used_memory_mb =
                                        shared_gpu_memory_usage_mb.get(gpu).copied().unwrap_or(0);
                                    used_memory_mb.saturating_add(requested_gpu_memory_mb)
                                        <= *total_gpu_memory_mb
                                } else {
                                    // If total GPU memory is unknown, skip this check.
                                    true
                                }
                            } else {
                                // No GPU memory request declared: nothing to check.
                                true
                            }
                        }
                        GpuSharingMode::Exclusive => {
                            !exclusive_gpu_occupancy.contains(gpu)
                                && shared_gpu_occupancy.get(gpu).copied().unwrap_or(0) == 0
                        }
                    })
                    .collect();
                let has_enough_gpus = requested_gpu_count as usize <= compatible_gpus.len();

                // Not enough compatible GPUs: leave the job queued (no log is emitted here).
                if !has_enough_gpus {
                    continue;
                }

                // Take the first `requested_gpu_count` GPUs in the order produced above.
                let gpus_for_job: GpuIds = compatible_gpus
                    .into_iter()
                    .take(requested_gpu_count as usize)
                    .collect();
                // Provisional assignment: recorded on the runtime before the state
                // transition and undone below if the transition does not happen.
                let mut allocated_gpus = None;
                if let Some(rt) = self.job_runtimes.get_mut(idx) {
                    rt.gpu_ids = Some(gpus_for_job.clone());
                    allocated_gpus = Some(gpus_for_job);
                }

                // Reflect the provisional assignment in the local occupancy maps so later
                // candidates in this pass see these GPUs as taken.
                if let Some(ref allocated) = allocated_gpus {
                    match gpu_sharing_mode {
                        GpuSharingMode::Shared => {
                            for &gpu in allocated {
                                *shared_gpu_occupancy.entry(gpu).or_insert(0) += 1;
                                if let Some(requested_gpu_memory_mb) = requested_gpu_memory_mb {
                                    *shared_gpu_memory_usage_mb.entry(gpu).or_insert(0) +=
                                        requested_gpu_memory_mb;
                                }
                            }
                        }
                        GpuSharingMode::Exclusive => {
                            for &gpu in allocated {
                                exclusive_gpu_occupancy.insert(gpu);
                            }
                        }
                    }
                }

                // Any non-success outcome of the transition is treated as "not transitioned".
                let transitioned = self
                    .transition_job_state(job_id, JobState::Running, None)
                    .unwrap_or(false);

                if transitioned {
                    // Collect job ID instead of cloning immediately
                    job_ids_to_execute.push(job_id);

                    // Update memory tracking after releasing the borrow
                    available_memory = available_memory.saturating_sub(required_memory);
                    self.available_memory_mb =
                        self.available_memory_mb.saturating_sub(required_memory);
                } else {
                    // Roll back provisional GPU allocation if we couldn't transition to Running.
                    if let Some(allocated) = allocated_gpus {
                        match gpu_sharing_mode {
                            GpuSharingMode::Shared => {
                                for gpu in allocated {
                                    if let Some(count) = shared_gpu_occupancy.get_mut(&gpu) {
                                        *count = count.saturating_sub(1);
                                        if *count == 0 {
                                            shared_gpu_occupancy.remove(&gpu);
                                        }
                                    }
                                    if let Some(requested_gpu_memory_mb) = requested_gpu_memory_mb {
                                        if let Some(used_memory_mb) =
                                            shared_gpu_memory_usage_mb.get_mut(&gpu)
                                        {
                                            *used_memory_mb = used_memory_mb
                                                .saturating_sub(requested_gpu_memory_mb);
                                            if *used_memory_mb == 0 {
                                                shared_gpu_memory_usage_mb.remove(&gpu);
                                            }
                                        }
                                    }
                                }
                            }
                            GpuSharingMode::Exclusive => {
                                for gpu in allocated {
                                    exclusive_gpu_occupancy.remove(&gpu);
                                }
                            }
                        }
                    }
                    if let Some(rt) = self.job_runtimes.get_mut(idx) {
                        rt.gpu_ids = None;
                    }
                }
            } else if !has_enough_memory {
                // Only the first failing reason is logged; a group-limit failure was
                // already logged when `within_group_limit` was computed.
                if let Some(rt) = self.job_runtimes.get(idx) {
                    tracing::debug!(
                        "Job {} waiting for memory: needs {}MB, available {}MB",
                        rt.id,
                        required_memory,
                        available_memory
                    );
                }
            } else if !respects_reservations {
                if let Some(rt) = self.job_runtimes.get(idx) {
                    tracing::debug!(
                        "Job {} blocked by active GPU reservations (user: {}, needs {} GPUs)",
                        rt.id,
                        job_user,
                        rt.gpus
                    );
                }
            }
        }

        // Clone jobs only once after all allocations are done
        // IDs for which `get_job` returns `None` are dropped from the result.
        job_ids_to_execute
            .into_iter()
            .filter_map(|id| self.get_job(id))
            .collect()
    }

    /// Phase 2: Execute jobs (call executor - can be done WITHOUT holding lock)
    ///
    /// This is separated so the caller can release locks before doing I/O.
    /// Scheduler state is not modified here (the method takes `&self`); failures
    /// are only reported back to the caller.
    ///
    /// # Returns
    /// One `(job_id, result)` pair per input job, in input order. A failed launch
    /// carries the executor error rendered with `to_string()`. If the scheduler has
    /// no executor configured, a warning is logged and an empty vector is returned.
    pub fn execute_jobs_no_lock(&self, jobs: &[Job]) -> Vec<(u32, Result<(), String>)> {
        // Bind the executor and handle its absence in one step, so no `unwrap` is needed.
        let Some(executor) = self.executor.as_ref() else {
            tracing::warn!("Scheduler has no executor, cannot execute jobs");
            return Vec::new();
        };

        // Exactly one result is produced per job, so the final size is known up front.
        let mut results = Vec::with_capacity(jobs.len());

        for job in jobs {
            match executor.execute(job) {
                Ok(_) => {
                    tracing::info!("Executing job: {job:?}");
                    results.push((job.id, Ok(())));
                }
                Err(e) => {
                    tracing::error!("Failed to execute job {}: {e:?}", job.id);
                    results.push((job.id, Err(e.to_string())));
                }
            }
        }

        results
    }

    /// Mark jobs whose launch failed as `Failed` and give back their memory.
    ///
    /// Intended to be called WITH the scheduler lock held, after
    /// `execute_jobs_no_lock`. For every `Err` entry in `results`:
    /// - the job's `gpu_ids` are cleared,
    /// - a transition to `JobState::Failed` is attempted (its outcome is ignored),
    /// - if `gpu_ids` had been set, the job's `memory_limit_mb` (0 when absent) is
    ///   added back to `available_memory_mb`, saturating on overflow.
    ///
    /// Entries for which no job runtime can be found are skipped. `Ok` entries are
    /// left untouched.
    pub fn handle_execution_failures(&mut self, results: &[(u32, Result<(), String>)]) {
        let failed_ids = results
            .iter()
            .filter(|(_, outcome)| outcome.is_err())
            .map(|(job_id, _)| *job_id);

        for job_id in failed_ids {
            let Some(runtime) = self.get_job_runtime_mut(job_id) else {
                continue;
            };
            // A set `gpu_ids` is used as the marker that resources were allocated.
            let held_gpus = runtime.gpu_ids.take().is_some();
            let reserved_mb = runtime.memory_limit_mb.unwrap_or(0);

            // Resources are released below regardless of whether this transition succeeds.
            self.transition_job_state(job_id, JobState::Failed, None);

            if held_gpus {
                // Only memory is handed back here; GPUs are reclaimed on the next refresh cycle.
                self.available_memory_mb = self.available_memory_mb.saturating_add(reserved_mb);
            }
        }
    }

    /// Legacy one-shot entry point kept for backward compatibility.
    ///
    /// Runs the phases back to back: `prepare_jobs_for_execution`,
    /// `execute_jobs_no_lock`, then `handle_execution_failures`, and returns the
    /// per-job execution results. When no executor is configured, nothing is
    /// prepared (so no state is mutated), a warning is logged, and an empty vector
    /// is returned.
    #[deprecated(
        note = "Use prepare_jobs_for_execution + execute_jobs_no_lock for better performance"
    )]
    pub fn schedule_jobs(&mut self) -> Vec<(u32, Result<(), String>)> {
        if self.executor.is_some() {
            let prepared = self.prepare_jobs_for_execution();
            let outcomes = self.execute_jobs_no_lock(&prepared);
            self.handle_execution_failures(&outcomes);
            outcomes
        } else {
            // Checked before preparing so that no job is moved to Running without an executor.
            tracing::warn!("Scheduler has no executor, cannot schedule jobs");
            Vec::new()
        }
    }

    /// Replace the scheduler's GPU slot table with `new_slots`.
    ///
    /// The previous map is overwritten wholesale; nothing is merged with the
    /// existing entries.
    pub fn update_gpu_slots(&mut self, new_slots: HashMap<GpuUuid, GPUSlot>) {
        self.gpu_slots = new_slots;
    }

    /// Set the total memory (in MB) and recompute the available memory.
    ///
    /// Available memory is re-derived via `refresh_available_memory` after the new
    /// total has been stored.
    pub fn update_memory(&mut self, total_memory_mb: u64) {
        self.total_memory_mb = total_memory_mb;
        // Must run after the assignment so the refresh sees the new total.
        self.refresh_available_memory();
    }

    /// Get a mutable reference to the GPU slot map for external modification.
    pub fn gpu_slots_mut(&mut self) -> &mut HashMap<GpuUuid, GPUSlot> {
        &mut self.gpu_slots
    }

    /// Borrow the scheduler's configured state path.
    ///
    /// NOTE(review): presumably the location where scheduler state is persisted —
    /// confirm against the code that reads or writes it.
    pub fn state_path(&self) -> &PathBuf {
        &self.state_path
    }

    /// Return the current value of the `next_job_id` counter.
    ///
    /// This is a plain read; the counter is not advanced.
    pub fn next_job_id(&self) -> u32 {
        self.next_job_id
    }

    /// Return the total memory known to the scheduler, in MB.
    pub fn total_memory_mb(&self) -> u64 {
        self.total_memory_mb
    }

    /// Return the memory currently tracked as available, in MB.
    ///
    /// This is the stored value; it is not recomputed on read.
    pub fn available_memory_mb(&self) -> u64 {
        self.available_memory_mb
    }

    /// Overwrite the `next_job_id` counter with `id`.
    ///
    /// No validation is performed against the IDs of existing jobs.
    pub fn set_next_job_id(&mut self, id: u32) {
        self.next_job_id = id;
    }

    /// Rebuild user jobs index from current jobs
    /// Should be called after loading state from disk
    pub fn rebuild_user_jobs_index(&mut self) {
        self.user_jobs_index.clear();
        self.state_jobs_index.clear();
        self.project_jobs_index.clear();
        self.dependency_graph.clear();
        self.group_running_count.clear();

        self.check_invariant();

        for (idx, spec) in self.job_specs.iter().enumerate() {
            let rt = &self.job_runtimes[idx];

            // Rebuild user index.
            self.user_jobs_index
                .entry(spec.submitted_by.clone())
                .or_default()
                .push(rt.id);

            // Rebuild state index.
            self.state_jobs_index
                .entry(rt.state)
                .or_default()
                .push(rt.id);

            // Rebuild project index.
            if let Some(ref project) = spec.project {
                self.project_jobs_index
                    .entry(project.clone())
                    .or_default()
                    .push(rt.id);
            }

            // Rebuild dependency graph.
            if spec.depends_on.is_some() || !spec.depends_on_ids.is_empty() {
                let mut deps: Vec<u32> = spec.depends_on_ids.iter().copied().collect();
                if let Some(dep) = spec.depends_on {
                    if !deps.contains(&dep) {
                        deps.push(dep);
                    }
                }
                self.dependency_graph.insert(rt.id, deps);
            }

            // Rebuild group running count index.
            if rt.state == JobState::Running {
                if let Some(group_id) = rt.group_id {
                    *self.group_running_count.entry(group_id).or_insert(0) += 1;
                }
            }
        }
    }

    /// Look up the IDs of all jobs currently indexed under `state`.
    ///
    /// Returns `None` when the state index has no entry for `state`. IDs are
    /// returned in index order (presumably ascending — confirm against the code
    /// that maintains `state_jobs_index`).
    ///
    /// This is primarily intended for API/query paths to avoid scanning all jobs.
    pub fn job_ids_by_state(&self, state: JobState) -> Option<&[u32]> {
        let ids = self.state_jobs_index.get(&state)?;
        Some(ids.as_slice())
    }

    /// Count jobs per state, for monitoring.
    ///
    /// Scans every job runtime; states with no jobs are absent from the returned map.
    pub fn get_job_counts_by_state(&self) -> std::collections::HashMap<JobState, usize> {
        let empty: std::collections::HashMap<JobState, usize> = std::collections::HashMap::new();

        self.job_runtimes.iter().fold(empty, |mut tally, runtime| {
            *tally.entry(runtime.state).or_default() += 1;
            tally
        })
    }

    /// Collect all jobs submitted by `username`.
    ///
    /// Uses the per-user index, so the work is proportional to that user's job
    /// count rather than the total number of jobs. Returns an empty vector for an
    /// unknown user; indexed IDs for which `get_job` returns `None` are skipped.
    pub fn get_jobs_by_user(&self, username: &str) -> Vec<Job> {
        match self.user_jobs_index.get(username) {
            Some(owned_ids) => owned_ids
                .iter()
                .filter_map(|&id| self.get_job(id))
                .collect(),
            None => Vec::new(),
        }
    }

    /// Look up the IDs of all jobs indexed under `username`.
    ///
    /// Returns `None` when the user index has no entry for `username`. IDs are
    /// returned in index order (presumably ascending — confirm against the code
    /// that maintains `user_jobs_index`).
    ///
    /// This is primarily intended for API/query paths to avoid scanning all jobs.
    pub fn job_ids_by_user(&self, username: &str) -> Option<&[u32]> {
        let ids = self.user_jobs_index.get(username)?;
        Some(ids.as_slice())
    }

    // ===== GPU Reservation Methods =====
}