use log::debug;
use std::collections::HashMap;
use std::process::Command;
use super::profiles::{HpcPartition, HpcProfile};
#[derive(Debug)]
pub struct SinfoPartition {
pub name: String,
pub cpus: u32,
pub memory_mb: u64,
pub timelimit_secs: u64,
pub gres: Option<String>,
}
#[derive(Debug, Default)]
struct ScontrolPartitionInfo {
min_nodes: Option<u32>,
max_nodes: Option<u32>,
oversubscribe: Option<String>,
default_qos: Option<String>,
}
fn get_sinfo_exec() -> String {
if cfg!(any(test, debug_assertions)) {
std::env::var("TORC_FAKE_SINFO").unwrap_or_else(|_| "sinfo".to_string())
} else {
"sinfo".to_string()
}
}
fn get_scontrol_exec() -> String {
if cfg!(any(test, debug_assertions)) {
std::env::var("TORC_FAKE_SCONTROL").unwrap_or_else(|_| "scontrol".to_string())
} else {
"scontrol".to_string()
}
}
pub fn detect_slurm_profile() -> Option<HpcProfile> {
if Command::new(get_sinfo_exec())
.arg("--version")
.output()
.is_err()
{
return None;
}
match generate_dynamic_slurm_profile(None, None, false) {
Ok(profile) => Some(profile),
Err(e) => {
debug!("Failed to generate dynamic Slurm profile: {}", e);
None
}
}
}
pub fn generate_dynamic_slurm_profile(
name: Option<String>,
display_name: Option<String>,
skip_stdby: bool,
) -> Result<HpcProfile, String> {
let cluster_name = name.unwrap_or_else(|| {
std::env::var("SLURM_CLUSTER_NAME")
.ok()
.or_else(|| {
Command::new(get_scontrol_exec())
.args(["show", "config"])
.output()
.ok()
.and_then(|out| {
String::from_utf8(out.stdout).ok().and_then(|s| {
s.lines()
.find(|l| l.starts_with("ClusterName"))
.and_then(|l| l.split('=').nth(1))
.map(|s| s.trim().to_string())
})
})
})
.unwrap_or_else(|| {
hostname::get()
.map(|h| h.to_string_lossy().to_string())
.unwrap_or_else(|_| "unknown".to_string())
})
});
let final_display_name = if let Some(d) = display_name {
d
} else {
let mut chars = cluster_name.chars();
let capitalized = match chars.next() {
None => cluster_name.clone(),
Some(c) => c.to_uppercase().chain(chars).collect(),
};
format!("{} (Slurm)", capitalized)
};
let sinfo_partitions = parse_sinfo_output()?;
if sinfo_partitions.is_empty() {
return Err("No partitions found. Is Slurm available on this system?".to_string());
}
let mut partition_map: HashMap<String, Vec<&SinfoPartition>> = HashMap::new();
for sp in &sinfo_partitions {
partition_map.entry(sp.name.clone()).or_default().push(sp);
}
let mut partitions = Vec::new();
let mut seen_names: Vec<String> = partition_map.keys().cloned().collect();
seen_names.sort();
for name in seen_names {
if skip_stdby && name.ends_with("-stdby") {
continue;
}
let group = partition_map.get(&name).unwrap();
let scontrol_info = parse_scontrol_partition(&name).unwrap_or_default();
let mut min_cpus = u32::MAX;
let mut min_memory = u64::MAX;
let mut max_walltime = 0u64;
let mut gpus_per_node: Option<u32> = None;
let mut gpu_type: Option<String> = None;
for sp in group {
min_cpus = min_cpus.min(sp.cpus);
min_memory = min_memory.min(sp.memory_mb);
max_walltime = max_walltime.max(sp.timelimit_secs);
let (gp, gt) = parse_gres(&sp.gres);
if let Some(count) = gp {
gpus_per_node = Some(gpus_per_node.map_or(count, |prev| prev.min(count)));
match (&gpu_type, >) {
(None, _) => gpu_type = gt,
(Some(existing), Some(new)) if existing != new => gpu_type = None,
_ => {}
}
}
}
if gpus_per_node.is_none()
&& let Some((inferred_count, inferred_type)) = infer_gpu_from_name(&name)
{
gpus_per_node = Some(inferred_count);
gpu_type = Some(inferred_type);
}
let shared = scontrol_info.oversubscribe.as_ref().is_some_and(|o| {
o.to_lowercase().contains("yes") || o.to_lowercase().contains("force")
}) || name.to_lowercase().contains("shared");
let partition = HpcPartition {
name,
description: String::new(),
cpus_per_node: min_cpus,
memory_mb: min_memory,
max_walltime_secs: max_walltime,
max_nodes: scontrol_info.max_nodes,
max_nodes_per_user: None,
min_nodes: scontrol_info.min_nodes,
gpus_per_node,
gpu_type,
gpu_memory_gb: None,
local_disk_gb: None,
shared,
requires_explicit_request: false,
default_qos: scontrol_info.default_qos.filter(|q| q != "N/A"),
features: vec![],
};
partitions.push(partition);
}
Ok(HpcProfile {
name: cluster_name,
display_name: final_display_name,
description: "Dynamically detected Slurm cluster".to_string(),
detection: vec![], default_account: None,
partitions,
charge_factor_cpu: 1.0,
charge_factor_gpu: 10.0,
metadata: HashMap::new(),
})
}
fn parse_sinfo_output() -> Result<Vec<SinfoPartition>, String> {
let output = Command::new(get_sinfo_exec())
.args(["-e", "-o", "%P|%c|%m|%l|%G|%D", "--noheader"])
.output()
.map_err(|e| format!("Failed to run sinfo: {}", e))?;
if !output.status.success() {
return Err(format!(
"sinfo failed: {}",
String::from_utf8_lossy(&output.stderr)
));
}
let stdout = String::from_utf8_lossy(&output.stdout);
parse_sinfo_string(&stdout)
}
pub fn parse_sinfo_string(input: &str) -> Result<Vec<SinfoPartition>, String> {
let mut partitions = Vec::new();
for line in input.lines() {
let parts: Vec<&str> = line.split('|').collect();
if parts.len() < 6 {
continue;
}
let name = parts[0].trim_end_matches('*').to_string();
let cpus: u32 = parts[1].parse().unwrap_or(1);
let memory_mb: u64 = parts[2].parse().unwrap_or(1024);
let timelimit_secs = parse_slurm_timelimit(parts[3]);
let gres = if parts[4] == "(null)" || parts[4].is_empty() {
None
} else {
Some(parts[4].to_string())
};
partitions.push(SinfoPartition {
name,
cpus,
memory_mb,
timelimit_secs,
gres,
});
}
Ok(partitions)
}
pub fn parse_slurm_timelimit(s: &str) -> u64 {
let s = s.trim();
if s == "infinite" || s == "UNLIMITED" {
return 365 * 24 * 3600; }
let mut days = 0;
let time_part;
if let Some((d_str, t_str)) = s.split_once('-') {
days = d_str.parse().unwrap_or(0);
time_part = t_str;
} else {
time_part = s;
}
let parts: Vec<&str> = time_part.split(':').collect();
let mut hours = 0;
let mut minutes = 0;
let mut seconds = 0;
match parts.len() {
3 => {
hours = parts[0].parse().unwrap_or(0);
minutes = parts[1].parse().unwrap_or(0);
seconds = parts[2].parse().unwrap_or(0);
}
2 => {
minutes = parts[0].parse().unwrap_or(0);
seconds = parts[1].parse().unwrap_or(0);
}
1 => {
minutes = parts[0].parse().unwrap_or(0);
}
_ => {}
}
(days * 24 * 3600) + (hours * 3600) + (minutes * 60) + seconds
}
fn parse_scontrol_partition(name: &str) -> Option<ScontrolPartitionInfo> {
let output = Command::new(get_scontrol_exec())
.args(["show", "partition", name])
.output()
.ok()?;
if !output.status.success() {
return None;
}
let stdout = String::from_utf8_lossy(&output.stdout);
let mut info = ScontrolPartitionInfo::default();
for word in stdout.split_whitespace() {
if let Some((key, value)) = word.split_once('=') {
match key {
"MinNodes" => info.min_nodes = value.parse().ok(),
"MaxNodes" => info.max_nodes = value.parse().ok(),
"OverSubscribe" => info.oversubscribe = Some(value.to_string()),
"QOS" | "QoS" => info.default_qos = Some(value.to_string()),
_ => {}
}
}
}
Some(info)
}
fn infer_gpu_from_name(name: &str) -> Option<(u32, String)> {
let name_lower = name.to_lowercase();
if !name_lower.contains("gpu") {
return None;
}
let gpu_types = [
("h100", "h100", 4),
("a100", "a100", 4),
("v100", "v100", 4),
("a40", "a40", 4),
("a30", "a30", 4),
("l40", "l40", 4),
];
for (pattern, gpu_type, default_count) in gpu_types {
if name_lower.contains(pattern) {
return Some((default_count, gpu_type.to_string()));
}
}
Some((4, "gpu".to_string()))
}
fn parse_gres(gres: &Option<String>) -> (Option<u32>, Option<String>) {
let gres = match gres {
Some(g) => g,
None => return (None, None),
};
for entry in gres.split(',') {
let entry = entry.split('(').next().unwrap_or(entry);
let parts: Vec<&str> = entry.split(':').collect();
if parts.first() != Some(&"gpu") {
continue;
}
match parts.len() {
2 => {
let count: u32 = parts[1].parse().unwrap_or(0);
if count > 0 {
return (Some(count), None);
}
}
3 => {
let gpu_type = parts[1].to_string();
let count: u32 = parts[2].parse().unwrap_or(0);
if count > 0 {
return (Some(count), Some(gpu_type));
}
}
_ => {}
}
}
(None, None)
}
#[derive(Debug, Clone)]
pub struct PartitionAvailability {
pub partition: String,
pub idle: u32,
pub mixed: u32,
pub allocated: u32,
pub down: u32,
pub total: u32,
}
#[derive(Debug, Clone)]
pub struct QueueDepthInfo {
pub partition: String,
pub pending_jobs: u32,
pub pending_nodes: u32,
pub running_jobs: u32,
}
pub fn query_partition_availability(
partition: Option<&str>,
) -> Result<Vec<PartitionAvailability>, String> {
let mut args = vec!["-e", "-o", "%P|%T|%D", "--noheader"];
let partition_arg;
if let Some(p) = partition {
partition_arg = p.to_string();
args.push("-p");
args.push(&partition_arg);
}
let output = Command::new(get_sinfo_exec())
.args(&args)
.output()
.map_err(|e| format!("Failed to run sinfo: {}", e))?;
if !output.status.success() {
return Err(format!(
"sinfo failed: {}",
String::from_utf8_lossy(&output.stderr)
));
}
let stdout = String::from_utf8_lossy(&output.stdout);
parse_partition_availability(&stdout)
}
pub fn parse_partition_availability(input: &str) -> Result<Vec<PartitionAvailability>, String> {
let mut map: HashMap<String, PartitionAvailability> = HashMap::new();
for line in input.lines() {
let line = line.trim();
if line.is_empty() {
continue;
}
let parts: Vec<&str> = line.split('|').collect();
if parts.len() < 3 {
continue;
}
let name = parts[0].trim_end_matches('*').to_string();
let state = parts[1].to_lowercase();
let count: u32 = parts[2].parse().unwrap_or(0);
let entry = map.entry(name.clone()).or_insert(PartitionAvailability {
partition: name,
idle: 0,
mixed: 0,
allocated: 0,
down: 0,
total: 0,
});
entry.total += count;
if state.starts_with("idle") {
entry.idle += count;
} else if state.starts_with("mix") {
entry.mixed += count;
} else if state.starts_with("alloc") {
entry.allocated += count;
} else if state.starts_with("down")
|| state.starts_with("drain")
|| state.starts_with("not_responding")
{
entry.down += count;
}
}
Ok(map.into_values().collect())
}
pub fn query_queue_depth(partition: Option<&str>) -> Result<Vec<QueueDepthInfo>, String> {
let squeue_exec = if cfg!(any(test, debug_assertions)) {
std::env::var("TORC_FAKE_SQUEUE").unwrap_or_else(|_| "squeue".to_string())
} else {
"squeue".to_string()
};
let mut args = vec!["--noheader", "-o", "%P|%T|%D"];
let partition_arg;
if let Some(p) = partition {
partition_arg = p.to_string();
args.push("-p");
args.push(&partition_arg);
}
let output = Command::new(&squeue_exec)
.args(&args)
.output()
.map_err(|e| format!("Failed to run squeue: {}", e))?;
if !output.status.success() {
return Err(format!(
"squeue failed: {}",
String::from_utf8_lossy(&output.stderr)
));
}
let stdout = String::from_utf8_lossy(&output.stdout);
parse_queue_depth(&stdout)
}
pub fn parse_queue_depth(input: &str) -> Result<Vec<QueueDepthInfo>, String> {
let mut map: HashMap<String, QueueDepthInfo> = HashMap::new();
for line in input.lines() {
let line = line.trim();
if line.is_empty() {
continue;
}
let parts: Vec<&str> = line.split('|').collect();
if parts.len() < 3 {
continue;
}
let name = parts[0].trim_end_matches('*').to_string();
let state = parts[1].to_uppercase();
let nodes: u32 = parts[2].parse().unwrap_or(0);
let entry = map.entry(name.clone()).or_insert(QueueDepthInfo {
partition: name,
pending_jobs: 0,
pending_nodes: 0,
running_jobs: 0,
});
if state == "PENDING" || state == "CONFIGURING" {
entry.pending_jobs += 1;
entry.pending_nodes += nodes;
} else if state == "RUNNING" || state == "COMPLETING" {
entry.running_jobs += 1;
}
}
Ok(map.into_values().collect())
}
#[derive(Debug, Clone)]
pub struct SbatchTestResult {
pub estimated_start: Option<chrono::NaiveDateTime>,
pub success: bool,
pub error_message: Option<String>,
pub raw_output: String,
}
fn get_sbatch_exec() -> String {
if cfg!(any(test, debug_assertions)) {
std::env::var("TORC_FAKE_SBATCH").unwrap_or_else(|_| "sbatch".to_string())
} else {
"sbatch".to_string()
}
}
pub fn run_sbatch_test_only(
account: &str,
partition: Option<&str>,
nodes: u32,
walltime: &str,
qos: Option<&str>,
gres: Option<&str>,
) -> SbatchTestResult {
let sbatch = get_sbatch_exec();
let mut cmd = Command::new(&sbatch);
cmd.args([
"--test-only",
"--account",
account,
"--nodes",
&nodes.to_string(),
"--time",
walltime,
"--wrap",
"hostname",
]);
if let Some(p) = partition {
cmd.args(["--partition", p]);
}
if let Some(q) = qos {
cmd.args(["--qos", q]);
}
if let Some(g) = gres {
cmd.args(["--gres", g]);
}
debug!(
"Running sbatch --test-only: account={} partition={} nodes={} walltime={}",
account,
partition.unwrap_or("<default>"),
nodes,
walltime
);
let output = match cmd.output() {
Ok(o) => o,
Err(e) => {
return SbatchTestResult {
estimated_start: None,
success: false,
error_message: Some(format!("Failed to run sbatch: {}", e)),
raw_output: String::new(),
};
}
};
let stdout = String::from_utf8_lossy(&output.stdout).to_string();
let stderr = String::from_utf8_lossy(&output.stderr).to_string();
let combined = format!("{}\n{}", stdout, stderr);
parse_sbatch_test_only(&combined)
}
pub fn parse_sbatch_test_only(output: &str) -> SbatchTestResult {
for line in output.lines() {
let line = line.trim();
if let Some(idx) = line.find("to start at ") {
let after = &line[idx + "to start at ".len()..];
if after.len() >= 19 {
let datetime_str = &after[..19];
if let Ok(dt) =
chrono::NaiveDateTime::parse_from_str(datetime_str, "%Y-%m-%dT%H:%M:%S")
{
return SbatchTestResult {
estimated_start: Some(dt),
success: true,
error_message: None,
raw_output: output.to_string(),
};
}
}
}
if line.contains("error:") || line.contains("Unable to allocate") {
return SbatchTestResult {
estimated_start: None,
success: false,
error_message: Some(line.to_string()),
raw_output: output.to_string(),
};
}
}
SbatchTestResult {
estimated_start: None,
success: false,
error_message: Some("Could not parse sbatch --test-only output".to_string()),
raw_output: output.to_string(),
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_slurm_timelimit() {
assert_eq!(parse_slurm_timelimit("infinite"), 365 * 24 * 3600);
assert_eq!(parse_slurm_timelimit("UNLIMITED"), 365 * 24 * 3600);
assert_eq!(parse_slurm_timelimit("1-00:00:00"), 24 * 3600);
assert_eq!(parse_slurm_timelimit("04:30:00"), 4 * 3600 + 30 * 60);
assert_eq!(parse_slurm_timelimit("30:00"), 30 * 60);
assert_eq!(parse_slurm_timelimit("45"), 45 * 60);
}
#[test]
fn test_parse_gres_simple() {
let (count, gpu_type) = parse_gres(&Some("gpu:4".to_string()));
assert_eq!(count, Some(4));
assert_eq!(gpu_type, None);
}
#[test]
fn test_parse_gres_with_type() {
let (count, gpu_type) = parse_gres(&Some("gpu:a100:2".to_string()));
assert_eq!(count, Some(2));
assert_eq!(gpu_type, Some("a100".to_string()));
}
#[test]
fn test_parse_gres_with_socket_info() {
let (count, gpu_type) = parse_gres(&Some("gpu:h100:2(S:0-3)".to_string()));
assert_eq!(count, Some(2));
assert_eq!(gpu_type, Some("h100".to_string()));
}
#[test]
fn test_parse_gres_multiple() {
let (count, gpu_type) = parse_gres(&Some("nvme:1,gpu:4".to_string()));
assert_eq!(count, Some(4));
assert_eq!(gpu_type, None);
}
#[test]
fn test_parse_gres_none() {
let (count, gpu_type) = parse_gres(&None);
assert_eq!(count, None);
assert_eq!(gpu_type, None);
}
#[test]
fn test_infer_gpu_from_name() {
assert_eq!(
infer_gpu_from_name("gpu-h100"),
Some((4, "h100".to_string()))
);
assert_eq!(
infer_gpu_from_name("standard-gpu"),
Some((4, "gpu".to_string()))
);
assert_eq!(infer_gpu_from_name("compute"), None);
}
#[test]
fn test_parse_partition_availability() {
let input = "\
standard|idle|45
standard|mixed|12
standard|allocated|180
standard|down|3
gpu-h100|idle|2
gpu-h100|mixed|1
gpu-h100|allocated|15
";
let result = parse_partition_availability(input).unwrap();
assert_eq!(result.len(), 2);
let std_part = result.iter().find(|p| p.partition == "standard").unwrap();
assert_eq!(std_part.idle, 45);
assert_eq!(std_part.mixed, 12);
assert_eq!(std_part.allocated, 180);
assert_eq!(std_part.down, 3);
assert_eq!(std_part.total, 240);
let gpu_part = result.iter().find(|p| p.partition == "gpu-h100").unwrap();
assert_eq!(gpu_part.idle, 2);
assert_eq!(gpu_part.mixed, 1);
assert_eq!(gpu_part.allocated, 15);
assert_eq!(gpu_part.total, 18);
}
#[test]
fn test_parse_partition_availability_with_default_marker() {
let input = "standard*|idle|10\nstandard*|allocated|50\n";
let result = parse_partition_availability(input).unwrap();
assert_eq!(result.len(), 1);
assert_eq!(result[0].partition, "standard");
assert_eq!(result[0].idle, 10);
}
#[test]
fn test_parse_queue_depth() {
let input = "\
standard|PENDING|4
standard|PENDING|8
standard|RUNNING|1
standard|RUNNING|1
gpu-h100|PENDING|2
gpu-h100|RUNNING|1
";
let result = parse_queue_depth(input).unwrap();
assert_eq!(result.len(), 2);
let std_q = result.iter().find(|q| q.partition == "standard").unwrap();
assert_eq!(std_q.pending_jobs, 2);
assert_eq!(std_q.pending_nodes, 12);
assert_eq!(std_q.running_jobs, 2);
let gpu_q = result.iter().find(|q| q.partition == "gpu-h100").unwrap();
assert_eq!(gpu_q.pending_jobs, 1);
assert_eq!(gpu_q.pending_nodes, 2);
assert_eq!(gpu_q.running_jobs, 1);
}
#[test]
fn test_parse_queue_depth_empty() {
let result = parse_queue_depth("").unwrap();
assert!(result.is_empty());
}
#[test]
fn test_parse_sbatch_test_only_success() {
let output = "sbatch: Job 12345 to start at 2026-03-17T14:30:00 using 167 processors on nodes node[001-167]";
let result = parse_sbatch_test_only(output);
assert!(result.success);
assert!(result.estimated_start.is_some());
let dt = result.estimated_start.unwrap();
assert_eq!(dt.to_string(), "2026-03-17 14:30:00");
}
#[test]
fn test_parse_sbatch_test_only_simple_format() {
let output = "sbatch: Job 99999 to start at 2026-04-01T08:00:00 on nodes compute-001";
let result = parse_sbatch_test_only(output);
assert!(result.success);
let dt = result.estimated_start.unwrap();
assert_eq!(dt.to_string(), "2026-04-01 08:00:00");
}
#[test]
fn test_parse_sbatch_test_only_error() {
let output = "sbatch: error: Batch job submission failed: Invalid account or account/partition combination specified";
let result = parse_sbatch_test_only(output);
assert!(!result.success);
assert!(result.estimated_start.is_none());
assert!(result.error_message.is_some());
}
#[test]
fn test_parse_sbatch_test_only_unparseable() {
let output = "some unexpected output";
let result = parse_sbatch_test_only(output);
assert!(!result.success);
assert!(result.estimated_start.is_none());
}
#[test]
fn test_parse_sbatch_test_only_multiline() {
let output = "\
sbatch: Pending job allocation 0
sbatch: Job 54321 to start at 2026-03-18T09:15:00 using 4 processors on nodes node[100-103]
";
let result = parse_sbatch_test_only(output);
assert!(result.success);
let dt = result.estimated_start.unwrap();
assert_eq!(dt.to_string(), "2026-03-18 09:15:00");
}
}