kueue_lib/
config.rs

1//! Shared config file "config.toml".
2//!
3//! All binary crates share a common config file, which is separated into
4//! groups. The "common_settings" group contains settings related to multiple
5//! crates while "server_settings", "worker_settings", "client_settings", and
6//! "restart_workers" contain settings associated with their respective crates.
7
8use anyhow::{bail, Result};
9use config::{builder::BuilderState, ConfigBuilder};
10use directories::ProjectDirs;
11use rand::{distributions::Alphanumeric, thread_rng, Rng};
12use serde::{Deserialize, Serialize};
13use std::{
14    fs::{create_dir_all, File},
15    io::Write,
16    net::SocketAddr,
17    path::PathBuf, collections::BTreeMap,
18};
19use tokio::net::lookup_host;
20
21/// The Config struct represents the read TOML config file
22/// and holds the settings for all individual crates.
23#[derive(Clone, Serialize, Deserialize, Debug)]
24pub struct Config {
25    /// Common settings shared among all crates.
26    pub common_settings: CommonSettings,
27    /// Settings related to the server crate.
28    pub server_settings: ServerSettings,
29    /// Settings related to the worker crate.
30    pub worker_settings: WorkerSettings,
31    /// Settings related to the client crate.
32    pub client_settings: ClientSettings,
33    /// Setting related to the optional "restart_workers" crate.
34    pub restart_workers: Option<RestartWorkers>,
35    /// Custom global resources defined in the config.
36    pub global_resources: Option<BTreeMap<String, u64>>,
37}
38
39/// Common settings shared among all crates.
40#[derive(Clone, Serialize, Deserialize, Debug)]
41pub struct CommonSettings {
42    /// Shared secret used to authenticate client and worker against the server.
43    pub shared_secret: String,
44    /// Host name (or IP address) of the server, used by client and worker.
45    pub server_name: String,
46    /// Network port used by the server.
47    pub server_port: u16,
48    /// Verbosity level of log messages.
49    /// Options: `trace`, `debug`, `info`, `warn`, and `error`.
50    pub log_level: String,
51}
52
53impl CommonSettings {
54    /// Default common settings.
55    fn default_settings<St: BuilderState>(
56        builder: ConfigBuilder<St>,
57    ) -> Result<ConfigBuilder<St>, config::ConfigError> {
58        // Generate initial random shared secret.
59        let random_secret: String = thread_rng()
60            .sample_iter(&Alphanumeric)
61            .take(64)
62            .map(char::from)
63            .collect();
64
65        // TODO: Raise default levels when more mature.
66        let default_log_level = if cfg!(debug_assertions) {
67            "debug"
68        } else {
69            "info"
70        };
71
72        builder
73            .set_default("common_settings.shared_secret", random_secret)?
74            .set_default("common_settings.server_name", "localhost")?
75            .set_default("common_settings.server_port", 11236)?
76            .set_default("common_settings.log_level", default_log_level)
77    }
78}
79
80/// Settings related to the server crate.
81#[derive(Clone, Serialize, Deserialize, Debug)]
82pub struct ServerSettings {
83    /// Space-separated list of IP addresses to listen on. As long as at least
84    /// one of the given addresses can be bound, the server will keep running.
85    /// Defaults to: `0.0.0.0` (IPv4) and `[::]` (IPv6).
86    pub bind_addresses: String,
87    /// The server performs maintenance every `maintenance_interval_seconds`
88    /// to recover jobs from disconnected workers and clean up finished jobs.
89    pub maintenance_interval_seconds: u64,
90    /// Time in seconds before a worker connection is considered timed-out.
91    pub worker_timeout_seconds: u64,
92    /// Time in seconds before a job offer to a worker is considered timed-out.
93    pub job_offer_timeout_seconds: u64,
94    /// Time in minutes before a finished job is removed from the list of jobs.
95    pub job_cleanup_after_minutes: u64,
96    /// Defines an global upper limit of parallel jobs across all workers. If
97    /// this limit is reached, no more jobs will be started on any worker, even
98    /// if enough other resources would be available.
99    pub global_max_parallel_jobs: u64,
100}
101
102impl ServerSettings {
103    /// Default server settings.
104    fn default_settings<St: BuilderState>(
105        builder: ConfigBuilder<St>,
106    ) -> Result<ConfigBuilder<St>, config::ConfigError> {
107        builder
108            .set_default("server_settings.bind_addresses", "0.0.0.0 [::]")?
109            .set_default("server_settings.maintenance_interval_seconds", 60)?
110            .set_default("server_settings.worker_timeout_seconds", 5 * 60)?
111            .set_default("server_settings.job_offer_timeout_seconds", 60)?
112            .set_default("server_settings.job_cleanup_after_minutes", 48 * 60)?
113            .set_default("server_settings.global_max_parallel_jobs", 100)
114    }
115}
116
117/// Settings related to the worker crate.
118#[derive(Clone, Serialize, Deserialize, Debug)]
119pub struct WorkerSettings {
120    /// The worker sends system update messages regularly to the server. It
121    /// contains information about system load, available resources, etc. This
122    /// is also used as a keep-alive signal to the server and thus should be
123    /// smaller than the server's `worker_timeout_seconds` setting.
124    pub system_update_interval_seconds: u64,
125    /// Defines an absolute upper limit of parallel jobs for this worker. If
126    /// this limit is reached, no more jobs will be started on the worker, even
127    /// if enough other resources would be available.
128    pub worker_max_parallel_jobs: u64,
129    /// When set to `true`, the current system utilization is considered when
130    /// calculating available resources for job scheduling. Available resources
131    /// will be calculated as "total system resources - max(busy resources,
132    /// resources reserved by running jobs)". This setting can be useful for
133    /// shared machines, which are not exclusively used with Kueue. If this is
134    /// set to `false`, current system utilization is ignored and available
135    /// resources are simply calculated as "total resources - resources reserved
136    /// by running jobs".
137    pub dynamic_check_free_resources: bool,
138    /// When calculating the amount of available CPUs based on current system
139    /// occupation, this factor is applied to the measured CPU utilization. For
140    /// instance, with a value of `2.0`, 50% CPU utilization would raise the
141    /// calculated system occupation to 100%, leaving no room for any jobs.
142    /// This setting has no effect if `dynamic_check_free_resources` is `false`.
143    pub dynamic_cpu_load_scale_factor: f64,
144}
145
146impl WorkerSettings {
147    /// Default worker settings.
148    fn default_settings<St: BuilderState>(
149        builder: ConfigBuilder<St>,
150    ) -> Result<ConfigBuilder<St>, config::ConfigError> {
151        builder
152            .set_default("worker_settings.system_update_interval_seconds", 60)?
153            .set_default("worker_settings.worker_max_parallel_jobs", 10)?
154            .set_default("worker_settings.dynamic_check_free_resources", true)?
155            .set_default("worker_settings.dynamic_cpu_load_scale_factor", 1.0)
156    }
157}
158
159/// Settings related to the client crate.
160#[derive(Clone, Serialize, Deserialize, Debug)]
161pub struct ClientSettings {
162    /// Default number of cpu cores a job requires, if not specified.
163    pub job_default_cpus: u64,
164    /// Default amount of RAM memory a job requires, if not specified.
165    pub job_default_ram_mb: u64,
166}
167
168impl ClientSettings {
169    /// Default client settings.
170    fn default_settings<St: BuilderState>(
171        builder: ConfigBuilder<St>,
172    ) -> Result<ConfigBuilder<St>, config::ConfigError> {
173        builder
174            .set_default("client_settings.job_default_cpus", 8)?
175            .set_default("client_settings.job_default_ram_mb", 8 * 1024)
176    }
177}
178
179/// Setting related to the optional "restart_workers" crate.
180#[derive(Clone, Serialize, Deserialize, Debug)]
181pub struct RestartWorkers {
182    /// Username to use to connect to worker machines using SSH.
183    pub ssh_user: String,
184    /// Hostnames or IP addresses of worker machines, separated by space.
185    pub hostnames: String,
186    /// Number of minutes to wait before checking the status of the worker processes again.
187    pub sleep_minutes_before_recheck: Option<f64>,
188}
189
190impl Config {
191    /// Create a new config struct to hold all settings. If available, settings
192    /// are parsed from the given `config_path` or default settings are applied.
193    /// If `config_path` is None, the default path for the config file is used.
194    pub fn new(config_path: Option<PathBuf>) -> Result<Self, config::ConfigError> {
195        let s = config::Config::builder();
196
197        // Set default settings.
198        let s = CommonSettings::default_settings(s)?;
199        let s = ServerSettings::default_settings(s)?;
200        let s = WorkerSettings::default_settings(s)?;
201        let s = ClientSettings::default_settings(s)?;
202
203        // Add config file as source.
204        let config_path = config_path.unwrap_or(default_path());
205        let s = s
206            .add_source(
207                config::File::with_name(config_path.to_string_lossy().as_ref()).required(false),
208            )
209            .build()?;
210
211        // Deserialize into Config.
212        s.try_deserialize()
213    }
214
215    /// If `config_path` does not exist, write the current config with all
216    /// settings and values to the given `config_path`. If `config_path` is
217    /// None, the default path for the config file is used.
218    pub fn create_template(&self, config_path: Option<PathBuf>) -> Result<()> {
219        let config_path = config_path.unwrap_or(default_path());
220        let toml = toml::to_string(&self)?;
221
222        if let Some(config_dir) = config_path.parent() {
223            if !config_dir.is_dir() {
224                create_dir_all(config_dir)?;
225            }
226        }
227
228        if !config_path.is_file() {
229            let mut file = File::create(config_path)?;
230            file.write_all(toml.as_bytes())?;
231        }
232
233        Ok(())
234    }
235
236    /// Get `log::Level` from the config.
237    pub fn get_log_level(&self) -> Result<log::Level> {
238        match self.common_settings.log_level.to_lowercase().as_str() {
239            "trace" => Ok(log::Level::Trace),
240            "debug" => Ok(log::Level::Debug),
241            "info" => Ok(log::Level::Info),
242            "warn" => Ok(log::Level::Warn),
243            "error" => Ok(log::Level::Error),
244            _ => bail!("Log level must be one of: trace, debug, info, warn, error"),
245        }
246    }
247
248    /// Build server address from `server_name` and `server_port`
249    /// and return it as `SocketAddr`.
250    pub async fn get_server_address(&self) -> Result<SocketAddr> {
251        let host = format!(
252            "{}:{}",
253            self.common_settings.server_name, self.common_settings.server_port
254        );
255        let mut addr_iter = lookup_host(host).await?;
256        match addr_iter.next() {
257            Some(socket_address) => Ok(socket_address),
258            None => bail!(
259                "Could not resolve server address: {}",
260                self.common_settings.server_name
261            ),
262        }
263    }
264}
265
266/// Returns the system-specific default path of the config file.
267pub fn default_path() -> PathBuf {
268    let config_file_name = if cfg!(debug_assertions) {
269        "config-devel.toml"
270    } else {
271        "config.toml"
272    };
273
274    if let Some(project_dirs) = ProjectDirs::from("", "", "kueue") {
275        project_dirs.config_dir().join(config_file_name)
276    } else {
277        config_file_name.into()
278    }
279}