kueue_lib/config.rs
//! Shared config file "config.toml" ("config-devel.toml" in debug builds).
//!
//! All binary crates share a common config file, which is separated into
//! groups. The "common_settings" group contains settings related to multiple
//! crates, while "server_settings", "worker_settings", "client_settings", and
//! "restart_workers" contain settings associated with their respective crates.
//! The optional "global_resources" group holds custom global resources.
7
8use anyhow::{bail, Result};
9use config::{builder::BuilderState, ConfigBuilder};
10use directories::ProjectDirs;
11use rand::{distributions::Alphanumeric, thread_rng, Rng};
12use serde::{Deserialize, Serialize};
13use std::{
14 fs::{create_dir_all, File},
15 io::Write,
16 net::SocketAddr,
17 path::PathBuf, collections::BTreeMap,
18};
19use tokio::net::lookup_host;
20
21/// The Config struct represents the read TOML config file
22/// and holds the settings for all individual crates.
23#[derive(Clone, Serialize, Deserialize, Debug)]
24pub struct Config {
25 /// Common settings shared among all crates.
26 pub common_settings: CommonSettings,
27 /// Settings related to the server crate.
28 pub server_settings: ServerSettings,
29 /// Settings related to the worker crate.
30 pub worker_settings: WorkerSettings,
31 /// Settings related to the client crate.
32 pub client_settings: ClientSettings,
33 /// Setting related to the optional "restart_workers" crate.
34 pub restart_workers: Option<RestartWorkers>,
35 /// Custom global resources defined in the config.
36 pub global_resources: Option<BTreeMap<String, u64>>,
37}
38
39/// Common settings shared among all crates.
40#[derive(Clone, Serialize, Deserialize, Debug)]
41pub struct CommonSettings {
42 /// Shared secret used to authenticate client and worker against the server.
43 pub shared_secret: String,
44 /// Host name (or IP address) of the server, used by client and worker.
45 pub server_name: String,
46 /// Network port used by the server.
47 pub server_port: u16,
48 /// Verbosity level of log messages.
49 /// Options: `trace`, `debug`, `info`, `warn`, and `error`.
50 pub log_level: String,
51}
52
53impl CommonSettings {
54 /// Default common settings.
55 fn default_settings<St: BuilderState>(
56 builder: ConfigBuilder<St>,
57 ) -> Result<ConfigBuilder<St>, config::ConfigError> {
58 // Generate initial random shared secret.
59 let random_secret: String = thread_rng()
60 .sample_iter(&Alphanumeric)
61 .take(64)
62 .map(char::from)
63 .collect();
64
65 // TODO: Raise default levels when more mature.
66 let default_log_level = if cfg!(debug_assertions) {
67 "debug"
68 } else {
69 "info"
70 };
71
72 builder
73 .set_default("common_settings.shared_secret", random_secret)?
74 .set_default("common_settings.server_name", "localhost")?
75 .set_default("common_settings.server_port", 11236)?
76 .set_default("common_settings.log_level", default_log_level)
77 }
78}
79
80/// Settings related to the server crate.
81#[derive(Clone, Serialize, Deserialize, Debug)]
82pub struct ServerSettings {
83 /// Space-separated list of IP addresses to listen on. As long as at least
84 /// one of the given addresses can be bound, the server will keep running.
85 /// Defaults to: `0.0.0.0` (IPv4) and `[::]` (IPv6).
86 pub bind_addresses: String,
87 /// The server performs maintenance every `maintenance_interval_seconds`
88 /// to recover jobs from disconnected workers and clean up finished jobs.
89 pub maintenance_interval_seconds: u64,
90 /// Time in seconds before a worker connection is considered timed-out.
91 pub worker_timeout_seconds: u64,
92 /// Time in seconds before a job offer to a worker is considered timed-out.
93 pub job_offer_timeout_seconds: u64,
94 /// Time in minutes before a finished job is removed from the list of jobs.
95 pub job_cleanup_after_minutes: u64,
96 /// Defines an global upper limit of parallel jobs across all workers. If
97 /// this limit is reached, no more jobs will be started on any worker, even
98 /// if enough other resources would be available.
99 pub global_max_parallel_jobs: u64,
100}
101
102impl ServerSettings {
103 /// Default server settings.
104 fn default_settings<St: BuilderState>(
105 builder: ConfigBuilder<St>,
106 ) -> Result<ConfigBuilder<St>, config::ConfigError> {
107 builder
108 .set_default("server_settings.bind_addresses", "0.0.0.0 [::]")?
109 .set_default("server_settings.maintenance_interval_seconds", 60)?
110 .set_default("server_settings.worker_timeout_seconds", 5 * 60)?
111 .set_default("server_settings.job_offer_timeout_seconds", 60)?
112 .set_default("server_settings.job_cleanup_after_minutes", 48 * 60)?
113 .set_default("server_settings.global_max_parallel_jobs", 100)
114 }
115}
116
117/// Settings related to the worker crate.
118#[derive(Clone, Serialize, Deserialize, Debug)]
119pub struct WorkerSettings {
120 /// The worker sends system update messages regularly to the server. It
121 /// contains information about system load, available resources, etc. This
122 /// is also used as a keep-alive signal to the server and thus should be
123 /// smaller than the server's `worker_timeout_seconds` setting.
124 pub system_update_interval_seconds: u64,
125 /// Defines an absolute upper limit of parallel jobs for this worker. If
126 /// this limit is reached, no more jobs will be started on the worker, even
127 /// if enough other resources would be available.
128 pub worker_max_parallel_jobs: u64,
129 /// When set to `true`, the current system utilization is considered when
130 /// calculating available resources for job scheduling. Available resources
131 /// will be calculated as "total system resources - max(busy resources,
132 /// resources reserved by running jobs)". This setting can be useful for
133 /// shared machines, which are not exclusively used with Kueue. If this is
134 /// set to `false`, current system utilization is ignored and available
135 /// resources are simply calculated as "total resources - resources reserved
136 /// by running jobs".
137 pub dynamic_check_free_resources: bool,
138 /// When calculating the amount of available CPUs based on current system
139 /// occupation, this factor is applied to the measured CPU utilization. For
140 /// instance, with a value of `2.0`, 50% CPU utilization would raise the
141 /// calculated system occupation to 100%, leaving no room for any jobs.
142 /// This setting has no effect if `dynamic_check_free_resources` is `false`.
143 pub dynamic_cpu_load_scale_factor: f64,
144}
145
146impl WorkerSettings {
147 /// Default worker settings.
148 fn default_settings<St: BuilderState>(
149 builder: ConfigBuilder<St>,
150 ) -> Result<ConfigBuilder<St>, config::ConfigError> {
151 builder
152 .set_default("worker_settings.system_update_interval_seconds", 60)?
153 .set_default("worker_settings.worker_max_parallel_jobs", 10)?
154 .set_default("worker_settings.dynamic_check_free_resources", true)?
155 .set_default("worker_settings.dynamic_cpu_load_scale_factor", 1.0)
156 }
157}
158
159/// Settings related to the client crate.
160#[derive(Clone, Serialize, Deserialize, Debug)]
161pub struct ClientSettings {
162 /// Default number of cpu cores a job requires, if not specified.
163 pub job_default_cpus: u64,
164 /// Default amount of RAM memory a job requires, if not specified.
165 pub job_default_ram_mb: u64,
166}
167
168impl ClientSettings {
169 /// Default client settings.
170 fn default_settings<St: BuilderState>(
171 builder: ConfigBuilder<St>,
172 ) -> Result<ConfigBuilder<St>, config::ConfigError> {
173 builder
174 .set_default("client_settings.job_default_cpus", 8)?
175 .set_default("client_settings.job_default_ram_mb", 8 * 1024)
176 }
177}
178
179/// Setting related to the optional "restart_workers" crate.
180#[derive(Clone, Serialize, Deserialize, Debug)]
181pub struct RestartWorkers {
182 /// Username to use to connect to worker machines using SSH.
183 pub ssh_user: String,
184 /// Hostnames or IP addresses of worker machines, separated by space.
185 pub hostnames: String,
186 /// Number of minutes to wait before checking the status of the worker processes again.
187 pub sleep_minutes_before_recheck: Option<f64>,
188}
189
190impl Config {
191 /// Create a new config struct to hold all settings. If available, settings
192 /// are parsed from the given `config_path` or default settings are applied.
193 /// If `config_path` is None, the default path for the config file is used.
194 pub fn new(config_path: Option<PathBuf>) -> Result<Self, config::ConfigError> {
195 let s = config::Config::builder();
196
197 // Set default settings.
198 let s = CommonSettings::default_settings(s)?;
199 let s = ServerSettings::default_settings(s)?;
200 let s = WorkerSettings::default_settings(s)?;
201 let s = ClientSettings::default_settings(s)?;
202
203 // Add config file as source.
204 let config_path = config_path.unwrap_or(default_path());
205 let s = s
206 .add_source(
207 config::File::with_name(config_path.to_string_lossy().as_ref()).required(false),
208 )
209 .build()?;
210
211 // Deserialize into Config.
212 s.try_deserialize()
213 }
214
215 /// If `config_path` does not exist, write the current config with all
216 /// settings and values to the given `config_path`. If `config_path` is
217 /// None, the default path for the config file is used.
218 pub fn create_template(&self, config_path: Option<PathBuf>) -> Result<()> {
219 let config_path = config_path.unwrap_or(default_path());
220 let toml = toml::to_string(&self)?;
221
222 if let Some(config_dir) = config_path.parent() {
223 if !config_dir.is_dir() {
224 create_dir_all(config_dir)?;
225 }
226 }
227
228 if !config_path.is_file() {
229 let mut file = File::create(config_path)?;
230 file.write_all(toml.as_bytes())?;
231 }
232
233 Ok(())
234 }
235
236 /// Get `log::Level` from the config.
237 pub fn get_log_level(&self) -> Result<log::Level> {
238 match self.common_settings.log_level.to_lowercase().as_str() {
239 "trace" => Ok(log::Level::Trace),
240 "debug" => Ok(log::Level::Debug),
241 "info" => Ok(log::Level::Info),
242 "warn" => Ok(log::Level::Warn),
243 "error" => Ok(log::Level::Error),
244 _ => bail!("Log level must be one of: trace, debug, info, warn, error"),
245 }
246 }
247
248 /// Build server address from `server_name` and `server_port`
249 /// and return it as `SocketAddr`.
250 pub async fn get_server_address(&self) -> Result<SocketAddr> {
251 let host = format!(
252 "{}:{}",
253 self.common_settings.server_name, self.common_settings.server_port
254 );
255 let mut addr_iter = lookup_host(host).await?;
256 match addr_iter.next() {
257 Some(socket_address) => Ok(socket_address),
258 None => bail!(
259 "Could not resolve server address: {}",
260 self.common_settings.server_name
261 ),
262 }
263 }
264}
265
266/// Returns the system-specific default path of the config file.
267pub fn default_path() -> PathBuf {
268 let config_file_name = if cfg!(debug_assertions) {
269 "config-devel.toml"
270 } else {
271 "config.toml"
272 };
273
274 if let Some(project_dirs) = ProjectDirs::from("", "", "kueue") {
275 project_dirs.config_dir().join(config_file_name)
276 } else {
277 config_file_name.into()
278 }
279}