treadmill_rs/api/
switchboard_supervisor.rs

1//! Types used in the interface between the coordinator and supervisor
2//! components.
3
4use crate::connector::JobError;
5use crate::image::manifest::ImageId;
6use crate::util::chrono::duration as human_duration;
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use uuid::Uuid;
10
/// String constants shared by both ends of the WebSocket connection.
pub mod websocket {
    /// Protocol identifier string.
    ///
    /// NOTE(review): presumably the WebSocket subprotocol name negotiated
    /// when the connection is established; confirm against the connection
    /// setup code.
    pub static TREADMILL_WEBSOCKET_PROTOCOL: &str = "treadmillv1";

    /// Identifier string for the socket configuration.
    ///
    /// NOTE(review): presumably the key (e.g. a header name) under which a
    /// serialized `SocketConfig` is exchanged; confirm against the
    /// connection setup code.
    pub static TREADMILL_WEBSOCKET_CONFIG: &str = "tml-socket-config";
}
15
16// -- (Configuration)  -----------------------------------------------------------------------------
17
18#[derive(Debug, Clone, Serialize, Deserialize)]
19pub struct SocketConfig {
20    /// PING-PONG keepalive configuration.
21    pub keepalive: KeepaliveConfig,
22}
23
24#[derive(Debug, Clone, Serialize, Deserialize)]
25pub struct KeepaliveConfig {
26    /// How often the switchboard should send PING messages to the supervisor.
27    #[serde(with = "human_duration")]
28    pub ping_interval: chrono::TimeDelta,
29    /// Minimum time without a PONG response before the switchboard closes the connection.
30    #[serde(with = "human_duration")]
31    pub keepalive: chrono::TimeDelta,
32}
33
34// -- StartJobRequest ------------------------------------------------------------------------------
35
36#[derive(Serialize, Deserialize, Clone)]
37pub struct ParameterValue {
38    pub value: String,
39    pub secret: bool,
40}
41
42impl std::fmt::Debug for ParameterValue {
43    /// Custom implementation of [`std::fmt::Debug`] for [`ParameterValue`] to
44    /// avoid leaking secrets in logs:
45    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
46        let mut debug_struct = f.debug_struct("ParameterValue");
47        debug_struct.field("secret", &self.secret);
48
49        // TODO: Requires nightly feature debug_closure_helpers
50        // debug_struct.field_with("value", |f| {
51        //     if self.secret {
52        //         write!(f, "***")
53        //     } else {
54        //         <String as std::fmt::Debug>::fmt(&self.value, f)
55        //     }
56        // });
57
58        // For now, print the secret as if it were a string (with
59        // quotation marks) with contents "***":
60        debug_struct.field("value", if self.secret { &"***" } else { &self.value });
61
62        debug_struct.finish()
63    }
64}
65
66#[derive(Serialize, Deserialize, Debug, Clone)]
67#[serde(tag = "type")]
68#[serde(rename_all = "snake_case")]
69pub enum ImageSpecification {
70    /// Whether to resume a previously started job.
71    ResumeJob { job_id: Uuid },
72
73    /// Which image to base this job off. If the image is not locally cached
74    /// at the supervisor, it will be fetched using its manifest prior to
75    /// executing the job.
76    ///
77    /// Images are content-addressed by the SHA-256 digest of their
78    /// manifest.
79    ///
80    /// Note that if a job is being restarted, it will use this variant.
81    Image { image_id: ImageId },
82}
83#[derive(Serialize, Deserialize, Debug, Clone)]
84#[serde(rename_all = "snake_case")]
85pub struct RestartPolicy {
86    pub remaining_restart_count: usize,
87}
88#[derive(Serialize, Deserialize, Debug, Clone)]
89#[serde(rename_all = "snake_case")]
90pub struct StartJobMessage {
91    /// Unique identifier of the job to be started.
92    ///
93    /// To restart a previously failed or interrupted job, pass the same ID
94    /// in as the old job and set `resume_job = true`. The supervisor may
95    /// refuse to start a job with a re-used ID if `resume_job` is
96    /// deasserted, and must refuse to start a job when it cannot be resumed
97    /// and `resume_job` is asserted.
98    pub job_id: Uuid,
99
100    pub image_spec: ImageSpecification,
101
102    /// The set of initial SSH keys to deploy onto the image.
103    ///
104    /// The image's configuration of the Treadmill puppet daemon determines
105    /// how and whether these keys will be loaded.
106    pub ssh_keys: Vec<String>,
107
108    pub restart_policy: RestartPolicy,
109
110    /// A hash map of parameters provided to this job execution. These
111    /// parameters are provided to the puppet daemon.
112    pub parameters: HashMap<String, ParameterValue>,
113}
114
115// -- StopJobRequest -------------------------------------------------------------------------------
116
117#[derive(Serialize, Deserialize, Debug, Clone)]
118#[serde(rename_all = "snake_case")]
119pub struct StopJobMessage {
120    /// Unique identifier of the job to be stopped:
121    pub job_id: Uuid,
122}
123
124// -- Job/Supervisor Status ------------------------------------------------------------------------
125
126#[derive(Serialize, Deserialize, Debug, Clone)]
127#[serde(rename_all = "snake_case")]
128pub enum JobInitializingStage {
129    /// Generic starting stage, for when no other stage is applicable:
130    Starting,
131
132    /// Fetching the specified image:
133    FetchingImage,
134
135    /// Acquiring resources, such as the root file system, to launch the
136    /// board environment.
137    Allocating,
138
139    /// Provisioning the environment, such as making any changes to the base
140    /// system according to the user-provided customizations.
141    Provisioning,
142
143    /// The host is booting. The next transition should
144    /// either be into the `Ready` or `Failed` states.
145    Booting,
146}
147
148#[derive(Serialize, Deserialize, Debug, Clone)]
149#[serde(tag = "state")]
150#[serde(rename_all = "snake_case")]
151pub enum RunningJobState {
152    Initializing { stage: JobInitializingStage },
153    Ready,
154    // Ready { connection_info: Vec<JobSessionConnectionInfo>, },
155    Terminating,
156    Terminated,
157}
158#[derive(Serialize, Deserialize, Debug, Copy, Clone)]
159#[serde(rename_all = "snake_case")]
160pub enum JobUserExitStatus {
161    Success,
162    Error,
163    Unknown,
164}
165#[derive(Serialize, Deserialize, Debug, Clone)]
166#[serde(tag = "type")]
167#[serde(rename_all = "snake_case")]
168pub enum SupervisorJobEvent {
169    StateTransition {
170        new_state: RunningJobState,
171        status_message: Option<String>,
172    },
173    DeclareExitStatus {
174        user_exit_status: JobUserExitStatus,
175        host_output: Option<String>,
176    },
177    // Technically a state transition
178    Error {
179        error: JobError,
180    },
181    ConsoleLog {
182        console_bytes: Vec<u8>,
183    },
184}
185
186#[derive(Debug, Clone, Serialize, Deserialize)]
187#[serde(tag = "type")]
188#[serde(rename_all = "snake_case")]
189pub enum ReportedSupervisorStatus {
190    OngoingJob {
191        job_id: Uuid,
192        job_state: RunningJobState,
193    },
194    Idle,
195}
196#[derive(Debug, Clone, Serialize, Deserialize)]
197#[serde(tag = "type")]
198#[serde(rename_all = "snake_case")]
199pub enum SupervisorEvent {
200    JobEvent {
201        job_id: Uuid,
202        event: SupervisorJobEvent,
203    },
204}
205
206// -- General Request/Response ---------------------------------------------------------------------
207
208#[derive(Serialize, Deserialize, Debug, Clone)]
209#[serde(rename_all = "snake_case")]
210pub struct Request<T> {
211    pub request_id: Uuid,
212    pub message: T,
213}
214#[derive(Serialize, Deserialize, Debug, Clone)]
215#[serde(rename_all = "snake_case")]
216pub struct Response<T> {
217    pub response_to_request_id: Uuid,
218    pub message: T,
219}
220#[derive(Serialize, Deserialize, Debug, Clone)]
221#[serde(rename_all = "snake_case")]
222#[serde(tag = "type", content = "message")]
223pub enum Message {
224    StartJob(StartJobMessage),
225
226    StopJob(StopJobMessage),
227
228    StatusRequest(Request<()>),
229    StatusResponse(Response<ReportedSupervisorStatus>),
230
231    SupervisorEvent(SupervisorEvent),
232}
233impl Message {
234    pub fn request_id(&self) -> Option<Uuid> {
235        match self {
236            Message::StatusRequest(r) => Some(r.request_id),
237            Message::StartJob(_)
238            | Message::StopJob(_)
239            | Message::StatusResponse(_)
240            | Message::SupervisorEvent(_) => None,
241        }
242    }
243    pub fn to_response_message(self) -> Result<Response<ResponseMessage>, Message> {
244        match self {
245            Message::StatusResponse(Response {
246                response_to_request_id,
247                message,
248            }) => Ok(Response {
249                response_to_request_id,
250                message: ResponseMessage::StatusResponse(message),
251            }),
252            x => Err(x),
253        }
254    }
255}
256#[derive(Debug)]
257#[non_exhaustive]
258pub enum ResponseMessage {
259    StatusResponse(ReportedSupervisorStatus),
260}