Skip to main content

s4_server/
state_loader.rs

1//! v0.8.4 #72 — load manager snapshot files with **per-manager fault
2//! isolation**.
3//!
4//! ## Why
5//!
6//! Pre-#72 each of the nine `--*-state-file` loaders in `main.rs` used
7//! the `from_json(&raw).map_err(|e| format!(...))?` pattern: a single
8//! corrupted, truncated, or schema-incompatible snapshot would bubble
9//! `Err` out of the boot sequence and **kill the gateway start-up**.
10//! The operator was forced to either restore the file from backup or
11//! manually `rm` it before the gateway would even bind its listener —
12//! a loud restart-loop that took the entire data-plane down for one
13//! manager's bad JSON.
14//!
15//! ## What changed
16//!
17//! [`load_or_fresh`] turns the read-side `Err`/parse-side `Err` into:
18//!
19//! 1. a `tracing::warn!` log line carrying the manager name, the
20//!    file path, and the underlying error (operators grep for
21//!    `state file parse failed` in logs);
22//! 2. a bump to the
23//!    `s4_state_file_load_failures_total{manager,reason}` Prometheus
//!    counter (operators alert on `rate(...) > 0` so silent boot-time
25//!    fall-backs surface in dashboards);
26//! 3. a fresh `T::default()` manager — the gateway boots with empty
27//!    in-memory state for the affected manager and the operator's
28//!    snapshot file is **left in place** for post-mortem inspection
29//!    (we never touch the operator's bytes — recovering / re-importing
30//!    is their call).
31//!
32//! Every other manager keeps loading normally. One bad file no longer
33//! cascades into a gateway-wide DoS.
34//!
35//! ## What did NOT change
36//!
37//! - `--mfa-default-secret-file` keeps its **fail-closed** read path.
38//!   A missing or unreadable MFA secret means MFA verification cannot
39//!   succeed; silently booting with no secret would let DELETEs slip
40//!   past the MFA gate. That call site stays inside the MFA loader
41//!   block and continues to surface a hard error.
42//! - The on-disk snapshot is never deleted, renamed, or rewritten by
43//!   the boot path. Operators decide whether to `rm` the bad file or
44//!   restore from a known-good copy.
45
46use std::path::Path;
47
/// Read a `--*-state-file <PATH>` snapshot, returning `Ok(None)` for
/// the three "start fresh" cases and `Ok(Some(json))` for the actual
/// restore-from-snapshot case:
///
/// 1. empty path (`--flag=`)
/// 2. file doesn't exist
/// 3. file exists but is empty / whitespace-only
///
/// The third case would otherwise reach the manager's JSON parser as
/// an empty string and fail with "EOF while parsing", so an operator
/// can `touch` the snapshot path and still get a fresh manager.
///
/// # Errors
///
/// Returns the underlying [`std::io::Error`] for every read failure
/// other than [`std::io::ErrorKind::NotFound`] — e.g. permission
/// denied, the path being a directory, an invalid path, or contents
/// that are not valid UTF-8. Only a genuinely absent file is treated
/// as "start fresh".
pub fn read_state_file_or_fresh(path: &Path) -> Result<Option<String>, std::io::Error> {
    if path.as_os_str().is_empty() {
        return Ok(None);
    }
    // Read first and classify the failure afterwards instead of probing
    // with `Path::exists()`: `exists()` reports `false` for *any*
    // metadata error (permission denied, invalid path, ...), which would
    // disguise a real read problem as "no snapshot", and a separate
    // existence check also races with the file being removed before the
    // read happens.
    match std::fs::read_to_string(path) {
        Ok(raw) if raw.trim().is_empty() => Ok(None),
        Ok(raw) => Ok(Some(raw)),
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(None),
        Err(e) => Err(e),
    }
}
78
79/// v0.8.4 #72: load a manager snapshot with **per-manager graceful
80/// degradation**. See module docs for the contract.
81///
82/// `manager_name` is the static label used in both the `tracing::warn`
83/// log and the `s4_state_file_load_failures_total{manager}` Prometheus
84/// label — keep it short and stable (e.g. `"versioning"`,
85/// `"object_lock"`, `"mfa_delete"`).
86///
87/// `parse` is the manager's `from_json` constructor: a `FnOnce(&str)
88/// -> Result<T, serde_json::Error>` pointer / closure that converts a
89/// snapshot string into the typed manager. On parse failure the
90/// `serde_json::Error` is logged (the operator can grep the file at
91/// `path` for the exact byte offset) and the function returns
92/// `T::default()`.
93///
94/// `T: Default` is enforced because every snapshot-loaded manager in
95/// the gateway has a meaningful "empty in-memory state" — that's
96/// precisely the boot state operators would have hit if they had not
97/// passed `--*-state-file` at all.
98pub fn load_or_fresh<T, F>(manager_name: &'static str, path: &Path, parse: F) -> T
99where
100    T: Default,
101    F: FnOnce(&str) -> Result<T, serde_json::Error>,
102{
103    let raw = match read_state_file_or_fresh(path) {
104        Ok(Some(s)) => s,
105        Ok(None) => {
106            tracing::info!(
107                manager = manager_name,
108                path = %path.display(),
109                "state file missing or empty; starting fresh",
110            );
111            return T::default();
112        }
113        Err(e) => {
114            tracing::warn!(
115                manager = manager_name,
116                path = %path.display(),
117                error = %e,
118                "state file read failed; starting fresh — file left in place for inspection",
119            );
120            crate::metrics::record_state_file_load_failure(manager_name, "read_error");
121            return T::default();
122        }
123    };
124    match parse(&raw) {
125        Ok(mgr) => mgr,
126        Err(e) => {
127            tracing::warn!(
128                manager = manager_name,
129                path = %path.display(),
130                error = %e,
131                "state file parse failed (corrupted JSON); starting fresh — file left in place for inspection",
132            );
133            crate::metrics::record_state_file_load_failure(manager_name, "parse_error");
134            T::default()
135        }
136    }
137}
138
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write as _;

    /// Stand-in for a real snapshot-backed manager: it offers the same
    /// two entry points the loader relies on — a `Default` fresh state
    /// and a `from_json` constructor failing with `serde_json::Error`.
    #[derive(Debug, Default, PartialEq, Eq)]
    struct ToyManager {
        items: Vec<String>,
    }

    impl ToyManager {
        fn from_json(s: &str) -> Result<Self, serde_json::Error> {
            serde_json::from_str::<Vec<String>>(s).map(|items| Self { items })
        }
    }

    /// Create a scratch directory and the path of `file_name` inside
    /// it. The returned `TempDir` guard must stay bound for as long as
    /// the path is in use; dropping it removes the directory.
    fn scratch(file_name: &str) -> (tempfile::TempDir, std::path::PathBuf) {
        let dir = tempfile::tempdir().expect("tempdir");
        let path = dir.path().join(file_name);
        (dir, path)
    }

    #[test]
    fn load_or_fresh_with_valid_json_returns_parsed() {
        let (_dir, path) = scratch("snap.json");
        std::fs::write(&path, r#"["a","b","c"]"#).expect("write");

        let expected = ToyManager {
            items: vec![String::from("a"), String::from("b"), String::from("c")],
        };
        let got: ToyManager = load_or_fresh("toy", &path, ToyManager::from_json);
        assert_eq!(
            got, expected,
            "valid snapshot must round-trip into the typed manager",
        );
    }

    #[test]
    fn load_or_fresh_with_corrupted_json_logs_warn_and_returns_default() {
        // A snapshot cut off mid-object: the parser rejects it, and the
        // loader has to absorb that failure and hand back a default
        // manager rather than an error.
        let (_dir, path) = scratch("snap.json");
        {
            let mut file = std::fs::File::create(&path).expect("create");
            file.write_all(br#"{ "broken json"#).expect("write");
        }
        // Capture the bytes up front so we can prove the loader leaves
        // the operator's file exactly as it found it.
        let pre_bytes = std::fs::read(&path).expect("pre read");

        let got: ToyManager = load_or_fresh("toy", &path, ToyManager::from_json);
        assert_eq!(
            got,
            ToyManager::default(),
            "corrupted snapshot must fall back to T::default(), not propagate Err",
        );

        let post_bytes = std::fs::read(&path).expect("post read");
        assert_eq!(
            pre_bytes, post_bytes,
            "the operator's snapshot bytes MUST be left untouched on parse failure",
        );
    }

    #[test]
    fn load_or_fresh_with_missing_file_returns_default() {
        // Nothing is ever written at this path, so the reader reports
        // Ok(None) and the loader takes its "info! + default" route
        // instead of the "warn! + metric" one.
        let (_dir, path) = scratch("does-not-exist.json");

        let got: ToyManager = load_or_fresh("toy", &path, ToyManager::from_json);
        assert_eq!(
            got,
            ToyManager::default(),
            "missing snapshot must fall back to T::default()",
        );
    }

    #[test]
    fn load_or_fresh_with_empty_file_returns_default() {
        // Whitespace-only content counts as "no snapshot": the reader
        // returns Ok(None), so the parser is never handed a blank
        // string (which it would reject with "EOF while parsing").
        let (_dir, path) = scratch("empty.json");
        std::fs::write(&path, "   \n  \t\n").expect("write");

        let got: ToyManager = load_or_fresh("toy", &path, ToyManager::from_json);
        assert_eq!(got, ToyManager::default());
    }

    #[test]
    fn read_state_file_or_fresh_normalises_empty_path() {
        // An empty `--flag=` value arrives from clap as the path `""`.
        let outcome = read_state_file_or_fresh(Path::new("")).expect("ok");
        assert!(outcome.is_none(), "empty path must surface as Ok(None)");
    }
}