s4_server/state_loader.rs
//! v0.8.4 #72 — load manager snapshot files with **per-manager fault
//! isolation**.
//!
//! ## Why
//!
//! Pre-#72 each of the nine `--*-state-file` loaders in `main.rs` used
//! the `from_json(&raw).map_err(|e| format!(...))?` pattern: a single
//! corrupted, truncated, or schema-incompatible snapshot would bubble
//! `Err` out of the boot sequence and **kill the gateway start-up**.
//! The operator was forced to either restore the file from backup or
//! manually `rm` it before the gateway would even bind its listener —
//! a loud restart-loop that took the entire data-plane down for one
//! manager's bad JSON.
//!
//! ## What changed
//!
//! [`load_or_fresh`] turns the read-side `Err`/parse-side `Err` into:
//!
//! 1. a `tracing::warn!` log line carrying the manager name, the
//!    file path, and the underlying error (operators grep for
//!    `state file read failed` or `state file parse failed` in logs);
//! 2. a bump to the
//!    `s4_state_file_load_failures_total{manager,reason}` Prometheus
//!    counter (operators alert on `rate(...) > 0` so silent boot-time
//!    fall-backs surface in dashboards);
//! 3. a fresh `T::default()` manager — the gateway boots with empty
//!    in-memory state for the affected manager and the operator's
//!    snapshot file is **left in place** for post-mortem inspection
//!    (we never touch the operator's bytes — recovering / re-importing
//!    is their call).
//!
//! Every other manager keeps loading normally. One bad file no longer
//! cascades into a gateway-wide DoS.
//!
//! ## What did NOT change
//!
//! - `--mfa-default-secret-file` keeps its **fail-closed** read path.
//!   A missing or unreadable MFA secret means MFA verification cannot
//!   succeed; silently booting with no secret would let DELETEs slip
//!   past the MFA gate. That call site stays inside the MFA loader
//!   block and continues to surface a hard error.
//! - The on-disk snapshot is never deleted, renamed, or rewritten by
//!   the boot path. Operators decide whether to `rm` the bad file or
//!   restore from a known-good copy.
45
46use std::path::Path;
47
/// Read a `--*-state-file <PATH>` snapshot, returning `Ok(None)` for
/// the three "start fresh" cases and `Ok(Some(json))` for the actual
/// restore-from-snapshot case:
///
/// 1. empty path (`--flag=`)
/// 2. file doesn't exist
/// 3. file exists but is empty / whitespace-only
///
/// The third case used to surface as a `from_json("")` parse error
/// ("EOF while parsing"), which forced operators to hand-write a
/// non-trivial empty-snapshot JSON before the manager would attach.
/// `touch /tmp/foo.json && --flag /tmp/foo.json` is now equivalent to
/// "fresh manager, dump snapshots back here" once the SIGUSR1 hook
/// lands.
///
/// The file is read in a single step and only
/// [`std::io::ErrorKind::NotFound`] is folded into `Ok(None)`. There is
/// deliberately no `Path::exists()` pre-check: that call reports
/// `false` for *any* metadata failure (e.g. permission denied), which
/// would disguise an inaccessible snapshot as "missing", and it leaves
/// a window in which the file can disappear between the check and the
/// read.
///
/// Originally lived in `main.rs` as a binary-private helper (v0.7
/// dogfood follow-up); promoted to the library crate in v0.8.4 #72 so
/// [`load_or_fresh`] can compose it without forcing main.rs to
/// re-export.
///
/// # Errors
///
/// Returns the underlying [`std::io::Error`] when the path is
/// non-empty and reading it fails for any reason other than the file
/// not existing (permission denied, path is a directory, contents are
/// not valid UTF-8, ...).
pub fn read_state_file_or_fresh(path: &Path) -> Result<Option<String>, std::io::Error> {
    // `--flag=` yields an empty path; treat it as "no snapshot
    // configured" without touching the filesystem.
    if path.as_os_str().is_empty() {
        return Ok(None);
    }
    match std::fs::read_to_string(path) {
        // A `touch`ed or whitespace-only file means "start fresh".
        Ok(raw) if raw.trim().is_empty() => Ok(None),
        Ok(raw) => Ok(Some(raw)),
        // A missing file is the normal first-boot case, not a failure.
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(None),
        // Everything else is a real read failure the caller must see.
        Err(e) => Err(e),
    }
}
78
79/// v0.8.4 #72: load a manager snapshot with **per-manager graceful
80/// degradation**. See module docs for the contract.
81///
82/// `manager_name` is the static label used in both the `tracing::warn`
83/// log and the `s4_state_file_load_failures_total{manager}` Prometheus
84/// label — keep it short and stable (e.g. `"versioning"`,
85/// `"object_lock"`, `"mfa_delete"`).
86///
87/// `parse` is the manager's `from_json` constructor: a `FnOnce(&str)
88/// -> Result<T, serde_json::Error>` pointer / closure that converts a
89/// snapshot string into the typed manager. On parse failure the
90/// `serde_json::Error` is logged (the operator can grep the file at
91/// `path` for the exact byte offset) and the function returns
92/// `T::default()`.
93///
94/// `T: Default` is enforced because every snapshot-loaded manager in
95/// the gateway has a meaningful "empty in-memory state" — that's
96/// precisely the boot state operators would have hit if they had not
97/// passed `--*-state-file` at all.
98pub fn load_or_fresh<T, F>(manager_name: &'static str, path: &Path, parse: F) -> T
99where
100 T: Default,
101 F: FnOnce(&str) -> Result<T, serde_json::Error>,
102{
103 let raw = match read_state_file_or_fresh(path) {
104 Ok(Some(s)) => s,
105 Ok(None) => {
106 tracing::info!(
107 manager = manager_name,
108 path = %path.display(),
109 "state file missing or empty; starting fresh",
110 );
111 return T::default();
112 }
113 Err(e) => {
114 tracing::warn!(
115 manager = manager_name,
116 path = %path.display(),
117 error = %e,
118 "state file read failed; starting fresh — file left in place for inspection",
119 );
120 crate::metrics::record_state_file_load_failure(manager_name, "read_error");
121 return T::default();
122 }
123 };
124 match parse(&raw) {
125 Ok(mgr) => mgr,
126 Err(e) => {
127 tracing::warn!(
128 manager = manager_name,
129 path = %path.display(),
130 error = %e,
131 "state file parse failed (corrupted JSON); starting fresh — file left in place for inspection",
132 );
133 crate::metrics::record_state_file_load_failure(manager_name, "parse_error");
134 T::default()
135 }
136 }
137}
138
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write as _;
    use std::path::PathBuf;

    /// Stand-in for a real snapshot-backed manager: it has a
    /// `Default` fresh state and a `from_json` constructor that fails
    /// with `serde_json::Error`, which is all `load_or_fresh` needs.
    #[derive(Debug, Default, PartialEq, Eq)]
    struct ToyManager {
        items: Vec<String>,
    }

    impl ToyManager {
        fn from_json(s: &str) -> Result<Self, serde_json::Error> {
            serde_json::from_str::<Vec<String>>(s).map(|items| Self { items })
        }
    }

    /// Creates a scratch directory and returns it together with the
    /// path `file_name` inside it. Nothing is written to that path.
    /// The directory guard must be kept alive for as long as the path
    /// is in use.
    fn scratch_path(file_name: &str) -> (tempfile::TempDir, PathBuf) {
        let dir = tempfile::tempdir().expect("tempdir");
        let path = dir.path().join(file_name);
        (dir, path)
    }

    #[test]
    fn load_or_fresh_with_valid_json_returns_parsed() {
        let (_dir, path) = scratch_path("snap.json");
        std::fs::write(&path, r#"["a","b","c"]"#).expect("write");

        let expected = ToyManager {
            items: ["a", "b", "c"].iter().map(|s| s.to_string()).collect(),
        };
        let got: ToyManager = load_or_fresh("toy", &path, ToyManager::from_json);
        assert_eq!(
            got, expected,
            "valid snapshot must round-trip into the typed manager",
        );
    }

    #[test]
    fn load_or_fresh_with_corrupted_json_logs_warn_and_returns_default() {
        // A truncated document makes the parser fail; load_or_fresh has
        // to absorb that failure and hand back a default manager.
        let (_dir, path) = scratch_path("snap.json");
        {
            let mut file = std::fs::File::create(&path).expect("create");
            file.write_all(br#"{ "broken json"#).expect("write");
        }
        // Snapshot the bytes first so we can prove the call leaves the
        // operator's file exactly as it found it.
        let pre_bytes = std::fs::read(&path).expect("pre read");

        let got: ToyManager = load_or_fresh("toy", &path, ToyManager::from_json);
        assert_eq!(
            got,
            ToyManager::default(),
            "corrupted snapshot must fall back to T::default(), not propagate Err",
        );

        let post_bytes = std::fs::read(&path).expect("post read");
        assert_eq!(
            pre_bytes, post_bytes,
            "the operator's snapshot bytes MUST be left untouched on parse failure",
        );
    }

    #[test]
    fn load_or_fresh_with_missing_file_returns_default() {
        // Nothing is ever written at this path, so the reader reports
        // Ok(None) and load_or_fresh takes its "info! + default" branch.
        let (_dir, path) = scratch_path("does-not-exist.json");

        let got: ToyManager = load_or_fresh("toy", &path, ToyManager::from_json);
        assert_eq!(
            got,
            ToyManager::default(),
            "missing snapshot must fall back to T::default()",
        );
    }

    #[test]
    fn load_or_fresh_with_empty_file_returns_default() {
        // A whitespace-only file is reported as Ok(None) by the reader,
        // so the parser must never see it (it would fail with "EOF
        // while parsing").
        let (_dir, path) = scratch_path("empty.json");
        std::fs::write(&path, " \n \t\n").expect("write");

        let got: ToyManager = load_or_fresh("toy", &path, ToyManager::from_json);
        assert_eq!(got, ToyManager::default());
    }

    #[test]
    fn read_state_file_or_fresh_normalises_empty_path() {
        // Empty `--flag=` is parsed by clap as a Path of `""`.
        let empty = Path::new("");
        let raw = read_state_file_or_fresh(empty).expect("ok");
        assert!(raw.is_none(), "empty path must surface as Ok(None)");
    }
}