// orbok_workers/model_verifier.rs

//! Startup model verification (design decision §3, RFC-021).
//!
//! Runs at every startup. Checks that the files required for the
//! configured embedding model are present and non-empty. Does **not**
//! run SHA-256 hash verification — that is reserved for the explicit
//! "Validate" action in the Models view (keeps startup under 5 ms).
//!
//! The two required files inside the model directory:
//! - `onnx/model.onnx`  — the weights (typically 20–140 MB)
//! - `tokenizer.json`   — the tokenizer config (~2 MB)
11
12use std::path::Path;
13
/// Relative paths (inside the configured model directory) of every file
/// the embedding model needs. The startup check walks this list in order.
pub const REQUIRED_MODEL_FILES: &[&str] = &[
    "onnx/model.onnx",
    "tokenizer.json",
];
16
17/// Outcome of a startup model verification check.
18#[derive(Debug, Clone, PartialEq)]
19pub enum VerifyOutcome {
20    /// Both required files exist and have size > 0. Semantic search
21    /// can be enabled when the inference backend is loaded.
22    Ready,
23
24    /// No model directory has ever been configured.
25    /// Show the setup wizard — state: "not configured".
26    NotConfigured,
27
28    /// The directory was configured but one or more required files are
29    /// absent or empty.
30    FilesInvalid {
31        /// The configured model directory path.
32        model_dir: String,
33        /// Which required files failed the check.
34        issues: Vec<FileIssue>,
35    },
36}
37
38/// A single file that failed the verification check.
39#[derive(Debug, Clone, PartialEq)]
40pub struct FileIssue {
41    /// Relative path within the model directory (e.g. `onnx/model.onnx`).
42    pub relative_path: String,
43    /// Human-readable reason.
44    pub reason: FileIssueKind,
45}
46
/// Why a required model file was rejected by the verification check.
#[derive(Debug, Clone, PartialEq)]
pub enum FileIssueKind {
    /// The file is absent. The startup check also reports this for any
    /// metadata error other than a permission failure.
    NotFound,
    /// The file exists but is zero bytes long.
    Empty,
    /// The file's metadata could not be read because access was denied.
    PermissionDenied,
}

impl FileIssueKind {
    /// Short, static, human-readable label for this issue kind.
    ///
    /// The label contains only the status text — never a path.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::NotFound => "not found",
            Self::Empty => "empty file (0 bytes)",
            Self::PermissionDenied => "permission denied",
        }
    }
}
64
65/// Verify the embedding model directory at startup.
66///
67/// `model_dir` comes from [`OrbokSettings::embedding_model_dir`].
68///
69/// # Timing
70/// Typical execution: < 2 ms (two `stat` calls). No SHA-256 hashing.
71pub fn verify_embedding_model(model_dir: Option<&str>) -> VerifyOutcome {
72    let dir_str = match model_dir {
73        Some(d) if !d.trim().is_empty() => d,
74        _ => return VerifyOutcome::NotConfigured,
75    };
76    let dir = Path::new(dir_str);
77    let mut issues = Vec::new();
78    for rel in REQUIRED_MODEL_FILES {
79        let full = dir.join(rel);
80        match std::fs::metadata(&full) {
81            Ok(meta) if meta.len() == 0 => {
82                issues.push(FileIssue {
83                    relative_path: rel.to_string(),
84                    reason: FileIssueKind::Empty,
85                });
86            }
87            Ok(_) => {} // present and non-empty — OK
88            Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => {
89                issues.push(FileIssue {
90                    relative_path: rel.to_string(),
91                    reason: FileIssueKind::PermissionDenied,
92                });
93            }
94            Err(_) => {
95                issues.push(FileIssue {
96                    relative_path: rel.to_string(),
97                    reason: FileIssueKind::NotFound,
98                });
99            }
100        }
101    }
102    if issues.is_empty() {
103        VerifyOutcome::Ready
104    } else {
105        VerifyOutcome::FilesInvalid {
106            model_dir: dir_str.to_string(),
107            issues,
108        }
109    }
110}
111
112/// Run the verifier and return a brief log-friendly summary string
113/// (never includes file contents — NFR-014).
114pub fn verify_outcome_summary(outcome: &VerifyOutcome) -> String {
115    match outcome {
116        VerifyOutcome::Ready => "embedding model OK".into(),
117        VerifyOutcome::NotConfigured => "embedding model not configured".into(),
118        VerifyOutcome::FilesInvalid { issues, .. } => {
119            let problems: Vec<_> = issues.iter().map(|i| i.reason.as_str()).collect();
120            format!("embedding model invalid: {}", problems.join(", "))
121        }
122    }
123}
124
#[cfg(test)]
mod tests {
    use super::*;

    /// Write `contents` to `rel` inside `dir`, creating parent directories.
    fn write_model_file(dir: &std::path::Path, rel: &str, contents: &[u8]) {
        let full = dir.join(rel);
        if let Some(parent) = full.parent() {
            std::fs::create_dir_all(parent).unwrap();
        }
        std::fs::write(full, contents).unwrap();
    }

    /// Run the startup check against a temporary directory.
    fn verify_dir(dir: &tempfile::TempDir) -> VerifyOutcome {
        verify_embedding_model(Some(&dir.path().to_string_lossy()))
    }

    /// Unwrap the issue list of a `FilesInvalid` outcome, panicking on
    /// any other variant.
    fn expect_invalid(outcome: VerifyOutcome) -> Vec<FileIssue> {
        match outcome {
            VerifyOutcome::FilesInvalid { issues, .. } => issues,
            other => panic!("expected FilesInvalid, got {other:?}"),
        }
    }

    #[test]
    fn verify_none_is_not_configured() {
        assert_eq!(verify_embedding_model(None), VerifyOutcome::NotConfigured);
    }

    #[test]
    fn verify_empty_string_is_not_configured() {
        assert_eq!(verify_embedding_model(Some("")), VerifyOutcome::NotConfigured);
        assert_eq!(verify_embedding_model(Some("  ")), VerifyOutcome::NotConfigured);
    }

    #[test]
    fn verify_nonexistent_dir_reports_both_files_missing() {
        let issues = expect_invalid(verify_embedding_model(Some("/nonexistent/orbok-models")));
        // Every required file must be reported, however many there are.
        assert_eq!(issues.len(), REQUIRED_MODEL_FILES.len());
        assert!(issues.iter().all(|i| i.reason == FileIssueKind::NotFound));
    }

    #[test]
    fn verify_dir_with_valid_files_returns_ready() {
        let dir = tempfile::tempdir().unwrap();
        write_model_file(dir.path(), "onnx/model.onnx", &[0u8; 1024]);
        write_model_file(dir.path(), "tokenizer.json", b"{}");
        assert_eq!(verify_dir(&dir), VerifyOutcome::Ready);
    }

    #[test]
    fn verify_empty_model_file_reports_invalid() {
        let dir = tempfile::tempdir().unwrap();
        write_model_file(dir.path(), "onnx/model.onnx", b""); // empty!
        write_model_file(dir.path(), "tokenizer.json", b"{}");
        let issues = expect_invalid(verify_dir(&dir));
        assert_eq!(issues.len(), 1);
        assert_eq!(issues[0].relative_path, "onnx/model.onnx");
        assert_eq!(issues[0].reason, FileIssueKind::Empty);
    }

    #[test]
    fn verify_missing_tokenizer_reports_invalid() {
        let dir = tempfile::tempdir().unwrap();
        write_model_file(dir.path(), "onnx/model.onnx", &[1u8; 512]);
        // tokenizer.json deliberately absent
        let issues = expect_invalid(verify_dir(&dir));
        assert_eq!(issues.len(), 1);
        assert_eq!(issues[0].relative_path, "tokenizer.json");
    }

    #[test]
    fn summary_strings_are_log_safe() {
        // Summary strings must carry status text only: neither the model
        // directory nor the relative file path may appear.
        let summary = verify_outcome_summary(&VerifyOutcome::FilesInvalid {
            model_dir: "/secret/path".into(),
            issues: vec![FileIssue {
                relative_path: "onnx/model.onnx".into(),
                reason: FileIssueKind::NotFound,
            }],
        });
        assert!(
            !summary.contains("/secret/path"),
            "summary must not include the model dir path"
        );
        assert!(
            !summary.contains("onnx/model.onnx"),
            "summary must not include the relative file path"
        );
    }
}
212
// ── Deep verification (RFC-029: checksum integrity check) ────────────
214
215/// A manifest file written alongside downloaded model files.
216/// Contains the SHA-256 hash of each file at download time so the
217/// explicit "Validate" action can detect corruption or tampering.
218///
219/// File: `{model_dir}/orbok-manifest.json`
220#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
221pub struct ModelManifest {
222    /// Map of relative file path → lowercase hex SHA-256 digest.
223    pub sha256: std::collections::HashMap<String, String>,
224}
225
226impl ModelManifest {
227    /// Load a manifest from the model directory. Returns `None` if the
228    /// manifest file does not exist (e.g. model was placed manually).
229    pub fn load(model_dir: &Path) -> Option<Self> {
230        let path = model_dir.join("orbok-manifest.json");
231        let bytes = std::fs::read(&path).ok()?;
232        serde_json::from_slice(&bytes).ok()
233    }
234
235    /// Persist the manifest to the model directory.
236    pub fn save(&self, model_dir: &Path) -> std::io::Result<()> {
237        let path = model_dir.join("orbok-manifest.json");
238        let json = serde_json::to_vec_pretty(self)
239            .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
240        std::fs::write(&path, &json)
241    }
242
243    /// Compute the SHA-256 hex digest of a file.
244    pub fn sha256_of_file(path: &Path) -> std::io::Result<String> {
245        use sha2::{Digest, Sha256};
246        let bytes = std::fs::read(path)?;
247        let digest: String = Sha256::digest(&bytes)
248            .iter()
249            .map(|b| format!("{b:02x}"))
250            .collect();
251        Ok(digest)
252    }
253}
254
/// Outcome of the deep (checksum-based) model verification.
#[derive(Debug, Clone, PartialEq)]
pub enum DeepVerifyOutcome {
    /// Every file listed in the manifest was hashed and its checksum
    /// matched the stored value.
    Valid,
    /// The manifest could not be loaded (for instance, the model was placed
    /// manually), so checksums cannot be verified.
    NoManifest,
    /// Manifest paths of the files whose checksum did not match.
    ChecksumMismatch(Vec<String>),
    /// Manifest paths of the files that could not be hashed — the manifest
    /// lists them, but the file is gone.
    FileMissing(Vec<String>),
}
267
268/// Run a deep integrity check using the stored manifest (RFC-029).
269/// Called only from the explicit "Validate" button in the Models view —
270/// never at startup (would add 50–500 ms for large models).
271pub fn verify_embedding_model_deep(model_dir: &Path) -> DeepVerifyOutcome {
272    let manifest = match ModelManifest::load(model_dir) {
273        Some(m) => m,
274        None => return DeepVerifyOutcome::NoManifest,
275    };
276
277    let mut missing = Vec::new();
278    let mut mismatched = Vec::new();
279
280    for (rel, expected) in &manifest.sha256 {
281        let full = model_dir.join(rel);
282        match ModelManifest::sha256_of_file(&full) {
283            Ok(actual) if actual == *expected => {} // OK
284            Ok(_) => mismatched.push(rel.clone()),
285            Err(_) => missing.push(rel.clone()),
286        }
287    }
288
289    if !missing.is_empty() {
290        DeepVerifyOutcome::FileMissing(missing)
291    } else if !mismatched.is_empty() {
292        DeepVerifyOutcome::ChecksumMismatch(mismatched)
293    } else {
294        DeepVerifyOutcome::Valid
295    }
296}
297
#[cfg(test)]
mod deep_verify_tests {
    use super::*;

    /// Write `contents` to `name` inside `dir`, then save a manifest that
    /// records the SHA-256 of the file as it was just written.
    fn write_file_with_manifest(dir: &std::path::Path, name: &str, contents: &[u8]) {
        let target = dir.join(name);
        std::fs::write(&target, contents).unwrap();
        let digest = ModelManifest::sha256_of_file(&target).unwrap();
        let sha256: std::collections::HashMap<String, String> =
            std::iter::once((name.to_string(), digest)).collect();
        ModelManifest { sha256 }.save(dir).unwrap();
    }

    #[test]
    fn manifest_round_trips() {
        let tmp = tempfile::tempdir().unwrap();
        let sha256: std::collections::HashMap<String, String> =
            std::iter::once(("onnx/model.onnx".to_string(), "abc123".to_string())).collect();
        ModelManifest { sha256 }.save(tmp.path()).unwrap();

        let reloaded = ModelManifest::load(tmp.path()).unwrap();
        assert_eq!(reloaded.sha256["onnx/model.onnx"], "abc123");
    }

    #[test]
    fn no_manifest_returns_no_manifest() {
        let tmp = tempfile::tempdir().unwrap();
        let outcome = verify_embedding_model_deep(tmp.path());
        assert_eq!(outcome, DeepVerifyOutcome::NoManifest);
    }

    #[test]
    fn valid_checksums_return_valid() {
        let tmp = tempfile::tempdir().unwrap();
        write_file_with_manifest(tmp.path(), "model.bin", b"model weights");

        let outcome = verify_embedding_model_deep(tmp.path());
        assert_eq!(outcome, DeepVerifyOutcome::Valid);
    }

    #[test]
    fn corrupted_file_returns_mismatch() {
        let tmp = tempfile::tempdir().unwrap();
        write_file_with_manifest(tmp.path(), "model.bin", b"original");

        // Overwrite the file after the manifest was recorded.
        std::fs::write(tmp.path().join("model.bin"), b"corrupted!").unwrap();

        let outcome = verify_embedding_model_deep(tmp.path());
        assert_eq!(
            outcome,
            DeepVerifyOutcome::ChecksumMismatch(vec!["model.bin".into()])
        );
    }
}
352}