Skip to main content

orbok_workers/
model_verifier.rs

1//! Startup model verification (design decision §3, RFC-021).
2//!
3//! Runs at every startup. Checks that the files required for the
4//! configured embedding model are present and non-empty. Does **not**
5//! run SHA-256 hash verification — that is reserved for the explicit
6//! "Validate" action in the Models view (keeps startup under 5 ms).
7//!
8//! The two required files inside the model directory:
9//! - `onnx/model.onnx`  — the weights (typically 20–140 MB)
10//! - `tokenizer.json`   — the tokenizer config (~2 MB)
11
12use std::path::Path;
13
/// Paths, relative to the model directory, of every file the startup
/// check requires: the ONNX weights and the tokenizer configuration.
pub const REQUIRED_MODEL_FILES: &[&str] = &[
    "onnx/model.onnx",
    "tokenizer.json",
];
16
17/// Outcome of a startup model verification check.
18#[derive(Debug, Clone, PartialEq)]
19pub enum VerifyOutcome {
20    /// Both required files exist and have size > 0. Semantic search
21    /// can be enabled when the inference backend is loaded.
22    Ready,
23
24    /// No model directory has ever been configured.
25    /// Show the setup wizard — state: "not configured".
26    NotConfigured,
27
28    /// The directory was configured but one or more required files are
29    /// absent or empty.
30    FilesInvalid {
31        /// The configured model directory path.
32        model_dir: String,
33        /// Which required files failed the check.
34        issues: Vec<FileIssue>,
35    },
36}
37
38/// A single file that failed the verification check.
39#[derive(Debug, Clone, PartialEq)]
40pub struct FileIssue {
41    /// Relative path within the model directory (e.g. `onnx/model.onnx`).
42    pub relative_path: String,
43    /// Human-readable reason.
44    pub reason: FileIssueKind,
45}
46
/// Why a required model file was rejected by the startup check.
#[derive(Clone, Debug, PartialEq)]
pub enum FileIssueKind {
    /// The file could not be found. The startup check also uses this as
    /// the fallback for any metadata error other than a permission
    /// failure.
    NotFound,
    /// The file exists but is zero bytes long.
    Empty,
    /// The file's metadata could not be read because access was denied.
    PermissionDenied,
}
54
55impl FileIssueKind {
56    pub fn as_str(&self) -> &'static str {
57        match self {
58            FileIssueKind::NotFound => "not found",
59            FileIssueKind::Empty => "empty file (0 bytes)",
60            FileIssueKind::PermissionDenied => "permission denied",
61        }
62    }
63}
64
65/// Verify the embedding model directory at startup.
66///
67/// `model_dir` comes from [`OrbokSettings::embedding_model_dir`].
68///
69/// # Timing
70/// Typical execution: < 2 ms (two `stat` calls). No SHA-256 hashing.
71pub fn verify_embedding_model(model_dir: Option<&str>) -> VerifyOutcome {
72    let dir_str = match model_dir {
73        Some(d) if !d.trim().is_empty() => d,
74        _ => return VerifyOutcome::NotConfigured,
75    };
76    let dir = Path::new(dir_str);
77    let mut issues = Vec::new();
78    for rel in REQUIRED_MODEL_FILES {
79        let full = dir.join(rel);
80        match std::fs::metadata(&full) {
81            Ok(meta) if meta.len() == 0 => {
82                issues.push(FileIssue {
83                    relative_path: rel.to_string(),
84                    reason: FileIssueKind::Empty,
85                });
86            }
87            Ok(_) => {} // present and non-empty — OK
88            Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => {
89                issues.push(FileIssue {
90                    relative_path: rel.to_string(),
91                    reason: FileIssueKind::PermissionDenied,
92                });
93            }
94            Err(_) => {
95                issues.push(FileIssue {
96                    relative_path: rel.to_string(),
97                    reason: FileIssueKind::NotFound,
98                });
99            }
100        }
101    }
102    if issues.is_empty() {
103        VerifyOutcome::Ready
104    } else {
105        VerifyOutcome::FilesInvalid {
106            model_dir: dir_str.to_string(),
107            issues,
108        }
109    }
110}
111
112/// Run the verifier and return a brief log-friendly summary string
113/// (never includes file contents — NFR-014).
114pub fn verify_outcome_summary(outcome: &VerifyOutcome) -> String {
115    match outcome {
116        VerifyOutcome::Ready => "embedding model OK".into(),
117        VerifyOutcome::NotConfigured => "embedding model not configured".into(),
118        VerifyOutcome::FilesInvalid { issues, .. } => {
119            let problems: Vec<_> = issues.iter().map(|i| i.reason.as_str()).collect();
120            format!("embedding model invalid: {}", problems.join(", "))
121        }
122    }
123}
124
#[cfg(test)]
mod tests {
    use super::*;

    /// Create a temporary model directory containing an `onnx/`
    /// sub-directory, then write whichever of the two required files was
    /// given content. Passing `None` leaves that file absent.
    fn model_dir_with(model: Option<&[u8]>, tokenizer: Option<&[u8]>) -> tempfile::TempDir {
        let dir = tempfile::tempdir().unwrap();
        let onnx_dir = dir.path().join("onnx");
        std::fs::create_dir_all(&onnx_dir).unwrap();
        if let Some(bytes) = model {
            std::fs::write(onnx_dir.join("model.onnx"), bytes).unwrap();
        }
        if let Some(bytes) = tokenizer {
            std::fs::write(dir.path().join("tokenizer.json"), bytes).unwrap();
        }
        dir
    }

    /// Run the startup check against a temporary directory.
    fn verify_dir(dir: &tempfile::TempDir) -> VerifyOutcome {
        verify_embedding_model(Some(&dir.path().to_string_lossy()))
    }

    #[test]
    fn verify_none_is_not_configured() {
        assert_eq!(verify_embedding_model(None), VerifyOutcome::NotConfigured);
    }

    #[test]
    fn verify_empty_string_is_not_configured() {
        assert_eq!(
            verify_embedding_model(Some("")),
            VerifyOutcome::NotConfigured
        );
        assert_eq!(
            verify_embedding_model(Some("  ")),
            VerifyOutcome::NotConfigured
        );
    }

    #[test]
    fn verify_nonexistent_dir_reports_both_files_missing() {
        let outcome = verify_embedding_model(Some("/nonexistent/orbok-models"));
        match outcome {
            VerifyOutcome::FilesInvalid { issues, .. } => {
                assert_eq!(issues.len(), 2);
                assert!(issues.iter().all(|i| i.reason == FileIssueKind::NotFound));
            }
            other => panic!("expected FilesInvalid, got {other:?}"),
        }
    }

    #[test]
    fn verify_dir_with_valid_files_returns_ready() {
        let dir = model_dir_with(Some(&[0u8; 1024][..]), Some(&b"{}"[..]));
        assert_eq!(verify_dir(&dir), VerifyOutcome::Ready);
    }

    #[test]
    fn verify_empty_model_file_reports_invalid() {
        // The weights file exists but is zero bytes long.
        let dir = model_dir_with(Some(&b""[..]), Some(&b"{}"[..]));
        match verify_dir(&dir) {
            VerifyOutcome::FilesInvalid { issues, .. } => {
                assert_eq!(issues.len(), 1);
                assert_eq!(issues[0].relative_path, "onnx/model.onnx");
                assert_eq!(issues[0].reason, FileIssueKind::Empty);
            }
            other => panic!("expected FilesInvalid, got {other:?}"),
        }
    }

    #[test]
    fn verify_missing_tokenizer_reports_invalid() {
        // tokenizer.json deliberately absent
        let dir = model_dir_with(Some(&[1u8; 512][..]), None);
        match verify_dir(&dir) {
            VerifyOutcome::FilesInvalid { issues, .. } => {
                assert_eq!(issues.len(), 1);
                assert_eq!(issues[0].relative_path, "tokenizer.json");
                assert_eq!(issues[0].reason, FileIssueKind::NotFound);
            }
            other => panic!("expected FilesInvalid, got {other:?}"),
        }
    }

    #[test]
    fn summary_strings_are_log_safe() {
        // Verify summary strings never include file paths (only status).
        let summary = verify_outcome_summary(&VerifyOutcome::FilesInvalid {
            model_dir: "/secret/path".into(),
            issues: vec![FileIssue {
                relative_path: "onnx/model.onnx".into(),
                reason: FileIssueKind::NotFound,
            }],
        });
        assert!(
            !summary.contains("/secret/path"),
            "summary must not include the model dir path"
        );
        assert!(
            !summary.contains("onnx/model.onnx"),
            "summary must not include the relative file path"
        );
        // Pin the exact text so accidental leaks show up as a diff.
        assert_eq!(summary, "embedding model invalid: not found");
    }
}
220
221// ── Deep verification (RFC-029: checksum integrity check) ────────────
222
223/// A manifest file written alongside downloaded model files.
224/// Contains the SHA-256 hash of each file at download time so the
225/// explicit "Validate" action can detect corruption or tampering.
226///
227/// File: `{model_dir}/orbok-manifest.json`
228#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
229pub struct ModelManifest {
230    /// Map of relative file path → lowercase hex SHA-256 digest.
231    pub sha256: std::collections::HashMap<String, String>,
232}
233
234impl ModelManifest {
235    /// Load a manifest from the model directory. Returns `None` if the
236    /// manifest file does not exist (e.g. model was placed manually).
237    pub fn load(model_dir: &Path) -> Option<Self> {
238        let path = model_dir.join("orbok-manifest.json");
239        let bytes = std::fs::read(&path).ok()?;
240        serde_json::from_slice(&bytes).ok()
241    }
242
243    /// Persist the manifest to the model directory.
244    pub fn save(&self, model_dir: &Path) -> std::io::Result<()> {
245        let path = model_dir.join("orbok-manifest.json");
246        let json = serde_json::to_vec_pretty(self)
247            .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
248        std::fs::write(&path, &json)
249    }
250
251    /// Compute the SHA-256 hex digest of a file.
252    pub fn sha256_of_file(path: &Path) -> std::io::Result<String> {
253        use sha2::{Digest, Sha256};
254        let bytes = std::fs::read(path)?;
255        let digest: String = Sha256::digest(&bytes)
256            .iter()
257            .map(|b| format!("{b:02x}"))
258            .collect();
259        Ok(digest)
260    }
261}
262
/// Outcome of the deep (checksum) verification performed by
/// `verify_embedding_model_deep`.
#[derive(Clone, Debug, PartialEq)]
pub enum DeepVerifyOutcome {
    /// Every file listed in the manifest could be hashed and its
    /// checksum matches the stored value.
    Valid,
    /// No manifest could be loaded — e.g. the model was placed manually —
    /// so checksums cannot be verified.
    NoManifest,
    /// Relative paths of the files whose checksum differs from the
    /// manifest. Only reported when no file is missing.
    ChecksumMismatch(Vec<String>),
    /// Relative paths of the files listed in the manifest that could not
    /// be hashed (the file is gone or unreadable). Takes precedence over
    /// `ChecksumMismatch` when both kinds of problem are present.
    FileMissing(Vec<String>),
}
275
276/// Run a deep integrity check using the stored manifest (RFC-029).
277/// Called only from the explicit "Validate" button in the Models view —
278/// never at startup (would add 50–500 ms for large models).
279pub fn verify_embedding_model_deep(model_dir: &Path) -> DeepVerifyOutcome {
280    let manifest = match ModelManifest::load(model_dir) {
281        Some(m) => m,
282        None => return DeepVerifyOutcome::NoManifest,
283    };
284
285    let mut missing = Vec::new();
286    let mut mismatched = Vec::new();
287
288    for (rel, expected) in &manifest.sha256 {
289        let full = model_dir.join(rel);
290        match ModelManifest::sha256_of_file(&full) {
291            Ok(actual) if actual == *expected => {} // OK
292            Ok(_) => mismatched.push(rel.clone()),
293            Err(_) => missing.push(rel.clone()),
294        }
295    }
296
297    if !missing.is_empty() {
298        DeepVerifyOutcome::FileMissing(missing)
299    } else if !mismatched.is_empty() {
300        DeepVerifyOutcome::ChecksumMismatch(mismatched)
301    } else {
302        DeepVerifyOutcome::Valid
303    }
304}
305
#[cfg(test)]
mod deep_verify_tests {
    use super::*;

    /// Write a manifest holding the given `(relative path, digest)`
    /// entries into `dir`.
    fn save_manifest(dir: &std::path::Path, entries: &[(&str, &str)]) {
        let sha256 = entries
            .iter()
            .map(|(rel, digest)| ((*rel).to_string(), (*digest).to_string()))
            .collect();
        ModelManifest { sha256 }.save(dir).unwrap();
    }

    #[test]
    fn manifest_round_trips() {
        let dir = tempfile::tempdir().unwrap();
        save_manifest(dir.path(), &[("onnx/model.onnx", "abc123")]);
        let loaded = ModelManifest::load(dir.path()).unwrap();
        assert_eq!(loaded.sha256["onnx/model.onnx"], "abc123");
    }

    #[test]
    fn no_manifest_returns_no_manifest() {
        let dir = tempfile::tempdir().unwrap();
        assert_eq!(
            verify_embedding_model_deep(dir.path()),
            DeepVerifyOutcome::NoManifest
        );
    }

    #[test]
    fn valid_checksums_return_valid() {
        let dir = tempfile::tempdir().unwrap();
        let file = dir.path().join("model.bin");
        std::fs::write(&file, b"model weights").unwrap();
        let hash = ModelManifest::sha256_of_file(&file).unwrap();
        save_manifest(dir.path(), &[("model.bin", hash.as_str())]);
        assert_eq!(
            verify_embedding_model_deep(dir.path()),
            DeepVerifyOutcome::Valid
        );
    }

    #[test]
    fn corrupted_file_returns_mismatch() {
        let dir = tempfile::tempdir().unwrap();
        let file = dir.path().join("model.bin");
        std::fs::write(&file, b"original").unwrap();
        let hash = ModelManifest::sha256_of_file(&file).unwrap();
        save_manifest(dir.path(), &[("model.bin", hash.as_str())]);
        // Corrupt the file
        std::fs::write(&file, b"corrupted!").unwrap();
        assert_eq!(
            verify_embedding_model_deep(dir.path()),
            DeepVerifyOutcome::ChecksumMismatch(vec!["model.bin".into()])
        );
    }

    #[test]
    fn missing_file_returns_file_missing() {
        // The manifest lists a file that was never written to disk.
        let dir = tempfile::tempdir().unwrap();
        save_manifest(dir.path(), &[("model.bin", "abc123")]);
        assert_eq!(
            verify_embedding_model_deep(dir.path()),
            DeepVerifyOutcome::FileMissing(vec!["model.bin".into()])
        );
    }
}
360}