Skip to main content

orbok_workers/
model_verifier.rs

1//! Startup model verification (design decision §3, RFC-021).
2//!
3//! Runs at every startup. Checks that the files required for the
4//! configured embedding model are present and non-empty. Does **not**
5//! run SHA-256 hash verification — that is reserved for the explicit
6//! "Validate" action in the Models view (keeps startup under 5 ms).
7//!
8//! The two required files inside the model directory:
9//! - `onnx/model.onnx`  — the weights (typically 20–140 MB)
10//! - `tokenizer.json`   — the tokenizer config (~2 MB)
11
12use std::path::Path;
13
/// Paths, relative to the configured model directory, of every file the
/// startup check requires: the ONNX weights and the tokenizer config.
pub const REQUIRED_MODEL_FILES: &[&str] = &["onnx/model.onnx", "tokenizer.json"];
16
17/// Outcome of a startup model verification check.
18#[derive(Debug, Clone, PartialEq)]
19pub enum VerifyOutcome {
20    /// Both required files exist and have size > 0. Semantic search
21    /// can be enabled when the inference backend is loaded.
22    Ready,
23
24    /// No model directory has ever been configured.
25    /// Show the setup wizard — state: "not configured".
26    NotConfigured,
27
28    /// The directory was configured but one or more required files are
29    /// absent or empty.
30    FilesInvalid {
31        /// The configured model directory path.
32        model_dir: String,
33        /// Which required files failed the check.
34        issues: Vec<FileIssue>,
35    },
36}
37
38/// A single file that failed the verification check.
39#[derive(Debug, Clone, PartialEq)]
40pub struct FileIssue {
41    /// Relative path within the model directory (e.g. `onnx/model.onnx`).
42    pub relative_path: String,
43    /// Human-readable reason.
44    pub reason: FileIssueKind,
45}
46
/// Reason a required model file failed verification.
///
/// This is a plain fieldless enum, so it is `Copy` and can be compared,
/// hashed and passed around by value without cloning.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileIssueKind {
    /// The file could not be found. The verifier also uses this as the
    /// fallback for any I/O error other than "permission denied".
    NotFound,
    /// The file exists but is zero bytes long.
    Empty,
    /// The file's metadata could not be read because access was denied.
    PermissionDenied,
}

impl FileIssueKind {
    /// Returns a short, static, human-readable description of the reason.
    ///
    /// The text contains no paths or file contents, so it is safe to log.
    #[must_use]
    pub fn as_str(&self) -> &'static str {
        match self {
            FileIssueKind::NotFound => "not found",
            FileIssueKind::Empty => "empty file (0 bytes)",
            FileIssueKind::PermissionDenied => "permission denied",
        }
    }
}

/// Formats the reason using the same text as [`FileIssueKind::as_str`], so
/// the kind can be used directly in `format!`/`to_string()`.
impl std::fmt::Display for FileIssueKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
64
65/// Verify the embedding model directory at startup.
66///
67/// `model_dir` comes from [`OrbokSettings::embedding_model_dir`].
68///
69/// # Timing
70/// Typical execution: < 2 ms (two `stat` calls). No SHA-256 hashing.
71pub fn verify_embedding_model(model_dir: Option<&str>) -> VerifyOutcome {
72    let dir_str = match model_dir {
73        Some(d) if !d.trim().is_empty() => d,
74        _ => return VerifyOutcome::NotConfigured,
75    };
76    let dir = Path::new(dir_str);
77    let mut issues = Vec::new();
78    for rel in REQUIRED_MODEL_FILES {
79        let full = dir.join(rel);
80        match std::fs::metadata(&full) {
81            Ok(meta) if meta.len() == 0 => {
82                issues.push(FileIssue {
83                    relative_path: rel.to_string(),
84                    reason: FileIssueKind::Empty,
85                });
86            }
87            Ok(_) => {} // present and non-empty — OK
88            Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => {
89                issues.push(FileIssue {
90                    relative_path: rel.to_string(),
91                    reason: FileIssueKind::PermissionDenied,
92                });
93            }
94            Err(_) => {
95                issues.push(FileIssue {
96                    relative_path: rel.to_string(),
97                    reason: FileIssueKind::NotFound,
98                });
99            }
100        }
101    }
102    if issues.is_empty() {
103        VerifyOutcome::Ready
104    } else {
105        VerifyOutcome::FilesInvalid {
106            model_dir: dir_str.to_string(),
107            issues,
108        }
109    }
110}
111
112/// Run the verifier and return a brief log-friendly summary string
113/// (never includes file contents — NFR-014).
114pub fn verify_outcome_summary(outcome: &VerifyOutcome) -> String {
115    match outcome {
116        VerifyOutcome::Ready => "embedding model OK".into(),
117        VerifyOutcome::NotConfigured => "embedding model not configured".into(),
118        VerifyOutcome::FilesInvalid { issues, .. } => {
119            let problems: Vec<_> = issues.iter().map(|i| i.reason.as_str()).collect();
120            format!("embedding model invalid: {}", problems.join(", "))
121        }
122    }
123}
124
#[cfg(test)]
mod tests {
    use super::*;

    /// Creates a temporary model directory whose `onnx/model.onnx` holds
    /// `weights`. No `tokenizer.json` is written.
    fn temp_model_dir(weights: &[u8]) -> tempfile::TempDir {
        let root = tempfile::tempdir().unwrap();
        let onnx = root.path().join("onnx");
        std::fs::create_dir_all(&onnx).unwrap();
        std::fs::write(onnx.join("model.onnx"), weights).unwrap();
        root
    }

    /// Writes a minimal, non-empty `tokenizer.json` into `root`.
    fn write_tokenizer(root: &tempfile::TempDir) {
        std::fs::write(root.path().join("tokenizer.json"), b"{}").unwrap();
    }

    /// Runs the verifier against the temporary directory.
    fn verify(root: &tempfile::TempDir) -> VerifyOutcome {
        verify_embedding_model(Some(&root.path().to_string_lossy()))
    }

    /// Unwraps the issue list of a `FilesInvalid` outcome, panicking on any
    /// other variant.
    fn expect_issues(outcome: VerifyOutcome) -> Vec<FileIssue> {
        match outcome {
            VerifyOutcome::FilesInvalid { issues, .. } => issues,
            other => panic!("expected FilesInvalid, got {other:?}"),
        }
    }

    #[test]
    fn verify_none_is_not_configured() {
        assert_eq!(verify_embedding_model(None), VerifyOutcome::NotConfigured);
    }

    #[test]
    fn verify_empty_string_is_not_configured() {
        for blank in &["", "  "] {
            assert_eq!(
                verify_embedding_model(Some(*blank)),
                VerifyOutcome::NotConfigured
            );
        }
    }

    #[test]
    fn verify_nonexistent_dir_reports_both_files_missing() {
        let issues = expect_issues(verify_embedding_model(Some("/nonexistent/orbok-models")));
        assert_eq!(issues.len(), 2);
        assert!(issues.iter().all(|i| i.reason == FileIssueKind::NotFound));
    }

    #[test]
    fn verify_dir_with_valid_files_returns_ready() {
        let root = temp_model_dir(&[0u8; 1024]);
        write_tokenizer(&root);
        assert_eq!(verify(&root), VerifyOutcome::Ready);
    }

    #[test]
    fn verify_empty_model_file_reports_invalid() {
        // Zero-byte weights file next to a valid tokenizer.
        let root = temp_model_dir(b"");
        write_tokenizer(&root);
        let issues = expect_issues(verify(&root));
        assert_eq!(issues.len(), 1);
        assert_eq!(issues[0].relative_path, "onnx/model.onnx");
        assert_eq!(issues[0].reason, FileIssueKind::Empty);
    }

    #[test]
    fn verify_missing_tokenizer_reports_invalid() {
        // Weights are present; tokenizer.json is deliberately absent.
        let root = temp_model_dir(&[1u8; 512]);
        let issues = expect_issues(verify(&root));
        assert_eq!(issues.len(), 1);
        assert_eq!(issues[0].relative_path, "tokenizer.json");
    }

    #[test]
    fn summary_strings_are_log_safe() {
        // The summary must carry status text only, never the model directory.
        let outcome = VerifyOutcome::FilesInvalid {
            model_dir: "/secret/path".into(),
            issues: vec![FileIssue {
                relative_path: "onnx/model.onnx".into(),
                reason: FileIssueKind::NotFound,
            }],
        };
        let summary = verify_outcome_summary(&outcome);
        assert!(
            !summary.contains("/secret/path"),
            "summary must not include the model dir path"
        );
    }
}
211}