Skip to main content

vaultdb_core/
schema.rs

1//! Schema inference and validation. `infer_schema` walks records to discover
2//! field types and cardinalities; `validate_record` checks a record against a
3//! schema; `schema_to_yaml` renders a schema to YAML for persistence.
4
5use std::collections::BTreeMap;
6use std::path::Path;
7
8use serde::{Deserialize, Serialize};
9
10use crate::error::{Result, VaultdbError};
11use crate::record::Value;
12
13/// Top-level schema file structure.
14#[derive(Debug, Serialize, Deserialize)]
15pub struct VaultSchema {
16    pub collections: BTreeMap<String, CollectionSchema>,
17}
18
19/// Schema for a single collection (a folder + optional filter).
20#[derive(Debug, Serialize, Deserialize)]
21pub struct CollectionSchema {
22    #[serde(default, skip_serializing_if = "Option::is_none")]
23    pub description: Option<String>,
24    pub folder: String,
25    #[serde(default, skip_serializing_if = "Vec::is_empty")]
26    pub filter: Vec<String>,
27    #[serde(default, skip_serializing_if = "Vec::is_empty")]
28    pub required: Vec<String>,
29    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
30    pub fields: BTreeMap<String, FieldSchema>,
31}
32
33/// Schema for a single field.
34#[derive(Debug, Serialize, Deserialize)]
35pub struct FieldSchema {
36    #[serde(rename = "type")]
37    pub field_type: String,
38    #[serde(rename = "enum")]
39    #[serde(default, skip_serializing_if = "Vec::is_empty")]
40    pub enum_values: Vec<Value>,
41    #[serde(default, skip_serializing_if = "Option::is_none")]
42    pub min: Option<f64>,
43    #[serde(default, skip_serializing_if = "Option::is_none")]
44    pub max: Option<f64>,
45    #[serde(default, skip_serializing_if = "Option::is_none")]
46    pub required: Option<bool>,
47}
48
49/// Load schema from a file.
50///
51/// Errors are mapped to `VaultdbError::SchemaError` with a human-readable
52/// reason — the underlying YAML parser is an implementation detail and is
53/// deliberately not exposed in the public error type, so consumers don't
54/// transitively depend on whichever YAML crate vaultdb chooses today.
55pub fn load_schema(path: &Path) -> Result<VaultSchema> {
56    let content = std::fs::read_to_string(path).map_err(|_| {
57        VaultdbError::SchemaError(format!("cannot read schema file: {}", path.display()))
58    })?;
59    serde_yaml::from_str(&content)
60        .map_err(|e| VaultdbError::SchemaError(format!("parsing {}: {}", path.display(), e)))
61}
62
63/// Serialize a schema to YAML string.
64pub fn schema_to_yaml(schema: &VaultSchema) -> Result<String> {
65    serde_yaml::to_string(schema)
66        .map_err(|e| VaultdbError::SchemaError(format!("rendering schema as YAML: {}", e)))
67}
68
/// A single validation violation.
///
/// Derives `Clone`, `PartialEq` and `Eq` so callers can copy, compare and
/// deduplicate violations, for example in test assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Violation {
    /// File name the violation was reported for.
    pub file: String,
    /// Name of the offending field.
    pub field: String,
    /// Human-readable description of what is wrong.
    pub message: String,
}

/// Renders as `<file>: <field> — <message>`.
impl std::fmt::Display for Violation {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}: {} — {}", self.file, self.field, self.message)
    }
}
82
83/// Validate a record's fields against a collection schema.
84pub fn validate_record(
85    filename: &str,
86    fields: &BTreeMap<String, Value>,
87    schema: &CollectionSchema,
88) -> Vec<Violation> {
89    let mut violations = Vec::new();
90
91    // Check required fields
92    for req in &schema.required {
93        match fields.get(req) {
94            None | Some(Value::Null) => {
95                violations.push(Violation {
96                    file: filename.to_string(),
97                    field: req.clone(),
98                    message: "required field is missing or null".into(),
99                });
100            }
101            _ => {}
102        }
103    }
104
105    // Check field constraints
106    for (field_name, field_schema) in &schema.fields {
107        let value = match fields.get(field_name) {
108            Some(v) if !matches!(v, Value::Null) => v,
109            _ => continue, // skip absent/null fields (required check handles those)
110        };
111
112        // Type check
113        let actual_type = value.type_name();
114        let expected_type = &field_schema.field_type;
115        if !type_matches(actual_type, expected_type) {
116            violations.push(Violation {
117                file: filename.to_string(),
118                field: field_name.clone(),
119                message: format!("expected type '{}', got '{}'", expected_type, actual_type),
120            });
121        }
122
123        // Enum check
124        if !field_schema.enum_values.is_empty() {
125            let display = value.display_value();
126            let matches_enum = field_schema.enum_values.iter().any(|e| match e {
127                Value::String(s) => s == &display,
128                Value::Integer(i) => i.to_string() == display,
129                Value::Float(f) => f.to_string() == display,
130                Value::Bool(b) => b.to_string() == display,
131                _ => false,
132            });
133            if !matches_enum {
134                violations.push(Violation {
135                    file: filename.to_string(),
136                    field: field_name.clone(),
137                    message: format!(
138                        "value '{}' not in allowed values: {:?}",
139                        display,
140                        field_schema
141                            .enum_values
142                            .iter()
143                            .map(value_display)
144                            .collect::<Vec<_>>()
145                    ),
146                });
147            }
148        }
149
150        // Min/max check for numeric fields
151        if let Some(min) = field_schema.min
152            && let Some(num) = value.as_float()
153            && num < min
154        {
155            violations.push(Violation {
156                file: filename.to_string(),
157                field: field_name.clone(),
158                message: format!("value {} is below minimum {}", num, min),
159            });
160        }
161        if let Some(max) = field_schema.max
162            && let Some(num) = value.as_float()
163            && num > max
164        {
165            violations.push(Violation {
166                file: filename.to_string(),
167                field: field_name.clone(),
168                message: format!("value {} exceeds maximum {}", num, max),
169            });
170        }
171    }
172
173    violations
174}
175
176fn value_display(v: &Value) -> String {
177    match v {
178        Value::String(s) => s.clone(),
179        Value::Integer(i) => i.to_string(),
180        Value::Float(f) => f.to_string(),
181        Value::Bool(b) => b.to_string(),
182        Value::Null => "null".to_string(),
183        other => format!("{:?}", other),
184    }
185}
186
/// Report whether a value whose type name is `actual` satisfies the declared
/// type `expected`.
///
/// `string`, `integer`, `bool`, `list` and `map` require an exact name match.
/// `float` and `number` accept both integers and floats. Any other declared
/// type is not enforced and always matches.
fn type_matches(actual: &str, expected: &str) -> bool {
    match expected {
        "string" | "integer" | "bool" | "list" | "map" => actual == expected,
        "float" | "number" => matches!(actual, "integer" | "float"),
        // Unknown declared type: nothing to enforce.
        _ => true,
    }
}
199
200/// Infer a schema from a set of records.
201pub fn infer_schema(folder_name: &str, records: &[crate::record::Record]) -> CollectionSchema {
202    let mut field_types: BTreeMap<String, BTreeMap<String, usize>> = BTreeMap::new();
203    let mut field_values: BTreeMap<String, Vec<String>> = BTreeMap::new();
204    let mut field_count: BTreeMap<String, usize> = BTreeMap::new();
205    let total = records.len();
206
207    for record in records {
208        for (key, value) in &record.fields {
209            let type_name = value.type_name().to_string();
210            *field_types
211                .entry(key.clone())
212                .or_default()
213                .entry(type_name)
214                .or_insert(0) += 1;
215            *field_count.entry(key.clone()).or_insert(0) += 1;
216
217            if !matches!(value, Value::Null | Value::List(_) | Value::Map(_)) {
218                field_values
219                    .entry(key.clone())
220                    .or_default()
221                    .push(value.display_value());
222            }
223        }
224    }
225
226    let mut fields = BTreeMap::new();
227    let mut required = Vec::new();
228
229    for (key, types) in &field_types {
230        // Determine the dominant type
231        let dominant_type = types
232            .iter()
233            .filter(|(t, _)| *t != "null")
234            .max_by_key(|(_, count)| *count)
235            .map(|(t, _)| t.clone())
236            .unwrap_or_else(|| "string".to_string());
237
238        // Check if field is present in all records with non-null values
239        let non_null_count = types
240            .iter()
241            .filter(|(t, _)| *t != "null")
242            .map(|(_, c)| c)
243            .sum::<usize>();
244
245        if non_null_count == total && total > 0 {
246            required.push(key.clone());
247        }
248
249        // Infer enum if there are few unique values
250        let enum_values = if let Some(values) = field_values.get(key) {
251            let mut unique: Vec<String> = values.clone();
252            unique.sort();
253            unique.dedup();
254            if unique.len() <= 10 && unique.len() < values.len() / 2 {
255                unique
256                    .into_iter()
257                    .map(|v| {
258                        // Try to parse as integer
259                        if let Ok(n) = v.parse::<i64>() {
260                            Value::Integer(n)
261                        } else {
262                            Value::String(v)
263                        }
264                    })
265                    .collect()
266            } else {
267                vec![]
268            }
269        } else {
270            vec![]
271        };
272
273        fields.insert(
274            key.clone(),
275            FieldSchema {
276                field_type: dominant_type,
277                enum_values,
278                min: None,
279                max: None,
280                required: None,
281            },
282        );
283    }
284
285    CollectionSchema {
286        description: Some(format!("Auto-inferred schema for {}", folder_name)),
287        folder: folder_name.to_string(),
288        filter: vec![],
289        required,
290        fields,
291    }
292}
293
#[cfg(test)]
mod tests {
    use super::*;
    use crate::record::{Record, Value};
    use std::path::PathBuf;

    /// Build a record at a fixed dummy path from `(name, value)` pairs.
    fn make_record(fields: Vec<(&str, Value)>) -> Record {
        Record {
            path: PathBuf::from("/vault/notes/test.md"),
            fields: fields
                .into_iter()
                .map(|(name, value)| (name.to_string(), value))
                .collect(),
            raw_content: None,
        }
    }

    /// Field schema of the given type with no further constraints.
    fn plain_field(field_type: &str) -> FieldSchema {
        FieldSchema {
            field_type: field_type.into(),
            enum_values: Vec::new(),
            min: None,
            max: None,
            required: None,
        }
    }

    /// Collection schema for the "notes" folder with the given required
    /// names and field schemas.
    fn notes_schema(required: &[&str], fields: Vec<(&str, FieldSchema)>) -> CollectionSchema {
        CollectionSchema {
            description: None,
            folder: "notes".into(),
            filter: Vec::new(),
            required: required.iter().map(|name| (*name).to_string()).collect(),
            fields: fields
                .into_iter()
                .map(|(name, spec)| (name.to_string(), spec))
                .collect(),
        }
    }

    /// Validate a record built from `fields` against `schema` as "test.md".
    fn check(schema: &CollectionSchema, fields: Vec<(&str, Value)>) -> Vec<Violation> {
        let record = make_record(fields);
        validate_record("test.md", &record.fields, schema)
    }

    #[test]
    fn validate_required_field_missing() {
        let schema = notes_schema(&["status"], vec![]);

        let violations = check(&schema, vec![("tags", Value::String("x".into()))]);

        assert_eq!(violations.len(), 1);
        assert!(violations[0].message.contains("required"));
    }

    #[test]
    fn validate_type_mismatch() {
        let schema = notes_schema(&[], vec![("year", plain_field("integer"))]);

        let violations = check(
            &schema,
            vec![("year", Value::String("not a number".into()))],
        );

        assert_eq!(violations.len(), 1);
        assert!(violations[0].message.contains("type"));
    }

    #[test]
    fn validate_enum_violation() {
        let status = FieldSchema {
            enum_values: vec![
                Value::String("to-watch".into()),
                Value::String("watched".into()),
            ],
            ..plain_field("string")
        };
        let schema = notes_schema(&[], vec![("status", status)]);

        let violations = check(&schema, vec![("status", Value::String("invalid".into()))]);

        assert_eq!(violations.len(), 1);
        assert!(violations[0].message.contains("not in allowed"));
    }

    #[test]
    fn validate_min_max() {
        let rating = FieldSchema {
            min: Some(1.0),
            max: Some(10.0),
            ..plain_field("number")
        };
        let schema = notes_schema(&[], vec![("rating", rating)]);

        let violations = check(&schema, vec![("rating", Value::Integer(15))]);

        assert_eq!(violations.len(), 1);
        assert!(violations[0].message.contains("exceeds maximum"));
    }

    #[test]
    fn validate_passes_clean_record() {
        let status = FieldSchema {
            enum_values: vec![Value::String("to-watch".into())],
            ..plain_field("string")
        };
        let schema = notes_schema(&["status"], vec![("status", status)]);

        let violations = check(&schema, vec![("status", Value::String("to-watch".into()))]);

        assert!(violations.is_empty());
    }

    #[test]
    fn infer_schema_basic() {
        let records = vec![
            make_record(vec![
                ("status", Value::String("active".into())),
                ("year", Value::Integer(2020)),
            ]),
            make_record(vec![
                ("status", Value::String("draft".into())),
                ("year", Value::Integer(2021)),
            ]),
        ];

        let schema = infer_schema("notes", &records);

        assert_eq!(schema.fields.get("status").unwrap().field_type, "string");
        assert_eq!(schema.fields.get("year").unwrap().field_type, "integer");
        for name in ["status", "year"] {
            assert!(schema.required.contains(&name.to_string()));
        }
    }
}