Skip to main content

llm_wiki/
type_registry.rs

1use std::collections::{BTreeMap, HashMap};
2use std::path::Path;
3
4use anyhow::{Result, bail};
5use jsonschema::Validator;
6use serde_yaml::Value;
7use sha2::{Digest, Sha256};
8
9use crate::config;
10use crate::default_schemas;
11
12/// A compiled type entry in the registry.
13pub struct RegisteredType {
14    pub(crate) schema_path: String,
15    pub(crate) description: String,
16    pub(crate) validator: Validator,
17    pub(crate) aliases: HashMap<String, String>,
18    pub(crate) required_fields: Vec<String>,
19    pub(crate) content_hash: String,
20    pub(crate) edges: Vec<EdgeDecl>,
21}
22
/// A graph edge declaration from `x-graph-edges` in a type schema.
///
/// Each entry of the `x-graph-edges` object becomes one `EdgeDecl`; the
/// object key is stored in [`EdgeDecl::field`]. The type is plain data, so it
/// derives equality in addition to `Debug`/`Clone`, which lets declarations
/// be compared directly (for example in assertions or when de-duplicating).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EdgeDecl {
    /// Frontmatter field name containing the target slugs.
    pub field: String,
    /// Relation label for edges produced from this declaration.
    pub relation: String,
    /// Direction of the edge: `"outgoing"` or `"incoming"`.
    pub direction: String,
    /// Page types that are valid targets (empty = any type).
    pub target_types: Vec<String>,
}
35
36/// Per-wiki type registry — discovers types from `schemas/*.json` via
37/// `x-wiki-types`, with optional `[types.*]` overrides from `wiki.toml`.
38pub struct SpaceTypeRegistry {
39    types: HashMap<String, RegisteredType>,
40    schema_hash: String,
41    type_hashes: HashMap<String, String>,
42}
43
44impl std::fmt::Debug for SpaceTypeRegistry {
45    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
46        f.debug_struct("SpaceTypeRegistry")
47            .field("types", &self.types.keys().collect::<Vec<_>>())
48            .field("schema_hash", &self.schema_hash)
49            .finish()
50    }
51}
52
53impl SpaceTypeRegistry {
54    /// Build from a wiki repository root. Scans `schemas/*.json`, merges
55    /// `wiki.toml` overrides.
56    pub fn build(repo_root: &Path) -> Result<Self> {
57        let schemas_dir = repo_root.join("schemas");
58        let mut types = HashMap::new();
59
60        if schemas_dir.is_dir() {
61            discover_from_dir(&schemas_dir, &mut types)?;
62        } else {
63            discover_from_embedded(&mut types)?;
64        }
65
66        // Apply wiki.toml overrides
67        let wiki_cfg = config::load_wiki(repo_root)?;
68        for (type_name, entry) in &wiki_cfg.types {
69            let schema_path = repo_root.join(&entry.schema);
70            let content = std::fs::read_to_string(&schema_path)?;
71            let registered = compile_schema(&entry.schema, &entry.description, &content)?;
72            types.insert(type_name.clone(), registered);
73        }
74
75        // Enforce base schema invariant
76        if !types.contains_key("default") {
77            // Inject embedded base.json as fallback
78            let schemas = default_schemas::default_schemas();
79            let base = schemas["base.json"];
80            let registered =
81                compile_schema("schemas/base.json", "Fallback for unrecognized types", base)?;
82            types.insert("default".to_string(), registered);
83        } else {
84            validate_base_invariant(&types["default"])?;
85        }
86
87        let (schema_hash, type_hashes) = compute_hashes(&types);
88
89        Ok(Self {
90            types,
91            schema_hash,
92            type_hashes,
93        })
94    }
95
96    /// Build from embedded default schemas only (no disk access).
97    /// Used when no wiki is mounted or for backward compatibility.
98    pub fn from_embedded() -> Self {
99        let mut types = HashMap::new();
100        discover_from_embedded(&mut types).expect("embedded schemas are valid");
101        let (schema_hash, type_hashes) = compute_hashes(&types);
102        Self {
103            types,
104            schema_hash,
105            type_hashes,
106        }
107    }
108
109    /// Build from pre-constructed parts (used by space_builder).
110    pub(crate) fn from_parts(
111        types: HashMap<String, RegisteredType>,
112        schema_hash: String,
113        type_hashes: HashMap<String, String>,
114    ) -> Self {
115        Self {
116            types,
117            schema_hash,
118            type_hashes,
119        }
120    }
121
122    /// Return true if `type_name` is registered in this registry.
123    pub fn is_known(&self, type_name: &str) -> bool {
124        self.types.contains_key(type_name)
125    }
126
127    /// List all registered type names with descriptions.
128    pub fn list_types(&self) -> Vec<(&str, &str)> {
129        let mut out: Vec<_> = self
130            .types
131            .iter()
132            .map(|(name, rt)| (name.as_str(), rt.description.as_str()))
133            .collect();
134        out.sort_by_key(|(name, _)| *name);
135        out
136    }
137
138    /// Get the aliases for a type (source field → canonical field).
139    pub fn aliases(&self, type_name: &str) -> Option<&HashMap<String, String>> {
140        self.types.get(type_name).map(|rt| &rt.aliases)
141    }
142
143    /// Get the schema file path for a type (relative to repo root).
144    pub fn schema_path(&self, type_name: &str) -> Option<&str> {
145        self.types.get(type_name).map(|rt| rt.schema_path.as_str())
146    }
147
148    /// Return the global SHA-256 hash of all registered type schemas.
149    pub fn schema_hash(&self) -> &str {
150        &self.schema_hash
151    }
152
153    /// Return the per-type content hash map (type name → SHA-256 hash).
154    pub fn type_hashes(&self) -> &HashMap<String, String> {
155        &self.type_hashes
156    }
157
158    /// Get required field names for a type.
159    pub fn required_fields(&self, type_name: &str) -> Vec<String> {
160        self.types
161            .get(type_name)
162            .map(|rt| rt.required_fields.clone())
163            .unwrap_or_default()
164    }
165
166    /// Get edge declarations for a type.
167    pub fn edges(&self, type_name: &str) -> &[EdgeDecl] {
168        self.types
169            .get(type_name)
170            .map(|rt| rt.edges.as_slice())
171            .unwrap_or(&[])
172    }
173
174    /// Validate frontmatter against the type's JSON Schema.
175    ///
176    /// - Resolves the page type (falls back to "default")
177    /// - Validates against the compiled schema
178    /// - In loose mode, unknown types produce warnings
179    /// - In strict mode, unknown types produce errors
180    ///
181    /// Returns a list of warnings. Bails on hard errors.
182    pub fn validate(&self, fm: &BTreeMap<String, Value>, strictness: &str) -> Result<Vec<String>> {
183        let mut warnings = Vec::new();
184
185        // title is always required — hard error regardless of strictness
186        let has_title = fm
187            .get("title")
188            .and_then(|v| v.as_str())
189            .map(|s| !s.is_empty())
190            .unwrap_or(false);
191        // For skill pages, check "name" as alias for title
192        let has_name = fm
193            .get("name")
194            .and_then(|v| v.as_str())
195            .map(|s| !s.is_empty())
196            .unwrap_or(false);
197        if !has_title && !has_name {
198            bail!("title is required");
199        }
200
201        let page_type = fm.get("type").and_then(|v| v.as_str()).unwrap_or("");
202
203        // Determine which registered type to use
204        let resolved_type = if page_type.is_empty() {
205            warnings.push("missing field: type (defaulting to \"page\")".into());
206            "default"
207        } else if self.types.contains_key(page_type) {
208            page_type
209        } else {
210            if strictness == "strict" {
211                bail!("unknown type '{page_type}'");
212            }
213            warnings.push(format!("unknown type '{page_type}'"));
214            "default"
215        };
216
217        if let Some(rt) = self.types.get(resolved_type) {
218            let json_fm = yaml_fm_to_json(fm)?;
219            let errors: Vec<_> = rt.validator.iter_errors(&json_fm).collect();
220            if !errors.is_empty() {
221                if strictness == "strict" {
222                    bail!("schema validation failed: {}", errors[0]);
223                }
224                for e in &errors {
225                    warnings.push(format!("schema validation: {e}"));
226                }
227            }
228        }
229
230        Ok(warnings)
231    }
232}
233
234impl Default for SpaceTypeRegistry {
235    fn default() -> Self {
236        Self::from_embedded()
237    }
238}
239
240// ── Discovery ─────────────────────────────────────────────────────────────────
241
242fn discover_from_dir(
243    schemas_dir: &Path,
244    types: &mut HashMap<String, RegisteredType>,
245) -> Result<()> {
246    let mut entries: Vec<_> = std::fs::read_dir(schemas_dir)?
247        .filter_map(|e| e.ok())
248        .filter(|e| e.path().extension().and_then(|ext| ext.to_str()) == Some("json"))
249        .collect();
250    entries.sort_by_key(|e| e.file_name());
251
252    for entry in entries {
253        let path = entry.path();
254        let filename = path.file_name().unwrap().to_string_lossy();
255        let content = std::fs::read_to_string(&path)?;
256        let schema_value: serde_json::Value = serde_json::from_str(&content)?;
257
258        let schema_rel = format!("schemas/{filename}");
259        let content_hash = sha256_hex(content.as_bytes());
260
261        if let Some(wiki_types) = schema_value.get("x-wiki-types").and_then(|v| v.as_object()) {
262            let aliases = extract_aliases(&schema_value);
263            let required_fields = extract_required(&schema_value);
264            let edges = extract_edges(&schema_value);
265
266            for (type_name, desc) in wiki_types {
267                let description = desc.as_str().unwrap_or("").to_string();
268                let validator = Validator::new(&schema_value)
269                    .map_err(|e| anyhow::anyhow!("invalid schema {filename}: {e}"))?;
270                types.insert(
271                    type_name.clone(),
272                    RegisteredType {
273                        schema_path: schema_rel.clone(),
274                        description,
275                        validator,
276                        aliases: aliases.clone(),
277                        required_fields: required_fields.clone(),
278                        content_hash: content_hash.clone(),
279                        edges: edges.clone(),
280                    },
281                );
282            }
283        }
284    }
285
286    Ok(())
287}
288
289fn discover_from_embedded(types: &mut HashMap<String, RegisteredType>) -> Result<()> {
290    for entry in default_schemas::default_type_entries() {
291        let filename = entry
292            .schema_file
293            .strip_prefix("schemas/")
294            .unwrap_or(&entry.schema_file);
295        let schemas = default_schemas::default_schemas();
296        let content = schemas
297            .get(filename)
298            .ok_or_else(|| anyhow::anyhow!("embedded schema not found: {filename}"))?;
299        let registered = compile_schema(&entry.schema_file, &entry.description, content)?;
300        types.insert(entry.type_name, registered);
301    }
302    Ok(())
303}
304
305pub(crate) fn compile_schema(
306    schema_path: &str,
307    description: &str,
308    content: &str,
309) -> Result<RegisteredType> {
310    let content_hash = sha256_hex(content.as_bytes());
311    let schema_value: serde_json::Value = serde_json::from_str(content)?;
312    compile_schema_from_value(schema_path, description, &schema_value, &content_hash)
313}
314
315pub(crate) fn compile_schema_from_value(
316    schema_path: &str,
317    description: &str,
318    schema_value: &serde_json::Value,
319    content_hash: &str,
320) -> Result<RegisteredType> {
321    let validator = Validator::new(schema_value)
322        .map_err(|e| anyhow::anyhow!("invalid schema {schema_path}: {e}"))?;
323    let aliases = extract_aliases(schema_value);
324    let required_fields = extract_required(schema_value);
325    let edges = extract_edges(schema_value);
326
327    Ok(RegisteredType {
328        schema_path: schema_path.to_string(),
329        description: description.to_string(),
330        validator,
331        aliases,
332        required_fields,
333        content_hash: content_hash.to_string(),
334        edges,
335    })
336}
337
338pub(crate) fn extract_aliases(schema: &serde_json::Value) -> HashMap<String, String> {
339    schema
340        .get("x-index-aliases")
341        .and_then(|v| v.as_object())
342        .map(|obj| {
343            obj.iter()
344                .filter_map(|(k, v)| v.as_str().map(|s| (k.clone(), s.to_string())))
345                .collect()
346        })
347        .unwrap_or_default()
348}
349
350pub(crate) fn extract_required(schema: &serde_json::Value) -> Vec<String> {
351    schema
352        .get("required")
353        .and_then(|v| v.as_array())
354        .map(|arr| {
355            arr.iter()
356                .filter_map(|v| v.as_str().map(|s| s.to_string()))
357                .collect()
358        })
359        .unwrap_or_default()
360}
361
362pub(crate) fn extract_edges(schema: &serde_json::Value) -> Vec<EdgeDecl> {
363    schema
364        .get("x-graph-edges")
365        .and_then(|v| v.as_object())
366        .map(|obj| {
367            obj.iter()
368                .map(|(field, decl)| {
369                    let relation = decl
370                        .get("relation")
371                        .and_then(|v| v.as_str())
372                        .unwrap_or("links-to")
373                        .to_string();
374                    let direction = decl
375                        .get("direction")
376                        .and_then(|v| v.as_str())
377                        .unwrap_or("outgoing")
378                        .to_string();
379                    let target_types = decl
380                        .get("target_types")
381                        .and_then(|v| v.as_array())
382                        .map(|arr| {
383                            arr.iter()
384                                .filter_map(|v| v.as_str().map(|s| s.to_string()))
385                                .collect()
386                        })
387                        .unwrap_or_default();
388                    EdgeDecl {
389                        field: field.clone(),
390                        relation,
391                        direction,
392                        target_types,
393                    }
394                })
395                .collect()
396        })
397        .unwrap_or_default()
398}
399
400/// Validate that a custom default type requires at least `title` and `type`.
401pub(crate) fn validate_base_invariant(rt: &RegisteredType) -> Result<()> {
402    if !rt.required_fields.contains(&"title".to_string()) {
403        bail!(
404            "base schema '{}' must require 'title' — \
405             the default type is the fallback for all unknown types",
406            rt.schema_path
407        );
408    }
409    if !rt.required_fields.contains(&"type".to_string()) {
410        bail!(
411            "base schema '{}' must require 'type' — \
412             the default type is the fallback for all unknown types",
413            rt.schema_path
414        );
415    }
416    Ok(())
417}
418
419// ── Hashing ───────────────────────────────────────────────────────────────────
420
421/// SHA-256 of bytes, returned as lowercase hex (64 chars).
422pub(crate) fn sha256_hex(data: &[u8]) -> String {
423    hex::encode(Sha256::digest(data))
424}
425
426pub(crate) fn compute_hashes(
427    types: &HashMap<String, RegisteredType>,
428) -> (String, HashMap<String, String>) {
429    let entries: HashMap<String, (String, HashMap<String, String>, String)> = types
430        .iter()
431        .map(|(name, rt)| {
432            (
433                name.clone(),
434                (
435                    rt.schema_path.clone(),
436                    rt.aliases.clone(),
437                    rt.content_hash.clone(),
438                ),
439            )
440        })
441        .collect();
442    hash_type_entries(&entries)
443}
444
445/// Shared hashing core for compute_hashes and compute_disk_hashes.
446/// Per-type hash = SHA-256(schema_path + sorted_aliases + content_hash).
447/// Global hash = SHA-256(all per-type hashes sorted by name).
448fn hash_type_entries(
449    entries: &HashMap<String, (String, HashMap<String, String>, String)>,
450) -> (String, HashMap<String, String>) {
451    let sorted: BTreeMap<_, _> = entries.iter().collect();
452    let mut type_hashes = HashMap::new();
453    let mut global_hasher = Sha256::new();
454
455    for (name, (schema_path, aliases, content_hash)) in &sorted {
456        let mut h = Sha256::new();
457        h.update(schema_path.as_bytes());
458        let sorted_aliases: BTreeMap<_, _> = aliases.iter().collect();
459        for (k, v) in &sorted_aliases {
460            h.update(k.as_bytes());
461            h.update(v.as_bytes());
462        }
463        h.update(content_hash.as_bytes());
464        let type_hash = hex::encode(h.finalize());
465        type_hashes.insert(name.to_string(), type_hash.clone());
466        global_hasher.update(type_hash.as_bytes());
467    }
468
469    (hex::encode(global_hasher.finalize()), type_hashes)
470}
471
472/// Compute schema hashes directly from disk without building a full registry.
473/// Returns (global_hash, per_type_hashes).
474///
475/// Algorithm:
476/// 1. Scan `schemas/*.json` (sorted) — compute content hash per file
477/// 2. Read `x-wiki-types` to map type_name → content_hash
478/// 3. Apply `wiki.toml` `[types.*]` overrides
479/// 4. Per-type hash = SHA-256(schema_path + sorted_aliases + content_hash)
480/// 5. Global hash = SHA-256(all per-type hashes sorted by name)
481///
482/// Falls back to embedded schemas if `schemas/` dir is absent.
483pub fn compute_disk_hashes(repo_root: &Path) -> Result<(String, HashMap<String, String>)> {
484    let schemas_dir = repo_root.join("schemas");
485
486    // Collect (type_name -> (schema_path, aliases, content_hash))
487    let mut type_data: HashMap<String, (String, HashMap<String, String>, String)> = HashMap::new();
488
489    if schemas_dir.is_dir() {
490        let mut entries: Vec<_> = std::fs::read_dir(&schemas_dir)?
491            .filter_map(|e| e.ok())
492            .filter(|e| e.path().extension().and_then(|ext| ext.to_str()) == Some("json"))
493            .collect();
494        entries.sort_by_key(|e| e.file_name());
495
496        for entry in entries {
497            let path = entry.path();
498            let filename = path.file_name().unwrap().to_string_lossy().to_string();
499            let content = std::fs::read_to_string(&path)?;
500            let content_hash = sha256_hex(content.as_bytes());
501            let schema_rel = format!("schemas/{filename}");
502            let schema_value: serde_json::Value = serde_json::from_str(&content)?;
503
504            if let Some(wiki_types) = schema_value.get("x-wiki-types").and_then(|v| v.as_object()) {
505                let aliases = extract_aliases(&schema_value);
506                for (type_name, _) in wiki_types {
507                    type_data.insert(
508                        type_name.clone(),
509                        (schema_rel.clone(), aliases.clone(), content_hash.clone()),
510                    );
511                }
512            }
513        }
514    } else {
515        // Embedded fallback
516        for (filename, content) in default_schemas::default_schemas() {
517            let content_hash = sha256_hex(content.as_bytes());
518            let schema_rel = format!("schemas/{filename}");
519            let schema_value: serde_json::Value = serde_json::from_str(content)?;
520
521            if let Some(wiki_types) = schema_value.get("x-wiki-types").and_then(|v| v.as_object()) {
522                let aliases = extract_aliases(&schema_value);
523                for (type_name, _) in wiki_types {
524                    type_data.insert(
525                        type_name.clone(),
526                        (schema_rel.clone(), aliases.clone(), content_hash.clone()),
527                    );
528                }
529            }
530        }
531    }
532
533    // Apply wiki.toml overrides
534    let wiki_cfg = config::load_wiki(repo_root)?;
535    for (type_name, entry) in &wiki_cfg.types {
536        let schema_path = repo_root.join(&entry.schema);
537        let content = std::fs::read_to_string(&schema_path)?;
538        let content_hash = sha256_hex(content.as_bytes());
539        let schema_value: serde_json::Value = serde_json::from_str(&content)?;
540        let aliases = extract_aliases(&schema_value);
541        type_data.insert(
542            type_name.clone(),
543            (entry.schema.clone(), aliases, content_hash),
544        );
545    }
546
547    // Ensure default type exists (same logic as build)
548    if !type_data.contains_key("default") {
549        let schemas = default_schemas::default_schemas();
550        let base = schemas["base.json"];
551        let content_hash = sha256_hex(base.as_bytes());
552        let schema_value: serde_json::Value = serde_json::from_str(base)?;
553        let aliases = extract_aliases(&schema_value);
554        type_data.insert(
555            "default".to_string(),
556            ("schemas/base.json".to_string(), aliases, content_hash),
557        );
558    }
559
560    // Compute per-type and global hashes
561    Ok(hash_type_entries(&type_data))
562}
563
564// ── Helpers ───────────────────────────────────────────────────────────────────
565
566fn yaml_fm_to_json(fm: &BTreeMap<String, Value>) -> Result<serde_json::Value> {
567    // Round-trip through serde: yaml::Value → String → json::Value
568    let yaml_str = serde_yaml::to_string(fm)?;
569    let json: serde_json::Value = serde_yaml::from_str(&yaml_str)?;
570    Ok(json)
571}