Skip to main content

weave_content/
registry.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3
4use rayon::prelude::*;
5
6use crate::entity::{Entity, Label};
7use crate::parser::ParseError;
8
/// Upper bound on an entity filename stem (the file name without `.md`).
///
/// `validate_filename` compares the stem's `str::len` (its length in bytes)
/// against this value and emits a warning when the stem is longer.
const MAX_FILENAME_LEN: usize = 200;
11
12/// A loaded entity with its source file path.
13#[derive(Debug)]
14pub struct RegistryEntry {
15    pub entity: Entity,
16    pub path: PathBuf,
17    pub tags: Vec<String>,
18}
19
20/// Entity registry: holds all shared entities loaded from `people/` and
21/// `organizations/` directories. Provides name-based lookup for cross-file
22/// resolution.
23#[derive(Debug)]
24pub struct EntityRegistry {
25    entries: Vec<RegistryEntry>,
26    /// Name → index into `entries`. Names are case-sensitive.
27    name_index: HashMap<String, usize>,
28    /// Content root directory for computing file-path slugs.
29    content_root: Option<PathBuf>,
30}
31
32impl EntityRegistry {
33    /// Build a registry from a content root directory.
34    ///
35    /// Scans `{root}/people/**/*.md` and `{root}/organizations/**/*.md`, parses
36    /// each file, validates for duplicates and filename mismatches. Supports both
37    /// flat and nested (country-based) directory layouts.
38    pub fn load(root: &Path) -> Result<Self, Vec<ParseError>> {
39        let mut entries = Vec::new();
40        let mut errors = Vec::new();
41
42        let actor_dir = root.join("people");
43        let institution_dir = root.join("organizations");
44
45        load_directory(&actor_dir, Label::Person, &mut entries, &mut errors);
46        load_directory(
47            &institution_dir,
48            Label::Organization,
49            &mut entries,
50            &mut errors,
51        );
52
53        // Build name index and detect duplicates
54        let name_index = build_name_index(&entries, &mut errors);
55
56        if errors.iter().any(|e| e.message.starts_with("duplicate")) {
57            return Err(errors);
58        }
59
60        // Filename mismatch warnings are non-fatal, report via errors but don't fail
61        // (caller can filter by message prefix if needed)
62
63        if errors.iter().any(|e| !e.message.starts_with("warning:")) {
64            return Err(errors);
65        }
66
67        // Warnings only -- attach them but succeed
68        if !errors.is_empty() {
69            for err in &errors {
70                eprintln!("{err}");
71            }
72        }
73
74        Ok(Self {
75            entries,
76            name_index,
77            content_root: Some(root.to_path_buf()),
78        })
79    }
80
81    /// Build a registry from pre-parsed entries.
82    pub fn from_entries(entries: Vec<RegistryEntry>) -> Result<Self, Vec<ParseError>> {
83        let mut errors = Vec::new();
84        let name_index = build_name_index(&entries, &mut errors);
85
86        let has_errors = errors.iter().any(|e| !e.message.starts_with("warning:"));
87        if has_errors {
88            return Err(errors);
89        }
90
91        Ok(Self {
92            entries,
93            name_index,
94            content_root: None,
95        })
96    }
97
98    /// Look up an entity by name. Returns None if not found.
99    pub fn get_by_name(&self, name: &str) -> Option<&RegistryEntry> {
100        self.name_index.get(name).map(|&idx| &self.entries[idx])
101    }
102
103    /// Number of entities in the registry.
104    pub fn len(&self) -> usize {
105        self.entries.len()
106    }
107
108    /// Whether the registry is empty.
109    pub fn is_empty(&self) -> bool {
110        self.entries.is_empty()
111    }
112
113    /// All entity names in the registry.
114    pub fn names(&self) -> Vec<&str> {
115        self.entries
116            .iter()
117            .map(|e| e.entity.name.as_str())
118            .collect()
119    }
120
121    /// All registry entries.
122    pub fn entries(&self) -> &[RegistryEntry] {
123        &self.entries
124    }
125
126    /// Compute the file-path slug for an entry (path relative to content root, minus `.md`).
127    /// Returns `None` if content root is not set.
128    pub fn slug_for(&self, entry: &RegistryEntry) -> Option<String> {
129        let root = self.content_root.as_ref()?;
130        path_to_slug(&entry.path, root)
131    }
132
133    /// Content root directory, if set.
134    pub fn content_root(&self) -> Option<&Path> {
135        self.content_root.as_deref()
136    }
137
138    /// Check all entity filenames against expected naming convention.
139    /// Returns warning messages for mismatches (same checks as `load()`,
140    /// but accessible after loading for strict validation).
141    pub fn check_filenames(&self) -> Vec<ParseError> {
142        let mut warnings = Vec::new();
143        for entry in &self.entries {
144            validate_filename(&entry.path, &entry.entity, &mut warnings);
145        }
146        warnings
147    }
148}
149
/// Compute the file-path slug of `path` relative to `content_root`.
///
/// The slug is the relative path with a trailing `.md` removed and with `/`
/// as the separator on every platform, e.g. `people/id/harvey-moeis`.
///
/// Returns `None` when `path` is not located under `content_root` or when the
/// relative path is not valid UTF-8.
pub fn path_to_slug(path: &Path, content_root: &Path) -> Option<String> {
    let relative = path.strip_prefix(content_root).ok()?.to_str()?;
    let without_ext = relative.strip_suffix(".md").unwrap_or(relative);
    // `Path` keeps the platform's native separator (`\` on Windows). A slug
    // must look the same everywhere, so normalise it to `/`. Where the
    // native separator already is `/` this leaves the text unchanged.
    Some(without_ext.replace(std::path::MAIN_SEPARATOR, "/"))
}
157
158/// Load all `.md` files from a directory tree, parsing each as an entity file.
159/// Supports both flat (`people/*.md`) and nested (`people/<country>/*.md`)
160/// layouts. Uses rayon to parse files in parallel.
161fn load_directory(
162    dir: &Path,
163    label: Label,
164    entries: &mut Vec<RegistryEntry>,
165    errors: &mut Vec<ParseError>,
166) {
167    let mut paths = Vec::new();
168    collect_md_files(dir, &mut paths, 0);
169
170    // Sort for deterministic ordering
171    paths.sort();
172
173    // Parse all files in parallel, collect results
174    let results: Vec<ParseResult> = paths
175        .par_iter()
176        .map(|path| parse_entity_file(path, label))
177        .collect();
178
179    // Merge results sequentially to preserve deterministic order
180    for result in results {
181        if let Some(entry) = result.entry {
182            entries.push(entry);
183        }
184        errors.extend(result.errors);
185    }
186}
187
/// Walk `dir` and append the path of every `.md` file found to `paths`.
///
/// `depth` is the nesting level of `dir` itself (callers start at 0).
/// Directories are scanned while `depth` is at most 2, which covers the
/// `people/<country>/file.md` layout; anything nested deeper is ignored.
/// Directories that cannot be read are skipped silently, and entries of each
/// directory are visited in file-name order.
fn collect_md_files(dir: &Path, paths: &mut Vec<PathBuf>, depth: usize) {
    const MAX_DEPTH: usize = 2;

    if depth > MAX_DEPTH {
        return;
    }

    let mut children: Vec<std::fs::DirEntry> = match std::fs::read_dir(dir) {
        // Individual entries that fail to load are dropped.
        Ok(listing) => listing.filter_map(Result::ok).collect(),
        // A missing or unreadable directory simply contributes nothing.
        Err(_) => return,
    };
    children.sort_by_cached_key(|child| child.file_name());

    for child in &children {
        let child_path = child.path();
        if child_path.is_dir() {
            collect_md_files(&child_path, paths, depth + 1);
            continue;
        }
        let is_markdown = child_path.extension().map_or(false, |ext| ext == "md");
        if is_markdown {
            paths.push(child_path);
        }
    }
}
212
213/// Result of parsing a single entity file.
214struct ParseResult {
215    entry: Option<RegistryEntry>,
216    errors: Vec<ParseError>,
217}
218
219/// Parse a single entity file, returning the entry and any errors/warnings.
220fn parse_entity_file(path: &Path, label: Label) -> ParseResult {
221    let content = match std::fs::read_to_string(path) {
222        Ok(c) => c,
223        Err(e) => {
224            return ParseResult {
225                entry: None,
226                errors: vec![ParseError {
227                    line: 0,
228                    message: format!("{}: error reading file: {e}", path.display()),
229                }],
230            };
231        }
232    };
233
234    let parsed = match crate::parser::parse_entity_file(&content) {
235        Ok(p) => p,
236        Err(parse_errors) => {
237            return ParseResult {
238                entry: None,
239                errors: parse_errors
240                    .into_iter()
241                    .map(|err| ParseError {
242                        line: err.line,
243                        message: format!("{}: {}", path.display(), err.message),
244                    })
245                    .collect(),
246            };
247        }
248    };
249
250    let mut field_errors = Vec::new();
251    let mut entity = crate::entity::parse_entity_file_body(
252        &parsed.name,
253        &parsed.body,
254        label,
255        parsed.id,
256        parsed.title_line,
257        &mut field_errors,
258    );
259    entity.tags.clone_from(&parsed.tags);
260
261    let mut errors: Vec<ParseError> = field_errors
262        .into_iter()
263        .map(|err| ParseError {
264            line: err.line,
265            message: format!("{}: {}", path.display(), err.message),
266        })
267        .collect();
268
269    // Validate filename matches content
270    validate_filename(path, &entity, &mut errors);
271
272    ParseResult {
273        entry: Some(RegistryEntry {
274            entity,
275            path: path.to_path_buf(),
276            tags: parsed.tags,
277        }),
278        errors,
279    }
280}
281
282/// Build name → index map, detecting duplicate names.
283fn build_name_index(
284    entries: &[RegistryEntry],
285    errors: &mut Vec<ParseError>,
286) -> HashMap<String, usize> {
287    let mut index = HashMap::new();
288
289    for (i, entry) in entries.iter().enumerate() {
290        let name = &entry.entity.name;
291        if let Some(&existing_idx) = index.get(name.as_str()) {
292            let existing: &RegistryEntry = &entries[existing_idx];
293            errors.push(ParseError {
294                line: entry.entity.line,
295                message: format!(
296                    "duplicate entity name {name:?} in {} (first defined in {})",
297                    entry.path.display(),
298                    existing.path.display(),
299                ),
300            });
301        } else {
302            index.insert(name.clone(), i);
303        }
304    }
305
306    index
307}
308
309/// Warn if entity filename doesn't match content.
310/// Expected: `<name>--<qualifier>.md` in kebab-case.
311fn validate_filename(path: &Path, entity: &Entity, errors: &mut Vec<ParseError>) {
312    let Some(stem) = path.file_stem().and_then(|s| s.to_str()) else {
313        return;
314    };
315
316    if stem.len() > MAX_FILENAME_LEN {
317        errors.push(ParseError {
318            line: 0,
319            message: format!(
320                "warning: {}: filename stem exceeds {MAX_FILENAME_LEN} chars",
321                path.display()
322            ),
323        });
324    }
325
326    let expected_name = to_kebab_case(&entity.name);
327    let qualifier = entity
328        .fields
329        .iter()
330        .find(|(k, _)| k == "qualifier")
331        .and_then(|(_, v)| match v {
332            crate::entity::FieldValue::Single(s) => Some(s.as_str()),
333            crate::entity::FieldValue::List(_) => None,
334        });
335
336    let expected_stem = match qualifier {
337        Some(q) => format!("{expected_name}--{}", to_kebab_case(q)),
338        None => expected_name,
339    };
340
341    if stem != expected_stem {
342        errors.push(ParseError {
343            line: 0,
344            message: format!(
345                "warning: {}: filename {stem:?} doesn't match expected {expected_stem:?}",
346                path.display()
347            ),
348        });
349    }
350}
351
/// Turn a display name into the kebab-case form used for filename checks.
///
/// Alphanumeric characters are kept (ASCII letters are lowercased, other
/// letters keep their case); every run of other characters becomes a single
/// `-`, and no `-` is produced at the start or the end.
fn to_kebab_case(s: &str) -> String {
    let mut kebab = String::with_capacity(s.len());
    // Set while we are inside a run of separator characters.
    let mut separator_pending = false;

    for ch in s.chars() {
        if !ch.is_alphanumeric() {
            separator_pending = true;
            continue;
        }
        // Emit one dash per separator run, but never as the first character.
        if separator_pending && !kebab.is_empty() {
            kebab.push('-');
        }
        separator_pending = false;
        kebab.push(ch.to_ascii_lowercase());
    }

    kebab
}
368
#[cfg(test)]
mod tests {
    use super::*;
    use crate::entity::{Entity, FieldValue, Label};

    /// Build an entity with the given name, label and fields; every other
    /// attribute is left empty.
    fn make_entity(name: &str, label: Label, fields: Vec<(String, FieldValue)>) -> Entity {
        Entity {
            name: name.to_string(),
            label,
            fields,
            id: None,
            line: 1,
            tags: Vec::new(),
            slug: None,
        }
    }

    /// Build a field-less registry entry located at `path`.
    fn make_entry(name: &str, label: Label, path: &str) -> RegistryEntry {
        RegistryEntry {
            entity: make_entity(name, label, Vec::new()),
            path: PathBuf::from(path),
            tags: Vec::new(),
        }
    }

    #[test]
    fn registry_from_entries_lookup() {
        let registry = EntityRegistry::from_entries(vec![
            make_entry("Alice", Label::Person, "people/alice.md"),
            make_entry("Corp Inc", Label::Organization, "organizations/corp-inc.md"),
        ])
        .unwrap();

        assert_eq!(registry.len(), 2);
        for known in ["Alice", "Corp Inc"] {
            assert!(registry.get_by_name(known).is_some());
        }
        assert!(registry.get_by_name("Bob").is_none());
    }

    #[test]
    fn registry_detects_duplicate_names() {
        // Same name, two different files.
        let clashing = vec![
            make_entry("Alice", Label::Person, "people/alice-a.md"),
            make_entry("Alice", Label::Person, "people/alice-b.md"),
        ];

        let errors = EntityRegistry::from_entries(clashing).unwrap_err();
        assert!(errors.iter().any(|e| e.message.contains("duplicate")));
    }

    #[test]
    fn registry_names_list() {
        let registry = EntityRegistry::from_entries(vec![
            make_entry("Alice", Label::Person, "people/alice.md"),
            make_entry("Bob", Label::Person, "people/bob.md"),
        ])
        .unwrap();

        let names = registry.names();
        assert!(names.contains(&"Alice"));
        assert!(names.contains(&"Bob"));
    }

    #[test]
    fn to_kebab_case_conversion() {
        let cases = [
            ("Mark Bonnick", "mark-bonnick"),
            ("Arsenal FC", "arsenal-fc"),
            ("English Football Club", "english-football-club"),
            ("Bob", "bob"),
        ];
        for (input, expected) in cases {
            assert_eq!(to_kebab_case(input), expected);
        }
    }

    #[test]
    fn validate_filename_matching() {
        let qualifier = (
            "qualifier".to_string(),
            FieldValue::Single("Arsenal Kit Manager".to_string()),
        );
        let entity = make_entity("Mark Bonnick", Label::Person, vec![qualifier]);

        let mut errors = Vec::new();

        // A file named after both the name and the qualifier is accepted.
        validate_filename(
            Path::new("people/mark-bonnick--arsenal-kit-manager.md"),
            &entity,
            &mut errors,
        );
        assert!(errors.is_empty(), "errors: {errors:?}");

        // Any other stem produces a warning.
        validate_filename(Path::new("people/wrong-name.md"), &entity, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("warning:")));
    }

    #[test]
    fn validate_filename_no_qualifier() {
        let entity = make_entity("Bob", Label::Person, Vec::new());

        let mut errors = Vec::new();
        validate_filename(Path::new("people/bob.md"), &entity, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");
    }

    #[test]
    fn empty_registry() {
        let registry = EntityRegistry::from_entries(Vec::new()).unwrap();
        assert!(registry.is_empty());
        assert_eq!(registry.len(), 0);
        assert!(registry.get_by_name("anything").is_none());
    }
}
493}