Skip to main content

weave_content/
registry.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3
4use rayon::prelude::*;
5
6use crate::entity::{Entity, Label};
7use crate::parser::ParseError;
8
/// Upper bound on an entity file's stem (the filename with `.md` removed).
///
/// `validate_filename` compares this against `str::len`, so the limit is
/// measured in bytes.
const MAX_FILENAME_LEN: usize = 200;
11
12/// A loaded entity with its source file path.
13#[derive(Debug)]
14pub struct RegistryEntry {
15    pub entity: Entity,
16    pub path: PathBuf,
17    pub tags: Vec<String>,
18}
19
20/// Entity registry: holds all shared entities loaded from `people/` and
21/// `organizations/` directories. Provides name-based lookup for cross-file
22/// resolution.
23#[derive(Debug)]
24pub struct EntityRegistry {
25    entries: Vec<RegistryEntry>,
26    /// Name → index into `entries`. Names are case-sensitive.
27    name_index: HashMap<String, usize>,
28}
29
30impl EntityRegistry {
31    /// Build a registry from a content root directory.
32    ///
33    /// Scans `{root}/people/**/*.md` and `{root}/organizations/**/*.md`, parses
34    /// each file, validates for duplicates and filename mismatches. Supports both
35    /// flat and nested (country-based) directory layouts.
36    pub fn load(root: &Path) -> Result<Self, Vec<ParseError>> {
37        let mut entries = Vec::new();
38        let mut errors = Vec::new();
39
40        let actor_dir = root.join("people");
41        let institution_dir = root.join("organizations");
42
43        load_directory(&actor_dir, Label::Person, &mut entries, &mut errors);
44        load_directory(
45            &institution_dir,
46            Label::Organization,
47            &mut entries,
48            &mut errors,
49        );
50
51        // Build name index and detect duplicates
52        let name_index = build_name_index(&entries, &mut errors);
53
54        if errors.iter().any(|e| e.message.starts_with("duplicate")) {
55            return Err(errors);
56        }
57
58        // Filename mismatch warnings are non-fatal, report via errors but don't fail
59        // (caller can filter by message prefix if needed)
60
61        if errors.iter().any(|e| !e.message.starts_with("warning:")) {
62            return Err(errors);
63        }
64
65        // Warnings only -- attach them but succeed
66        if !errors.is_empty() {
67            for err in &errors {
68                eprintln!("{err}");
69            }
70        }
71
72        Ok(Self {
73            entries,
74            name_index,
75        })
76    }
77
78    /// Build a registry from pre-parsed entries.
79    pub fn from_entries(entries: Vec<RegistryEntry>) -> Result<Self, Vec<ParseError>> {
80        let mut errors = Vec::new();
81        let name_index = build_name_index(&entries, &mut errors);
82
83        let has_errors = errors.iter().any(|e| !e.message.starts_with("warning:"));
84        if has_errors {
85            return Err(errors);
86        }
87
88        Ok(Self {
89            entries,
90            name_index,
91        })
92    }
93
94    /// Look up an entity by name. Returns None if not found.
95    pub fn get_by_name(&self, name: &str) -> Option<&RegistryEntry> {
96        self.name_index.get(name).map(|&idx| &self.entries[idx])
97    }
98
99    /// Number of entities in the registry.
100    pub fn len(&self) -> usize {
101        self.entries.len()
102    }
103
104    /// Whether the registry is empty.
105    pub fn is_empty(&self) -> bool {
106        self.entries.is_empty()
107    }
108
109    /// All entity names in the registry.
110    pub fn names(&self) -> Vec<&str> {
111        self.entries
112            .iter()
113            .map(|e| e.entity.name.as_str())
114            .collect()
115    }
116
117    /// All registry entries.
118    pub fn entries(&self) -> &[RegistryEntry] {
119        &self.entries
120    }
121}
122
123/// Load all `.md` files from a directory tree, parsing each as an entity file.
124/// Supports both flat (`people/*.md`) and nested (`people/<country>/*.md`)
125/// layouts. Uses rayon to parse files in parallel.
126fn load_directory(
127    dir: &Path,
128    label: Label,
129    entries: &mut Vec<RegistryEntry>,
130    errors: &mut Vec<ParseError>,
131) {
132    let mut paths = Vec::new();
133    collect_md_files(dir, &mut paths, 0);
134
135    // Sort for deterministic ordering
136    paths.sort();
137
138    // Parse all files in parallel, collect results
139    let results: Vec<ParseResult> = paths
140        .par_iter()
141        .map(|path| parse_entity_file(path, label))
142        .collect();
143
144    // Merge results sequentially to preserve deterministic order
145    for result in results {
146        if let Some(entry) = result.entry {
147            entries.push(entry);
148        }
149        errors.extend(result.errors);
150    }
151}
152
/// Walk `dir` and append the path of every file with an `md` extension to
/// `paths`.
///
/// `depth` is the nesting level of `dir`; callers start at `0`. Recursion
/// stops once `depth` exceeds 2, so files directly in `dir` and in
/// directories up to two levels below it are collected, which covers the
/// `people/<country>/file.md` layout. Directories that cannot be listed and
/// entries that cannot be read are skipped silently. Within one directory,
/// entries are visited in file-name order.
fn collect_md_files(dir: &Path, paths: &mut Vec<PathBuf>, depth: usize) {
    const MAX_DEPTH: usize = 2;
    if depth > MAX_DEPTH {
        return;
    }

    let mut children: Vec<std::fs::DirEntry> = match std::fs::read_dir(dir) {
        Ok(listing) => listing.flatten().collect(),
        Err(_) => return,
    };
    children.sort_by_key(|child| child.file_name());

    for child in children {
        let child_path = child.path();
        if child_path.is_dir() {
            collect_md_files(&child_path, paths, depth + 1);
            continue;
        }

        // Only an extension that is exactly `md` counts.
        let extension = child_path.extension().and_then(|ext| ext.to_str());
        if matches!(extension, Some("md")) {
            paths.push(child_path);
        }
    }
}
177
178/// Result of parsing a single entity file.
179struct ParseResult {
180    entry: Option<RegistryEntry>,
181    errors: Vec<ParseError>,
182}
183
184/// Parse a single entity file, returning the entry and any errors/warnings.
185fn parse_entity_file(path: &Path, label: Label) -> ParseResult {
186    let content = match std::fs::read_to_string(path) {
187        Ok(c) => c,
188        Err(e) => {
189            return ParseResult {
190                entry: None,
191                errors: vec![ParseError {
192                    line: 0,
193                    message: format!("{}: error reading file: {e}", path.display()),
194                }],
195            };
196        }
197    };
198
199    let parsed = match crate::parser::parse_entity_file(&content) {
200        Ok(p) => p,
201        Err(parse_errors) => {
202            return ParseResult {
203                entry: None,
204                errors: parse_errors
205                    .into_iter()
206                    .map(|err| ParseError {
207                        line: err.line,
208                        message: format!("{}: {}", path.display(), err.message),
209                    })
210                    .collect(),
211            };
212        }
213    };
214
215    let mut field_errors = Vec::new();
216    let mut entity = crate::entity::parse_entity_file_body(
217        &parsed.name,
218        &parsed.body,
219        label,
220        parsed.id,
221        parsed.title_line,
222        &mut field_errors,
223    );
224    entity.tags = parsed.tags.clone();
225
226    let mut errors: Vec<ParseError> = field_errors
227        .into_iter()
228        .map(|err| ParseError {
229            line: err.line,
230            message: format!("{}: {}", path.display(), err.message),
231        })
232        .collect();
233
234    // Validate filename matches content
235    validate_filename(path, &entity, &mut errors);
236
237    ParseResult {
238        entry: Some(RegistryEntry {
239            entity,
240            path: path.to_path_buf(),
241            tags: parsed.tags,
242        }),
243        errors,
244    }
245}
246
247/// Build name → index map, detecting duplicate names.
248fn build_name_index(
249    entries: &[RegistryEntry],
250    errors: &mut Vec<ParseError>,
251) -> HashMap<String, usize> {
252    let mut index = HashMap::new();
253
254    for (i, entry) in entries.iter().enumerate() {
255        let name = &entry.entity.name;
256        if let Some(&existing_idx) = index.get(name.as_str()) {
257            let existing: &RegistryEntry = &entries[existing_idx];
258            errors.push(ParseError {
259                line: entry.entity.line,
260                message: format!(
261                    "duplicate entity name {name:?} in {} (first defined in {})",
262                    entry.path.display(),
263                    existing.path.display(),
264                ),
265            });
266        } else {
267            index.insert(name.clone(), i);
268        }
269    }
270
271    index
272}
273
274/// Warn if entity filename doesn't match content.
275/// Expected: `<name>--<qualifier>.md` in kebab-case.
276fn validate_filename(path: &Path, entity: &Entity, errors: &mut Vec<ParseError>) {
277    let Some(stem) = path.file_stem().and_then(|s| s.to_str()) else {
278        return;
279    };
280
281    if stem.len() > MAX_FILENAME_LEN {
282        errors.push(ParseError {
283            line: 0,
284            message: format!(
285                "warning: {}: filename stem exceeds {MAX_FILENAME_LEN} chars",
286                path.display()
287            ),
288        });
289    }
290
291    let expected_name = to_kebab_case(&entity.name);
292    let qualifier = entity
293        .fields
294        .iter()
295        .find(|(k, _)| k == "qualifier")
296        .and_then(|(_, v)| match v {
297            crate::entity::FieldValue::Single(s) => Some(s.as_str()),
298            crate::entity::FieldValue::List(_) => None,
299        });
300
301    let expected_stem = match qualifier {
302        Some(q) => format!("{expected_name}--{}", to_kebab_case(q)),
303        None => expected_name,
304    };
305
306    if stem != expected_stem {
307        errors.push(ParseError {
308            line: 0,
309            message: format!(
310                "warning: {}: filename {stem:?} doesn't match expected {expected_stem:?}",
311                path.display()
312            ),
313        });
314    }
315}
316
/// Convert a display name to kebab-case for filename comparison.
///
/// Alphanumeric characters (as defined by [`char::is_alphanumeric`]) are kept
/// and lowercased; every run of other characters becomes a single `-`, and
/// no `-` is produced at the start or end of the result.
///
/// Lowercasing uses [`char::to_lowercase`], so it applies to non-ASCII
/// letters as well: `"Émile Zola"` becomes `"émile-zola"`. For ASCII input
/// the result is the same as plain ASCII lowercasing.
fn to_kebab_case(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    // Set when at least one separator character was seen since the last
    // kept character; the `-` is only written once another kept character
    // follows, which drops leading/trailing separators and collapses runs.
    let mut pending_separator = false;

    for c in s.chars() {
        if c.is_alphanumeric() {
            if pending_separator && !out.is_empty() {
                out.push('-');
            }
            pending_separator = false;
            // `to_lowercase` may expand to more than one character.
            out.extend(c.to_lowercase());
        } else {
            pending_separator = true;
        }
    }

    out
}
333
#[cfg(test)]
mod tests {
    use super::*;
    use crate::entity::{Entity, FieldValue, Label};

    /// Entity on line 1 with no id and no tags.
    fn make_entity(name: &str, label: Label, fields: Vec<(String, FieldValue)>) -> Entity {
        Entity {
            name: name.to_string(),
            label,
            fields,
            id: None,
            line: 1,
            tags: Vec::new(),
        }
    }

    /// Registry entry for a field-less entity stored at `path`.
    fn make_entry(name: &str, label: Label, path: &str) -> RegistryEntry {
        RegistryEntry {
            entity: make_entity(name, label, Vec::new()),
            path: PathBuf::from(path),
            tags: Vec::new(),
        }
    }

    #[test]
    fn registry_from_entries_lookup() {
        let alice = make_entry("Alice", Label::Person, "people/alice.md");
        let corp = make_entry("Corp Inc", Label::Organization, "organizations/corp-inc.md");

        let registry = EntityRegistry::from_entries(vec![alice, corp]).unwrap();

        assert_eq!(registry.len(), 2);
        assert!(registry.get_by_name("Alice").is_some());
        assert!(registry.get_by_name("Corp Inc").is_some());
        assert!(registry.get_by_name("Bob").is_none());
    }

    #[test]
    fn registry_detects_duplicate_names() {
        let first = make_entry("Alice", Label::Person, "people/alice-a.md");
        let second = make_entry("Alice", Label::Person, "people/alice-b.md");

        let errors = EntityRegistry::from_entries(vec![first, second]).unwrap_err();

        assert!(errors.iter().any(|e| e.message.contains("duplicate")));
    }

    #[test]
    fn registry_names_list() {
        let alice = make_entry("Alice", Label::Person, "people/alice.md");
        let bob = make_entry("Bob", Label::Person, "people/bob.md");

        let registry = EntityRegistry::from_entries(vec![alice, bob]).unwrap();
        let names = registry.names();

        assert!(names.contains(&"Alice"));
        assert!(names.contains(&"Bob"));
    }

    #[test]
    fn to_kebab_case_conversion() {
        assert_eq!(to_kebab_case("Mark Bonnick"), "mark-bonnick");
        assert_eq!(to_kebab_case("Arsenal FC"), "arsenal-fc");
        assert_eq!(
            to_kebab_case("English Football Club"),
            "english-football-club"
        );
        assert_eq!(to_kebab_case("Bob"), "bob");
    }

    #[test]
    fn validate_filename_matching() {
        let qualifier = FieldValue::Single("Arsenal Kit Manager".to_string());
        let entity = make_entity(
            "Mark Bonnick",
            Label::Person,
            vec![("qualifier".to_string(), qualifier)],
        );
        let mut errors = Vec::new();

        // A name that follows `<name>--<qualifier>` produces nothing.
        let matching = Path::new("people/mark-bonnick--arsenal-kit-manager.md");
        validate_filename(matching, &entity, &mut errors);
        assert!(errors.is_empty(), "errors: {errors:?}");

        // Any other name produces a warning.
        let mismatched = Path::new("people/wrong-name.md");
        validate_filename(mismatched, &entity, &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("warning:")));
    }

    #[test]
    fn validate_filename_no_qualifier() {
        let entity = make_entity("Bob", Label::Person, Vec::new());
        let mut errors = Vec::new();

        validate_filename(Path::new("people/bob.md"), &entity, &mut errors);

        assert!(errors.is_empty(), "errors: {errors:?}");
    }

    #[test]
    fn empty_registry() {
        let registry = EntityRegistry::from_entries(Vec::new()).unwrap();

        assert!(registry.is_empty());
        assert_eq!(registry.len(), 0);
        assert!(registry.get_by_name("anything").is_none());
    }
}