Skip to main content

llmwiki_tooling/
config.rs

1use std::collections::HashSet;
2use std::path::Path;
3
4use serde::Deserialize;
5
6use crate::error::ConfigError;
7
/// How seriously a finding from a check or rule is treated.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum Severity {
    /// The finding is reported and makes the exit code non-zero (the default).
    #[default]
    Error,
    /// The finding is printed, but the exit code is left untouched.
    Warn,
    /// The finding is suppressed entirely.
    Off,
}
19
20impl std::fmt::Display for Severity {
21    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
22        match self {
23            Self::Error => f.write_str("error"),
24            Self::Warn => f.write_str("warn"),
25            Self::Off => f.write_str("off"),
26        }
27    }
28}
29
30impl<'de> Deserialize<'de> for Severity {
31    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
32        let s = String::deserialize(deserializer)?;
33        match s.as_str() {
34            "error" => Ok(Self::Error),
35            "warn" => Ok(Self::Warn),
36            "off" => Ok(Self::Off),
37            other => Err(serde::de::Error::unknown_variant(
38                other,
39                &["error", "warn", "off"],
40            )),
41        }
42    }
43}
44
45/// Complete wiki configuration, parsed from `wiki.toml` or auto-detected.
46#[derive(Debug, Clone)]
47pub struct WikiConfig {
48    /// Path to the index file relative to wiki root. `None` means no index file.
49    pub index: Option<String>,
50    /// Content directories, sorted most-specific-first for resolution.
51    pub directories: Vec<DirectoryConfig>,
52    /// Linking behavior settings.
53    pub linking: LinkingConfig,
54    /// Wiki-wide structural check severities.
55    pub checks: ChecksConfig,
56    /// Parameterized rules scoped to directories.
57    pub rules: Vec<RuleConfig>,
58}
59
/// One directory that holds wiki pages.
#[derive(Clone, Debug)]
pub struct DirectoryConfig {
    /// Location of the directory, relative to the wiki root.
    pub path: String,
    /// `true` when pages in this directory feed bare mention detection.
    pub autolink: bool,
}
68
/// Linking behavior that applies to the whole wiki.
#[derive(Clone, Debug)]
pub struct LinkingConfig {
    /// Names of pages that must never be auto-linked.
    pub exclude: HashSet<String>,
    /// Name of the frontmatter field a page uses to opt out of auto-linking.
    pub autolink_field: String,
}
77
78/// Wiki-wide structural check severities.
79#[derive(Debug, Clone)]
80pub struct ChecksConfig {
81    pub broken_links: Severity,
82    pub orphan_pages: Severity,
83    pub index_coverage: Severity,
84}
85
86/// A parameterized rule scoped to specific directories.
87#[derive(Debug, Clone)]
88pub enum RuleConfig {
89    RequiredSections {
90        dirs: Vec<String>,
91        sections: Vec<String>,
92        severity: Severity,
93    },
94    RequiredFrontmatter {
95        dirs: Vec<String>,
96        fields: Vec<String>,
97        severity: Severity,
98    },
99    MirrorParity {
100        left: String,
101        right: String,
102        severity: Severity,
103    },
104    CitationPattern {
105        name: String,
106        dirs: Vec<String>,
107        pattern: String,
108        match_in: String,
109        match_mode: MatchMode,
110        severity: Severity,
111    },
112}
113
114impl RuleConfig {
115    pub fn severity(&self) -> Severity {
116        match self {
117            Self::RequiredSections { severity, .. }
118            | Self::RequiredFrontmatter { severity, .. }
119            | Self::MirrorParity { severity, .. }
120            | Self::CitationPattern { severity, .. } => *severity,
121        }
122    }
123}
124
/// Strategy for verifying a citation pattern match against the `match_in` pages.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum MatchMode {
    /// Look for the captured ID string inside the contents of page files (the default).
    #[default]
    Content,
    /// Look for a page whose filename is the captured ID.
    Filename,
}
134
135impl<'de> Deserialize<'de> for MatchMode {
136    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
137        let s = String::deserialize(deserializer)?;
138        match s.as_str() {
139            "content" => Ok(Self::Content),
140            "filename" => Ok(Self::Filename),
141            other => Err(serde::de::Error::unknown_variant(
142                other,
143                &["content", "filename"],
144            )),
145        }
146    }
147}
148
149// --- TOML deserialization types ---
150
151#[derive(Deserialize)]
152struct RawConfig {
153    index: Option<String>,
154    #[serde(default)]
155    directories: Vec<RawDirectoryConfig>,
156    #[serde(default)]
157    linking: RawLinkingConfig,
158    #[serde(default)]
159    checks: RawChecksConfig,
160    #[serde(default)]
161    rules: Vec<RawRuleConfig>,
162}
163
164#[derive(Deserialize)]
165struct RawDirectoryConfig {
166    path: String,
167    #[serde(default = "default_true")]
168    autolink: bool,
169}
170
/// Serde default provider for boolean fields that should be `true` when omitted.
fn default_true() -> bool {
    true
}
174
175#[derive(Deserialize, Default)]
176struct RawLinkingConfig {
177    #[serde(default)]
178    exclude: Vec<String>,
179    #[serde(default = "default_autolink_field")]
180    autolink_field: String,
181}
182
/// Default name of the frontmatter field used for per-page auto-link opt-out.
fn default_autolink_field() -> String {
    String::from("autolink")
}
186
187#[derive(Deserialize, Default)]
188struct RawChecksConfig {
189    #[serde(default)]
190    broken_links: Option<Severity>,
191    #[serde(default)]
192    orphan_pages: Option<Severity>,
193    #[serde(default)]
194    index_coverage: Option<Severity>,
195}
196
197#[derive(Deserialize)]
198#[serde(tag = "check")]
199enum RawRuleConfig {
200    #[serde(rename = "required-sections")]
201    RequiredSections {
202        dirs: Vec<String>,
203        sections: Vec<String>,
204        #[serde(default)]
205        severity: Option<Severity>,
206    },
207    #[serde(rename = "required-frontmatter")]
208    RequiredFrontmatter {
209        dirs: Vec<String>,
210        fields: Vec<String>,
211        #[serde(default)]
212        severity: Option<Severity>,
213    },
214    #[serde(rename = "mirror-parity")]
215    MirrorParity {
216        left: String,
217        right: String,
218        #[serde(default)]
219        severity: Option<Severity>,
220    },
221    #[serde(rename = "citation-pattern")]
222    CitationPattern {
223        name: String,
224        dirs: Vec<String>,
225        #[serde(default)]
226        pattern: Option<String>,
227        #[serde(default)]
228        preset: Option<String>,
229        match_in: String,
230        #[serde(default)]
231        match_mode: Option<MatchMode>,
232        #[serde(default)]
233        severity: Option<Severity>,
234    },
235}
236
237/// Built-in citation pattern presets.
238fn resolve_preset(name: &str) -> Result<(String, MatchMode), ConfigError> {
239    match name {
240        "bold-method-year" => Ok((
241            r"\*\*(?P<id>[A-Za-z][A-Za-z0-9-]+)\*\*\s*\([^)]*\d{4}[^)]*\)".to_owned(),
242            MatchMode::Filename,
243        )),
244        other => Err(ConfigError::UnknownPreset(other.to_owned())),
245    }
246}
247
248impl WikiConfig {
249    /// Load config from `wiki.toml` in the given root directory.
250    /// Returns `None` if `wiki.toml` doesn't exist.
251    pub fn load(root: &Path) -> Result<Option<Self>, ConfigError> {
252        let config_path = root.join("wiki.toml");
253        let content = match std::fs::read_to_string(&config_path) {
254            Ok(content) => content,
255            Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None),
256            Err(e) => {
257                return Err(ConfigError::Read {
258                    path: config_path,
259                    source: e,
260                });
261            }
262        };
263        let raw: RawConfig = toml::from_str(&content).map_err(|e| ConfigError::Parse {
264            path: config_path,
265            source: e,
266        })?;
267        Self::from_raw(raw).map(Some)
268    }
269
270    /// Auto-detect config when no `wiki.toml` exists.
271    pub fn auto_detect(root: &Path) -> Self {
272        let has_wiki_dir = root.join("wiki").is_dir();
273        let dir_path = if has_wiki_dir { "wiki" } else { "." };
274
275        Self {
276            index: Some("index.md".to_owned()),
277            directories: vec![DirectoryConfig {
278                path: dir_path.to_owned(),
279                autolink: true,
280            }],
281            linking: LinkingConfig {
282                exclude: HashSet::new(),
283                autolink_field: default_autolink_field(),
284            },
285            checks: ChecksConfig {
286                broken_links: Severity::Error,
287                orphan_pages: Severity::Error,
288                index_coverage: Severity::Error,
289            },
290            rules: Vec::new(),
291        }
292    }
293
294    /// Load config from wiki.toml if present, otherwise auto-detect.
295    pub fn load_or_detect(root: &Path) -> Result<Self, ConfigError> {
296        match Self::load(root)? {
297            Some(config) => Ok(config),
298            None => Ok(Self::auto_detect(root)),
299        }
300    }
301
302    fn from_raw(raw: RawConfig) -> Result<Self, ConfigError> {
303        let mut directories: Vec<DirectoryConfig> = if raw.directories.is_empty() {
304            // No directories declared — auto-detect
305            vec![DirectoryConfig {
306                path: "wiki".to_owned(),
307                autolink: true,
308            }]
309        } else {
310            raw.directories
311                .into_iter()
312                .map(|d| DirectoryConfig {
313                    path: normalize_path(&d.path),
314                    autolink: d.autolink,
315                })
316                .collect()
317        };
318
319        // Sort most-specific first (longest path) for resolution
320        directories.sort_by(|a, b| b.path.len().cmp(&a.path.len()));
321
322        let linking = LinkingConfig {
323            exclude: raw.linking.exclude.into_iter().collect(),
324            autolink_field: raw.linking.autolink_field,
325        };
326
327        let checks = ChecksConfig {
328            broken_links: raw.checks.broken_links.unwrap_or(Severity::Error),
329            orphan_pages: raw.checks.orphan_pages.unwrap_or(Severity::Error),
330            index_coverage: raw.checks.index_coverage.unwrap_or(Severity::Error),
331        };
332
333        let mut rules = Vec::new();
334        for raw_rule in raw.rules {
335            rules.push(convert_rule(raw_rule)?);
336        }
337
338        // Validate citation patterns compile as regex
339        for rule in &rules {
340            if let RuleConfig::CitationPattern { pattern, name, .. } = rule {
341                regex_lite::Regex::new(pattern).map_err(|e| ConfigError::InvalidPattern {
342                    name: name.clone(),
343                    source: e,
344                })?;
345            }
346        }
347
348        Ok(Self {
349            index: match raw.index {
350                Some(s) if s.is_empty() => None,
351                Some(s) => Some(s),
352                None => Some("index.md".to_owned()),
353            },
354            directories,
355            linking,
356            checks,
357            rules,
358        })
359    }
360
361    /// Get the directory config that applies to a given relative path.
362    /// Returns the most-specific matching directory (longest prefix match).
363    pub fn directory_for(&self, rel_path: &Path) -> Option<&DirectoryConfig> {
364        let rel_str = rel_path.to_str()?;
365        // Directories are sorted most-specific first
366        self.directories
367            .iter()
368            .find(|d| rel_str.starts_with(&d.path) || d.path == ".")
369    }
370
371    /// Check if a page at the given relative path should be auto-linked.
372    pub fn is_autolink_dir(&self, rel_path: &Path) -> bool {
373        self.directory_for(rel_path)
374            .map(|d| d.autolink)
375            .unwrap_or(false)
376    }
377
378    /// Check if a relative path matches a directory prefix from a rule's `dirs` list.
379    pub fn matches_dirs(rel_path: &Path, dirs: &[String]) -> bool {
380        let Some(rel_str) = rel_path.to_str() else {
381            return false;
382        };
383        dirs.iter().any(|d| rel_str.starts_with(d.as_str()))
384    }
385
386    /// All mirror-parity rules' `right` paths (non-wiki directories used for parity checks).
387    pub fn mirror_paths(&self) -> Vec<(&str, &str)> {
388        self.rules
389            .iter()
390            .filter_map(|r| match r {
391                RuleConfig::MirrorParity { left, right, .. } => {
392                    Some((left.as_str(), right.as_str()))
393                }
394                _ => None,
395            })
396            .collect()
397    }
398}
399
400fn convert_rule(raw: RawRuleConfig) -> Result<RuleConfig, ConfigError> {
401    match raw {
402        RawRuleConfig::RequiredSections {
403            dirs,
404            sections,
405            severity,
406        } => Ok(RuleConfig::RequiredSections {
407            dirs: dirs.into_iter().map(|d| normalize_path(&d)).collect(),
408            sections,
409            severity: severity.unwrap_or(Severity::Error),
410        }),
411        RawRuleConfig::RequiredFrontmatter {
412            dirs,
413            fields,
414            severity,
415        } => Ok(RuleConfig::RequiredFrontmatter {
416            dirs: dirs.into_iter().map(|d| normalize_path(&d)).collect(),
417            fields,
418            severity: severity.unwrap_or(Severity::Error),
419        }),
420        RawRuleConfig::MirrorParity {
421            left,
422            right,
423            severity,
424        } => Ok(RuleConfig::MirrorParity {
425            left: normalize_path(&left),
426            right: normalize_path(&right),
427            severity: severity.unwrap_or(Severity::Error),
428        }),
429        RawRuleConfig::CitationPattern {
430            name,
431            dirs,
432            pattern,
433            preset,
434            match_in,
435            match_mode,
436            severity,
437        } => {
438            let (resolved_pattern, resolved_mode) = match (pattern, preset) {
439                (Some(p), None) => (p, match_mode.unwrap_or(MatchMode::Content)),
440                (None, Some(preset_name)) => {
441                    let (p, m) = resolve_preset(&preset_name)?;
442                    (p, match_mode.unwrap_or(m))
443                }
444                (Some(_), Some(_)) => {
445                    return Err(ConfigError::Validation(format!(
446                        "citation-pattern '{name}': cannot specify both 'pattern' and 'preset'"
447                    )));
448                }
449                (None, None) => {
450                    return Err(ConfigError::Validation(format!(
451                        "citation-pattern '{name}': must specify either 'pattern' or 'preset'"
452                    )));
453                }
454            };
455            Ok(RuleConfig::CitationPattern {
456                name,
457                dirs: dirs.into_iter().map(|d| normalize_path(&d)).collect(),
458                pattern: resolved_pattern,
459                match_in: normalize_path(&match_in),
460                match_mode: resolved_mode,
461                severity: severity.unwrap_or(Severity::Warn),
462            })
463        }
464    }
465}
466
/// Returns `path` with every trailing `/` removed, so prefix comparisons see one
/// consistent spelling of each directory.
fn normalize_path(path: &str) -> String {
    let mut trimmed = path;
    while let Some(shorter) = trimmed.strip_suffix('/') {
        trimmed = shorter;
    }
    String::from(trimmed)
}
471
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `source` as TOML and converts it, panicking if either step fails.
    fn config_from(source: &str) -> WikiConfig {
        let raw: RawConfig = toml::from_str(source).unwrap();
        WikiConfig::from_raw(raw).unwrap()
    }

    /// Parses `source` as TOML and returns the message of the expected conversion error.
    fn conversion_error(source: &str) -> String {
        let raw: RawConfig = toml::from_str(source).unwrap();
        WikiConfig::from_raw(raw).unwrap_err().to_string()
    }

    /// Check severities that are all `Error`, for configs built by hand.
    fn all_error_checks() -> ChecksConfig {
        ChecksConfig {
            broken_links: Severity::Error,
            orphan_pages: Severity::Error,
            index_coverage: Severity::Error,
        }
    }

    #[test]
    fn parses_minimal_config() {
        let config = config_from(
            r#"
[[directories]]
path = "wiki"
"#,
        );

        assert_eq!(config.directories.len(), 1);
        let only = &config.directories[0];
        assert_eq!(only.path, "wiki");
        assert!(only.autolink);
        assert_eq!(config.checks.broken_links, Severity::Error);
    }

    #[test]
    fn parses_full_config() {
        let config = config_from(
            r#"
index = "contents.md"

[[directories]]
path = "wiki"

[[directories]]
path = "wiki/papers"
autolink = false

[linking]
exclude = ["the", "a"]
autolink_field = "auto"

[checks]
broken_links = "error"
orphan_pages = "warn"
index_coverage = "off"

[[rules]]
check = "required-sections"
dirs = ["wiki/concepts"]
sections = ["See also"]
severity = "error"

[[rules]]
check = "mirror-parity"
left = "wiki/papers"
right = "raw/papers"
severity = "warn"

[[rules]]
check = "citation-pattern"
name = "arxiv"
dirs = ["wiki"]
pattern = 'arxiv\.org/abs/(?P<id>\d{4}\.\d{4,5})'
match_in = "wiki/papers"
severity = "warn"

[[rules]]
check = "citation-pattern"
name = "bold-method"
preset = "bold-method-year"
dirs = ["wiki"]
match_in = "wiki/papers"
severity = "warn"
"#,
        );

        assert_eq!(config.index.as_deref(), Some("contents.md"));
        assert!(config.linking.exclude.contains("the"));
        assert_eq!(config.linking.autolink_field, "auto");
        assert_eq!(config.checks.orphan_pages, Severity::Warn);
        assert_eq!(config.checks.index_coverage, Severity::Off);
        assert_eq!(config.rules.len(), 4);

        // The longer, more specific directory must come first.
        let (first, second) = (&config.directories[0], &config.directories[1]);
        assert_eq!(first.path, "wiki/papers");
        assert!(!first.autolink);
        assert_eq!(second.path, "wiki");
        assert!(second.autolink);
    }

    #[test]
    fn directory_resolution_most_specific_wins() {
        let directory = |path: &str, autolink: bool| DirectoryConfig {
            path: path.to_owned(),
            autolink,
        };
        let config = WikiConfig {
            index: None,
            directories: vec![directory("wiki/papers", false), directory("wiki", true)],
            linking: LinkingConfig {
                exclude: HashSet::new(),
                autolink_field: "autolink".to_owned(),
            },
            checks: all_error_checks(),
            rules: Vec::new(),
        };

        assert!(config.is_autolink_dir(Path::new("wiki/concepts/GRPO.md")));
        assert!(!config.is_autolink_dir(Path::new("wiki/papers/deepseek.md")));
    }

    #[test]
    fn auto_detect_with_wiki_dir() {
        let root = tempfile::tempdir().unwrap();
        std::fs::create_dir(root.path().join("wiki")).unwrap();
        std::fs::write(root.path().join("index.md"), "# Index").unwrap();

        let config = WikiConfig::auto_detect(root.path());
        assert_eq!(config.directories[0].path, "wiki");
        assert_eq!(config.index.as_deref(), Some("index.md"));
    }

    #[test]
    fn auto_detect_flat_wiki() {
        let root = tempfile::tempdir().unwrap();
        std::fs::write(root.path().join("index.md"), "# Index").unwrap();

        let config = WikiConfig::auto_detect(root.path());
        assert_eq!(config.directories[0].path, ".");
    }

    #[test]
    fn rejects_pattern_and_preset_together() {
        let message = conversion_error(
            r#"
[[rules]]
check = "citation-pattern"
name = "test"
dirs = ["wiki"]
pattern = "foo"
preset = "bold-method-year"
match_in = "wiki"
"#,
        );
        assert!(message.contains("cannot specify both"));
    }

    #[test]
    fn rejects_unknown_preset() {
        let message = conversion_error(
            r#"
[[rules]]
check = "citation-pattern"
name = "test"
dirs = ["wiki"]
preset = "nonexistent"
match_in = "wiki"
"#,
        );
        assert!(message.contains("nonexistent"));
    }

    #[test]
    fn matches_dirs_prefix() {
        let page = Path::new("wiki/concepts/GRPO.md");
        assert!(WikiConfig::matches_dirs(page, &["wiki/concepts".to_owned()]));
        assert!(WikiConfig::matches_dirs(page, &["wiki".to_owned()]));
        assert!(!WikiConfig::matches_dirs(
            Path::new("wiki/papers/foo.md"),
            &["wiki/concepts".to_owned()]
        ));
    }
}