Skip to main content

kardo_core/analysis/
integrity.rs

1//! Reference integrity checking: validate internal links and heading references.
2
3use regex::Regex;
4use serde::Serialize;
5use std::collections::{HashMap, HashSet};
6use std::path::Path;
7
/// A broken reference found during integrity analysis.
///
/// One value is produced per reference that could not be resolved; see
/// [`BrokenRefKind`] for the ways a reference can fail.
#[derive(Debug, Clone, Serialize)]
pub struct BrokenReference {
    /// File that contains the broken reference.
    pub source_file: String,
    /// The link target that could not be resolved.
    ///
    /// For file and heading failures this is the link exactly as it was
    /// supplied (including any `#fragment`); for directive failures it is the
    /// matched path with its leading `@` restored.
    pub target: String,
    /// Classification of how the reference is broken.
    pub kind: BrokenRefKind,
}
18
19#[derive(Debug, Clone, Serialize)]
20pub enum BrokenRefKind {
21    FileNotFound,
22    HeadingNotFound,
23    /// @filename directive reference (e.g. @AGENTS.md, @.claude/AGENT_FLOW_RULES.md)
24    DirectiveReference,
25}
26
/// Aggregate integrity result.
///
/// Produced by `IntegrityAnalyzer::analyze`; the counters cover both ordinary
/// internal links and `@file` directive references found among the links.
#[derive(Debug, Clone, Serialize)]
pub struct IntegrityResult {
    /// Ratio of valid references to total references (0.0-1.0).
    ///
    /// Defined as 1.0 when no references were checked at all.
    pub score: f64,
    /// Total number of internal references checked.
    ///
    /// External links (`http://`, `https://`, `mailto:`) are not counted.
    pub total_refs: usize,
    /// Number of references that resolved successfully.
    pub valid_refs: usize,
    /// List of references that could not be resolved.
    pub broken: Vec<BrokenReference>,
}
39
/// Namespace type for the reference-integrity checks.
///
/// Carries no state; the checks in this file are associated functions that
/// take all of their inputs as arguments.
pub struct IntegrityAnalyzer;
41
42impl IntegrityAnalyzer {
43    /// Check internal link integrity.
44    ///
45    /// `file_links`: map of (source_file_path → vec of link targets found in that file)
46    /// `known_files`: set of all known file relative paths in the project
47    /// `file_headings`: map of (file_path → vec of heading slugs in that file)
48    pub fn analyze(
49        file_links: &HashMap<String, Vec<String>>,
50        known_files: &HashSet<String>,
51        file_headings: &HashMap<String, Vec<String>>,
52    ) -> IntegrityResult {
53        let mut total_refs: usize = 0;
54        let mut valid_refs: usize = 0;
55        let mut broken = Vec::new();
56
57        for (source, links) in file_links {
58            let source_dir = Path::new(source)
59                .parent()
60                .map(|p| p.to_string_lossy().to_string())
61                .unwrap_or_default();
62
63            for link in links {
64                // Skip external links
65                if link.starts_with("http://") || link.starts_with("https://") || link.starts_with("mailto:") {
66                    continue;
67                }
68
69                total_refs += 1;
70
71                // Split link into file part and optional heading fragment
72                let (file_part, heading_part) = if let Some(idx) = link.find('#') {
73                    let file = &link[..idx];
74                    let heading = &link[idx + 1..];
75                    (file, if heading.is_empty() { None } else { Some(heading) })
76                } else {
77                    (link.as_str(), None)
78                };
79
80                // Resolve relative path
81                let resolved = if file_part.is_empty() {
82                    // Self-reference (#heading) — file is the source itself
83                    source.clone()
84                } else {
85                    Self::resolve_path(&source_dir, file_part)
86                };
87
88                // Check if file exists
89                if !file_part.is_empty() && !known_files.contains(&resolved) {
90                    broken.push(BrokenReference {
91                        source_file: source.clone(),
92                        target: link.clone(),
93                        kind: BrokenRefKind::FileNotFound,
94                    });
95                    continue;
96                }
97
98                // Check heading if present
99                if let Some(heading) = heading_part {
100                    let headings = file_headings.get(&resolved);
101                    let heading_exists = headings
102                        .map(|hs| hs.iter().any(|h| h == heading))
103                        .unwrap_or(false);
104
105                    if heading_exists {
106                        valid_refs += 1;
107                    } else {
108                        broken.push(BrokenReference {
109                            source_file: source.clone(),
110                            target: link.clone(),
111                            kind: BrokenRefKind::HeadingNotFound,
112                        });
113                    }
114                } else {
115                    valid_refs += 1;
116                }
117            }
118        }
119
120        // Additional pass: scan config files for @filename directive references
121        Self::check_directive_references(
122            file_links,
123            known_files,
124            &mut total_refs,
125            &mut valid_refs,
126            &mut broken,
127        );
128
129        let score = if total_refs == 0 {
130            1.0 // No references = no broken references = perfect
131        } else {
132            valid_refs as f64 / total_refs as f64
133        };
134
135        IntegrityResult {
136            score,
137            total_refs,
138            valid_refs,
139            broken,
140        }
141    }
142
143    /// Scan config file contents for @filename directive patterns and validate them.
144    ///
145    /// Recognizes patterns like: `@AGENTS.md`, `@.claude/AGENT_FLOW_RULES.md`,
146    /// `@instructions` (mapped to `.claude/instructions`)
147    fn check_directive_references(
148        file_links: &HashMap<String, Vec<String>>,
149        known_files: &HashSet<String>,
150        total_refs: &mut usize,
151        valid_refs: &mut usize,
152        broken: &mut Vec<BrokenReference>,
153    ) {
154        // We need the raw file contents to scan for @directives.
155        // file_links keys are source files that had links parsed — we use them
156        // to identify config files. For now, scan known config files.
157        // Note: This is called from analyze() which receives file_links.
158        // The caller should provide config file contents separately for full coverage.
159        // For backwards compatibility, we scan the link targets for @-prefixed entries.
160
161        // Pattern: @filepath (word boundary before @, filepath has at least one . or /)
162        let directive_re = Regex::new(r"@([a-zA-Z0-9_./-]+(?:\.[a-zA-Z]{1,6}|/[a-zA-Z0-9_.-]+))").unwrap();
163
164        for (source, links) in file_links {
165            // Look for @-prefixed references in the link targets
166            // These may have been extracted as raw text by the parser
167            for link in links {
168                // Skip external links
169                if link.starts_with("http://") || link.starts_with("https://") || link.starts_with("mailto:") {
170                    continue;
171                }
172                if let Some(cap) = directive_re.captures(link) {
173                    let target = cap.get(1).unwrap().as_str();
174                    *total_refs += 1;
175
176                    if known_files.contains(target) {
177                        *valid_refs += 1;
178                    } else {
179                        broken.push(BrokenReference {
180                            source_file: source.clone(),
181                            target: format!("@{}", target),
182                            kind: BrokenRefKind::DirectiveReference,
183                        });
184                    }
185                }
186            }
187        }
188    }
189
190    /// Analyze directive references found in config file contents.
191    ///
192    /// Call this with a map of config files (path → content) to validate
193    /// @filename directives like `@AGENTS.md` or `@.claude/instructions`.
194    pub fn analyze_directives(
195        config_contents: &HashMap<String, String>,
196        known_files: &HashSet<String>,
197    ) -> Vec<BrokenReference> {
198        let directive_re = Regex::new(r"@([a-zA-Z0-9_./-]+(?:\.[a-zA-Z]{1,6}|/[a-zA-Z0-9_.-]+))").unwrap();
199        let mut broken = Vec::new();
200
201        for (source, content) in config_contents {
202            for cap in directive_re.captures_iter(content) {
203                let target = cap.get(1).unwrap().as_str();
204                if !known_files.contains(target) {
205                    broken.push(BrokenReference {
206                        source_file: source.clone(),
207                        target: format!("@{}", target),
208                        kind: BrokenRefKind::DirectiveReference,
209                    });
210                }
211            }
212        }
213
214        broken
215    }
216
217    /// Resolve a relative path against a base directory.
218    fn resolve_path(base_dir: &str, relative: &str) -> String {
219        if relative.starts_with('/') {
220            // Absolute from project root
221            relative.trim_start_matches('/').to_string()
222        } else {
223            let base = Path::new(base_dir);
224            let resolved = base.join(relative);
225            // Normalize: remove `.` and `..` components
226            let mut parts: Vec<&str> = Vec::new();
227            for component in resolved.components() {
228                match component {
229                    std::path::Component::Normal(s) => {
230                        parts.push(s.to_str().unwrap_or(""));
231                    }
232                    std::path::Component::ParentDir => {
233                        parts.pop();
234                    }
235                    std::path::Component::CurDir => {}
236                    _ => {}
237                }
238            }
239            parts.join("/")
240        }
241    }
242}
243
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects string literals into the owned set the analyzer expects.
    fn make_known_files(files: &[&str]) -> HashSet<String> {
        let mut known = HashSet::with_capacity(files.len());
        for path in files {
            known.insert((*path).to_owned());
        }
        known
    }

    /// Builds a file → heading-slug map from borrowed fixtures.
    fn make_headings(entries: &[(&str, &[&str])]) -> HashMap<String, Vec<String>> {
        let mut by_file = HashMap::with_capacity(entries.len());
        for (file, slugs) in entries {
            let owned: Vec<String> = slugs.iter().map(|slug| (*slug).to_owned()).collect();
            by_file.insert((*file).to_owned(), owned);
        }
        by_file
    }

    /// Builds a link map holding a single source file and its link targets.
    fn links_from(source: &str, targets: &[&str]) -> HashMap<String, Vec<String>> {
        let owned: Vec<String> = targets.iter().map(|target| (*target).to_owned()).collect();
        let mut links = HashMap::new();
        links.insert(source.to_owned(), owned);
        links
    }

    /// Builds a config-content map holding a single file and its raw text.
    fn config_from(path: &str, content: &str) -> HashMap<String, String> {
        let mut config = HashMap::new();
        config.insert(path.to_owned(), content.to_owned());
        config
    }

    /// Heading map for tests that never reference a fragment.
    fn no_headings() -> HashMap<String, Vec<String>> {
        HashMap::new()
    }

    #[test]
    fn test_all_valid_links() {
        let links = links_from("docs/guide.md", &["../README.md", "./api.md"]);
        let files = make_known_files(&["README.md", "docs/guide.md", "docs/api.md"]);

        let result = IntegrityAnalyzer::analyze(&links, &files, &no_headings());

        assert_eq!(result.total_refs, 2);
        assert_eq!(result.valid_refs, 2);
        assert!((result.score - 1.0).abs() < 0.01);
        assert!(result.broken.is_empty());
    }

    #[test]
    fn test_broken_file_link() {
        let links = links_from("README.md", &["docs/missing.md"]);
        let files = make_known_files(&["README.md"]);

        let result = IntegrityAnalyzer::analyze(&links, &files, &no_headings());

        assert_eq!(result.total_refs, 1);
        assert_eq!(result.valid_refs, 0);
        assert!((result.score - 0.0).abs() < 0.01);
        assert_eq!(result.broken.len(), 1);
        assert!(matches!(result.broken[0].kind, BrokenRefKind::FileNotFound));
    }

    #[test]
    fn test_broken_heading_link() {
        let links = links_from("README.md", &["docs/api.md#nonexistent"]);
        let files = make_known_files(&["README.md", "docs/api.md"]);
        let headings = make_headings(&[("docs/api.md", &["getting-started", "usage"])]);

        let result = IntegrityAnalyzer::analyze(&links, &files, &headings);

        assert_eq!(result.total_refs, 1);
        assert_eq!(result.valid_refs, 0);
        assert_eq!(result.broken.len(), 1);
        assert!(matches!(result.broken[0].kind, BrokenRefKind::HeadingNotFound));
    }

    #[test]
    fn test_valid_heading_link() {
        let links = links_from("README.md", &["docs/api.md#usage"]);
        let files = make_known_files(&["README.md", "docs/api.md"]);
        let headings = make_headings(&[("docs/api.md", &["usage"])]);

        let result = IntegrityAnalyzer::analyze(&links, &files, &headings);

        assert_eq!(result.valid_refs, 1);
        assert!((result.score - 1.0).abs() < 0.01);
    }

    #[test]
    fn test_external_links_skipped() {
        let links = links_from(
            "README.md",
            &[
                "https://example.com",
                "http://example.com",
                "mailto:test@test.com",
            ],
        );
        let files = make_known_files(&["README.md"]);

        let result = IntegrityAnalyzer::analyze(&links, &files, &no_headings());

        // External links are not counted at all.
        assert_eq!(result.total_refs, 0);
        assert!((result.score - 1.0).abs() < 0.01);
    }

    #[test]
    fn test_no_links() {
        let links = HashMap::new();
        let files = make_known_files(&["README.md"]);

        let result = IntegrityAnalyzer::analyze(&links, &files, &no_headings());

        assert!((result.score - 1.0).abs() < 0.01);
    }

    #[test]
    fn test_at_reference_valid() {
        // @AGENTS.md and @.claude/instructions must both be found in known_files.
        let config = config_from("CLAUDE.md", "@AGENTS.md\n@.claude/instructions");
        let files = make_known_files(&["CLAUDE.md", "AGENTS.md", ".claude/instructions"]);

        let broken = IntegrityAnalyzer::analyze_directives(&config, &files);

        assert!(broken.is_empty(), "Valid @references should not be broken");
    }

    #[test]
    fn test_at_reference_broken() {
        let config = config_from("CLAUDE.md", "@AGENTS.md\n@.claude/missing.md");
        let files = make_known_files(&["CLAUDE.md", "AGENTS.md"]);

        let broken = IntegrityAnalyzer::analyze_directives(&config, &files);

        assert_eq!(broken.len(), 1);
        assert!(broken[0].target.contains("missing.md"));
        assert!(matches!(broken[0].kind, BrokenRefKind::DirectiveReference));
    }

    #[test]
    fn test_claude_dir_reference_validation() {
        let config = config_from(
            "CLAUDE.md",
            "Read @.claude/AGENT_FLOW_RULES.md for details",
        );

        // Target file present: nothing is reported.
        let files = make_known_files(&["CLAUDE.md", ".claude/AGENT_FLOW_RULES.md"]);
        let broken = IntegrityAnalyzer::analyze_directives(&config, &files);
        assert!(broken.is_empty());

        // Target file absent: the directive is reported once.
        let files_without = make_known_files(&["CLAUDE.md"]);
        let broken2 = IntegrityAnalyzer::analyze_directives(&config, &files_without);
        assert_eq!(broken2.len(), 1);
    }

    #[test]
    fn test_mixed_valid_and_broken() {
        // First target exists, second does not.
        let links = links_from("README.md", &["docs/api.md", "docs/missing.md"]);
        let files = make_known_files(&["README.md", "docs/api.md"]);

        let result = IntegrityAnalyzer::analyze(&links, &files, &no_headings());

        assert_eq!(result.total_refs, 2);
        assert_eq!(result.valid_refs, 1);
        assert!((result.score - 0.5).abs() < 0.01);
    }
}