Skip to main content

normalize_native_rules/
check_refs.rs

1//! Check documentation references for broken links
2
3use normalize_output::OutputFormatter;
4use normalize_output::diagnostics::{DiagnosticsReport, Issue, Severity};
5use serde::Serialize;
6use std::path::Path;
7
/// Lazily compiled pattern for backtick-quoted code references.
///
/// Populated on first use by `build_check_refs_report` via `get_or_init`,
/// so the regex is compiled at most once per process.
static CODE_REF_RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
9
/// A broken reference found in documentation
#[derive(Debug, Clone, Serialize, schemars::JsonSchema)]
struct BrokenRef {
    // Markdown file containing the reference; relative to the project root
    // when the root prefix could be stripped, otherwise the path as walked.
    file: String,
    // 1-based line number of the reference within `file`.
    line: usize,
    // The text captured between the backticks (without the backticks).
    reference: String,
    // The full source line, trimmed of surrounding whitespace.
    context: String,
}
18
/// Report produced by the broken-ref native rule check.
#[derive(Debug, Serialize, schemars::JsonSchema)]
pub struct CheckRefsReport {
    // Every reference that resolved neither to an existing file nor to an
    // indexed symbol, in the order encountered.
    broken_refs: Vec<BrokenRef>,
    // Number of markdown files covered by the check.
    files_checked: usize,
    // Number of distinct symbol names loaded from the index.
    symbols_indexed: usize,
}
26
27impl OutputFormatter for CheckRefsReport {
28    fn format_text(&self) -> String {
29        let mut lines = Vec::new();
30        lines.push("Documentation Reference Check".to_string());
31        lines.push(String::new());
32        lines.push(format!("Files checked: {}", self.files_checked));
33        lines.push(format!("Symbols indexed: {}", self.symbols_indexed));
34        lines.push(String::new());
35
36        if self.broken_refs.is_empty() {
37            lines.push("No broken references found.".to_string());
38        } else {
39            lines.push(format!("Broken references ({}):", self.broken_refs.len()));
40            lines.push(String::new());
41            for r in &self.broken_refs {
42                lines.push(format!("  {}:{}: `{}`", r.file, r.line, r.reference));
43                if r.context.len() <= 80 {
44                    lines.push(format!("    {}", r.context));
45                }
46            }
47        }
48
49        lines.join("\n")
50    }
51}
52
53/// Derive the normalize data directory for a project root.
54///
55/// Resolution order:
56/// 1. If `NORMALIZE_INDEX_DIR` is set to an absolute path, use it directly.
57/// 2. If `NORMALIZE_INDEX_DIR` is set to a relative path, use `$XDG_DATA_HOME/normalize/<relative>`.
58/// 3. Otherwise, use `<root>/.normalize`.
59///
60/// Public so that sibling modules (e.g. `boundary_violations`) can locate
61/// the index without duplicating the resolution logic.
62pub fn normalize_dir_for_root(root: &Path) -> std::path::PathBuf {
63    if let Ok(index_dir) = std::env::var("NORMALIZE_INDEX_DIR") {
64        let path = std::path::PathBuf::from(&index_dir);
65        if path.is_absolute() {
66            return path;
67        }
68        let data_home = std::env::var("XDG_DATA_HOME")
69            .map(std::path::PathBuf::from)
70            .unwrap_or_else(|_| {
71                dirs::home_dir()
72                    .unwrap_or_else(|| std::path::PathBuf::from("."))
73                    .join(".local/share")
74            });
75        return data_home.join("normalize").join(path);
76    }
77    root.join(".normalize")
78}
79
80/// Build a CheckRefsReport without printing (for service layer).
81pub async fn build_check_refs_report(
82    root: &Path,
83    walk_config: &normalize_rules_config::WalkConfig,
84) -> Result<CheckRefsReport, String> {
85    // Open index to get known symbols
86    let db_path = normalize_dir_for_root(root).join("index.sqlite");
87    let idx = normalize_facts::FileIndex::open(&db_path, root)
88        .await
89        .map_err(|e| format!("Failed to open index: {e}"))?;
90
91    // Get all symbol names from index
92    let all_symbols = match idx.all_symbol_names().await {
93        Ok(syms) => syms,
94        Err(e) => {
95            tracing::warn!(
96                "normalize-native-rules: failed to query symbol names: {}",
97                e
98            );
99            std::collections::HashSet::new()
100        }
101    };
102
103    if all_symbols.is_empty() {
104        return Err("No symbols indexed. Run: normalize structure rebuild".to_string());
105    }
106
107    // Find markdown files
108    let md_files: Vec<_> = crate::walk::gitignore_walk(root, walk_config)
109        .filter(|e| e.path().extension().and_then(|s| s.to_str()) == Some("md"))
110        .map(|e| e.path().to_path_buf())
111        .collect();
112
113    if md_files.is_empty() {
114        return Ok(CheckRefsReport {
115            broken_refs: Vec::new(),
116            files_checked: 0,
117            symbols_indexed: all_symbols.len(),
118        });
119    }
120
121    // Regex for code references: `identifier` or `Module::method` or `Module.method`
122    let code_ref_re = CODE_REF_RE.get_or_init(|| {
123        // normalize-syntax-allow: rust/unwrap-in-impl - compile-time-known-valid regex
124        regex::Regex::new(r"`([A-Z][a-zA-Z0-9_]*(?:[:\.][a-zA-Z_][a-zA-Z0-9_]*)*)`").unwrap()
125    });
126
127    let mut broken_refs: Vec<BrokenRef> = Vec::new();
128
129    for md_file in &md_files {
130        let content = match std::fs::read_to_string(md_file) {
131            Ok(c) => c,
132            Err(_) => continue,
133        };
134
135        let rel_path = md_file
136            .strip_prefix(root)
137            .unwrap_or(md_file)
138            .display()
139            .to_string();
140
141        let md_dir = md_file.parent().unwrap_or(root);
142
143        let mut in_code_block = false;
144        for (line_num, line) in content.lines().enumerate() {
145            if line.trim().starts_with("```") {
146                in_code_block = !in_code_block;
147                continue;
148            }
149            if in_code_block {
150                continue;
151            }
152
153            for cap in code_ref_re.captures_iter(line) {
154                let reference = &cap[1];
155
156                if is_common_non_symbol(reference) {
157                    continue;
158                }
159
160                if looks_like_file_path(reference) {
161                    // Check if the file exists relative to the markdown file
162                    let file_path = md_dir.join(reference.replace("::", "/"));
163                    if !file_path.exists() && !root.join(reference.replace("::", "/")).exists() {
164                        broken_refs.push(BrokenRef {
165                            file: rel_path.clone(),
166                            line: line_num + 1,
167                            reference: reference.to_string(),
168                            context: line.trim().to_string(),
169                        });
170                    }
171                } else if !all_symbols.contains(reference) {
172                    broken_refs.push(BrokenRef {
173                        file: rel_path.clone(),
174                        line: line_num + 1,
175                        reference: reference.to_string(),
176                        context: line.trim().to_string(),
177                    });
178                }
179            }
180        }
181    }
182
183    Ok(CheckRefsReport {
184        broken_refs,
185        files_checked: md_files.len(),
186        symbols_indexed: all_symbols.len(),
187    })
188}
189
190impl From<CheckRefsReport> for DiagnosticsReport {
191    fn from(report: CheckRefsReport) -> Self {
192        DiagnosticsReport {
193            issues: report
194                .broken_refs
195                .into_iter()
196                .map(|r| Issue {
197                    file: r.file,
198                    line: Some(r.line),
199                    column: None,
200                    end_line: None,
201                    end_column: None,
202                    rule_id: "broken-ref".into(),
203                    message: if looks_like_file_path(&r.reference) {
204                        format!("broken file link `{}`", r.reference)
205                    } else {
206                        format!("unknown symbol `{}`", r.reference)
207                    },
208                    severity: Severity::Warning,
209                    source: "check-refs".into(),
210                    related: vec![],
211                    suggestion: None,
212                })
213                .collect(),
214            files_checked: report.files_checked,
215            sources_run: vec!["check-refs".into()],
216            tool_errors: vec![],
217            daemon_cached: false,
218        }
219    }
220}
221
/// Decide whether a reference is more likely a file name than a symbol.
///
/// Heuristic: the text after the final `.` must be one to five bytes long and
/// consist solely of ASCII lowercase letters (i.e. it looks like a file
/// extension rather than a capitalized method name or field access).
/// References without a `.` are never treated as file paths.
fn looks_like_file_path(s: &str) -> bool {
    match s.rsplit_once('.') {
        Some((_, ext)) => {
            (1..=5).contains(&ext.len()) && ext.bytes().all(|b| b.is_ascii_lowercase())
        }
        None => false,
    }
}
234
/// Check if a string should be skipped because it is not a project symbol.
///
/// Returns `true` for:
/// - well-known markers (`TODO`, `FIXME`, ...), literals (`None`, `True`, ...)
///   and common standard-library / serde type and trait names listed below;
/// - strings shorter than two bytes;
/// - ALL_CAPS tokens: every character is an uppercase letter, an ASCII digit
///   or `_`. Digits are accepted so that names such as `SHA256`, `UTF8` or
///   `X86_64` are treated like any other constant-style token instead of
///   being looked up (and reported) as unknown symbols.
fn is_common_non_symbol(s: &str) -> bool {
    // Skip common patterns that aren't symbols
    matches!(
        s,
        "TODO"
            | "FIXME"
            | "NOTE"
            | "HACK"
            | "XXX"
            | "BUG"
            | "OK"
            | "Err"
            | "Ok"
            | "None"
            | "Some"
            | "True"
            | "False"
            | "String"
            | "Vec"
            | "Option"
            | "Result"
            | "Box"
            | "Arc"
            | "Rc"
            | "HashMap"
            | "HashSet"
            | "BTreeMap"
            | "BTreeSet"
            | "PathBuf"
            | "Path"
            | "File"
            | "Read"
            | "Write"
            | "Debug"
            | "Clone"
            | "Copy"
            | "Default"
            | "Send"
            | "Sync"
            | "Serialize"
            | "Deserialize"
    ) || s.len() < 2
        // ALL_CAPS constants, including ones that contain digits
        || s
            .chars()
            .all(|c| c.is_uppercase() || c.is_ascii_digit() || c == '_')
}