Skip to main content

alint_rules/
markdown_paths_resolve.rs

1//! `markdown_paths_resolve` — backticked workspace paths in
2//! markdown files must resolve to real files or directories.
3//!
4//! Targets the AGENTS.md / CLAUDE.md staleness problem:
5//! agent-context files reference workspace paths in inline
6//! backticks (`` `src/api/users.ts` ``), and those paths drift
7//! as the codebase evolves. The v0.6 `agent-context-no-stale-paths`
8//! rule surfaces *candidate* drift via a regex; this rule does
9//! the precise check.
10//!
11//! Design doc: `docs/design/v0.7/markdown_paths_resolve.md`.
12
13use std::path::Path;
14
15use alint_core::{Context, Error, Level, Result, Rule, RuleSpec, Scope, Violation};
16use serde::Deserialize;
17
18#[derive(Debug, Deserialize)]
19struct Options {
20    /// Whitelist of path-shape prefixes to validate. A backticked
21    /// token must start with one of these to be considered a path
22    /// candidate. No defaults — every project's layout differs and
23    /// the user must declare which prefixes mark a path.
24    prefixes: Vec<String>,
25
26    /// Skip backticked tokens containing template-variable
27    /// markers (`{{ }}`, `${ }`, `<…>`). Default true.
28    #[serde(default = "default_ignore_template_vars")]
29    ignore_template_vars: bool,
30}
31
/// Serde default for `Options::ignore_template_vars`: tokens with
/// template-variable markers are skipped unless the user opts out.
fn default_ignore_template_vars() -> bool {
    true
}
35
36#[derive(Debug)]
37pub struct MarkdownPathsResolveRule {
38    id: String,
39    level: Level,
40    policy_url: Option<String>,
41    message: Option<String>,
42    scope: Scope,
43    prefixes: Vec<String>,
44    ignore_template_vars: bool,
45}
46
47impl Rule for MarkdownPathsResolveRule {
48    fn id(&self) -> &str {
49        &self.id
50    }
51    fn level(&self) -> Level {
52        self.level
53    }
54    fn policy_url(&self) -> Option<&str> {
55        self.policy_url.as_deref()
56    }
57    fn path_scope(&self) -> Option<&Scope> {
58        Some(&self.scope)
59    }
60
61    fn evaluate(&self, ctx: &Context<'_>) -> Result<Vec<Violation>> {
62        let mut violations = Vec::new();
63        for entry in ctx.index.files() {
64            if !self.scope.matches(&entry.path) {
65                continue;
66            }
67            let full = ctx.root.join(&entry.path);
68            // Unreadable file: silently skip; a sibling rule can flag it.
69            let Ok(bytes) = std::fs::read(&full) else {
70                continue;
71            };
72            let Ok(text) = std::str::from_utf8(&bytes) else {
73                continue; // non-UTF-8 markdown is degenerate; skip
74            };
75            for cand in scan_markdown_paths(text, &self.prefixes) {
76                if self.ignore_template_vars && has_template_vars(&cand.token) {
77                    continue;
78                }
79                let lookup = strip_path_decoration(&cand.token);
80                if !path_resolves(ctx, lookup) {
81                    let msg = self.message.clone().unwrap_or_else(|| {
82                        format!(
83                            "backticked path `{}` doesn't resolve to a file or directory",
84                            cand.token
85                        )
86                    });
87                    violations.push(
88                        Violation::new(msg)
89                            .with_path(entry.path.clone())
90                            .with_location(cand.line, cand.column),
91                    );
92                }
93            }
94        }
95        Ok(violations)
96    }
97}
98
99pub fn build(spec: &RuleSpec) -> Result<Box<dyn Rule>> {
100    let Some(paths) = &spec.paths else {
101        return Err(Error::rule_config(
102            &spec.id,
103            "markdown_paths_resolve requires a `paths` field",
104        ));
105    };
106    let opts: Options = spec
107        .deserialize_options()
108        .map_err(|e| Error::rule_config(&spec.id, format!("invalid options: {e}")))?;
109    if opts.prefixes.is_empty() {
110        return Err(Error::rule_config(
111            &spec.id,
112            "markdown_paths_resolve requires a non-empty `prefixes` list — \
113             declare which path shapes (e.g. [\"src/\", \"crates/\", \"docs/\"]) \
114             count as path candidates in your codebase",
115        ));
116    }
117    Ok(Box::new(MarkdownPathsResolveRule {
118        id: spec.id.clone(),
119        level: spec.level,
120        policy_url: spec.policy_url.clone(),
121        message: spec.message.clone(),
122        scope: Scope::from_paths_spec(paths)?,
123        prefixes: opts.prefixes,
124        ignore_template_vars: opts.ignore_template_vars,
125    }))
126}
127
128// ─── markdown scanner ──────────────────────────────────────────
129
/// A backticked token that looks like a workspace path, together
/// with where it was found in the markdown source.
#[derive(Debug, PartialEq, Eq)]
struct Candidate {
    /// Content of the inline code span, whitespace-trimmed.
    token: String,
    /// 1-indexed line number of the span.
    line: usize,
    /// 1-indexed column of the opening backtick, counted in bytes
    /// from the start of the line.
    column: usize,
}
137
138/// Walk a markdown string, returning every backticked token that
139/// starts with one of `prefixes`. Skips fenced code blocks
140/// (```` ``` ```` / `~~~`) and 4-space-indented code blocks; those
141/// contain code samples, not factual claims about the tree.
142fn scan_markdown_paths(text: &str, prefixes: &[String]) -> Vec<Candidate> {
143    let mut out = Vec::new();
144    let mut in_fenced = false;
145    let mut fence_marker: Option<char> = None;
146    let mut fence_len: usize = 0;
147
148    for (line_idx, line) in text.lines().enumerate() {
149        let line_no = line_idx + 1;
150
151        // Detect fenced-code-block boundaries. CommonMark allows
152        // ``` and ~~~ with at least 3 markers; the closing fence
153        // must use the same character and at least as many
154        // markers. `info string` (e.g. ```yaml) follows the
155        // opening fence; we don't care about its content.
156        let trimmed = line.trim_start();
157        if let Some((ch, n)) = detect_fence(trimmed) {
158            if !in_fenced {
159                in_fenced = true;
160                fence_marker = Some(ch);
161                fence_len = n;
162            } else if fence_marker == Some(ch) && n >= fence_len && only_fence(trimmed, ch) {
163                in_fenced = false;
164                fence_marker = None;
165                fence_len = 0;
166            }
167            continue;
168        }
169        if in_fenced {
170            continue;
171        }
172
173        // Skip 4-space indented code blocks. Per CommonMark, only
174        // applies when the indented line is NOT inside a list.
175        // We're conservative — any 4-space-prefixed line is treated
176        // as code unless it's a continuation of a list item, which
177        // we don't track here. Acceptable: false-skip rate >
178        // false-flag rate for our use.
179        if line.starts_with("    ") || line.starts_with('\t') {
180            continue;
181        }
182
183        // Find inline backticks. A run of N backticks opens an
184        // inline span that closes at the next run of EXACTLY N
185        // backticks. Per CommonMark, longer runs nest the span so
186        // it can contain shorter backtick sequences. Most paths
187        // use single backticks, which is what we optimise for.
188        let bytes = line.as_bytes();
189        let mut i = 0;
190        while i < bytes.len() {
191            if bytes[i] != b'`' {
192                i += 1;
193                continue;
194            }
195            let run_start = i;
196            while i < bytes.len() && bytes[i] == b'`' {
197                i += 1;
198            }
199            let run_len = i - run_start;
200            // Find the matching closing run.
201            let close_start = find_closing_run(&bytes[i..], run_len).map(|p| i + p);
202            let Some(close) = close_start else {
203                // Unmatched backticks → not a span; bail this line.
204                break;
205            };
206            let token_bytes = &bytes[i..close];
207            // Inline-code spans wrap their content with one space
208            // padding when the content starts/ends with a backtick;
209            // CommonMark trims one leading + one trailing space.
210            let token = std::str::from_utf8(token_bytes).unwrap_or("").trim();
211            if !token.is_empty() && starts_with_any_prefix(token, prefixes) {
212                out.push(Candidate {
213                    token: token.to_string(),
214                    line: line_no,
215                    column: run_start + 1, // 1-indexed; points at opening backtick
216                });
217            }
218            i = close + run_len;
219        }
220    }
221    out
222}
223
/// If `s` starts with a code-fence marker, return the fence
/// character and the length of the marker run. Otherwise `None`.
///
/// A marker is a run of three or more backticks or tildes at the
/// very start of `s` (the caller strips indentation). Per
/// CommonMark, the text after a *backtick* fence may not contain
/// another backtick: a line such as "```src/foo.ts``` text" is an
/// inline code span, not a fence, so it yields `None`. Tilde fences
/// carry no such restriction.
fn detect_fence(s: &str) -> Option<(char, usize)> {
    let ch = s.chars().next().filter(|c| matches!(c, '`' | '~'))?;
    let n = s.chars().take_while(|&c| c == ch).count();
    if n < 3 {
        return None;
    }
    // Both marker characters are ASCII, so the run length in chars
    // is also the byte offset where the rest of the line starts.
    if ch == '`' && s[n..].contains('`') {
        return None;
    }
    Some((ch, n))
}
235
/// Report whether `s`, ignoring trailing whitespace, is made up of
/// nothing but `ch`. An empty or all-whitespace `s` counts as true.
///
/// Used to decide whether a fence-marker line can close a fence:
/// `CommonMark` does not allow an info string on a closing fence.
fn only_fence(s: &str, ch: char) -> bool {
    let marker_part = s.trim_end();
    !marker_part.contains(|c: char| c != ch)
}
243
/// Locate the first run of exactly `len` consecutive backticks in
/// `bytes` and return the offset of its first byte. Runs that are
/// shorter or longer are skipped whole. Returns `None` when no run
/// has the requested length.
fn find_closing_run(bytes: &[u8], len: usize) -> Option<usize> {
    let mut cursor = 0;
    while let Some(offset) = bytes[cursor..].iter().position(|&b| b == b'`') {
        let run_start = cursor + offset;
        let run_len = bytes[run_start..]
            .iter()
            .take_while(|&&b| b == b'`')
            .count();
        if run_len == len {
            return Some(run_start);
        }
        // Wrong length: resume the search just past this run.
        cursor = run_start + run_len;
    }
    None
}
264
/// True when `s` begins with at least one entry of `prefixes`.
/// An empty prefix list never matches.
fn starts_with_any_prefix(s: &str, prefixes: &[String]) -> bool {
    for prefix in prefixes {
        if s.starts_with(prefix.as_str()) {
            return true;
        }
    }
    false
}
268
/// True if `s` contains a template-variable marker: a `{{` opener,
/// a `${` opener, or an angle-bracket placeholder `<…>`.
///
/// For the angle-bracket form the `<` must come before a `>`; a
/// token that merely contains both characters in the other order
/// (`a>b<c`) is not a placeholder and is still checked as a path.
fn has_template_vars(s: &str) -> bool {
    let has_angle_placeholder =
        matches!(s.split_once('<'), Some((_, rest)) if rest.contains('>'));
    s.contains("{{") || s.contains("${") || has_angle_placeholder
}
274
/// Reduce a backticked token to the path-on-disk to look up, by
/// removing decoration that is not part of the path.
///
/// In order, this strips:
/// 1. a `#…` anchor such as GitHub's `#L<n>` (everything from the
///    first `#` to the end);
/// 2. trailing punctuation (`.,:;?!`);
/// 3. every trailing `:<digits>` location suffix, so both
///    `file:line` and `file:line:col` reduce to `file`;
/// 4. trailing punctuation again, in case it sat before a suffix;
/// 5. trailing slashes.
///
/// The result borrows from `s` and may be empty.
fn strip_path_decoration(s: &str) -> &str {
    const TRAILING_PUNCT: &[char] = &['.', ',', ':', ';', '?', '!'];

    let s = s.find('#').map_or(s, |hash| &s[..hash]);

    // Punctuation goes first so that `foo.ts:42.` still exposes its
    // `:42` suffix to the loop below.
    let mut s = s.trim_end_matches(TRAILING_PUNCT);

    // Peel `:N` suffixes repeatedly; compiler-style references use
    // `file:line:col`, which a single pass would leave as `file:line`.
    while let Some((head, tail)) = s.rsplit_once(':') {
        if tail.is_empty() || !tail.bytes().all(|b| b.is_ascii_digit()) {
            break;
        }
        s = head;
    }

    s.trim_end_matches(TRAILING_PUNCT).trim_end_matches('/')
}
294
295/// Does `lookup` resolve to a real file or directory in the
296/// scanned tree? Glob characters in the lookup are matched
297/// against the file index (any-of); plain paths use exact
298/// lookup of either file or directory.
299fn path_resolves(ctx: &Context<'_>, lookup: &str) -> bool {
300    if lookup.is_empty() {
301        return false;
302    }
303    if lookup.contains('*') || lookup.contains('?') || lookup.contains('[') {
304        // Glob — match against the index. Build a globset on the
305        // fly; cheap for one pattern.
306        let Ok(glob) = globset::Glob::new(lookup) else {
307            return false;
308        };
309        let matcher = glob.compile_matcher();
310        return ctx.index.entries.iter().any(|e| matcher.is_match(&e.path));
311    }
312    let p = Path::new(lookup);
313    ctx.index.entries.iter().any(|e| &*e.path == p)
314}
315
#[cfg(test)]
mod tests {
    use super::*;

    /// Owned prefix list built from string literals.
    fn prefixes(list: &[&str]) -> Vec<String> {
        list.iter().copied().map(String::from).collect()
    }

    /// Tokens of every candidate found in `md`, in document order.
    fn tokens(md: &str, pf: &[String]) -> Vec<String> {
        scan_markdown_paths(md, pf)
            .into_iter()
            .map(|c| c.token)
            .collect()
    }

    #[test]
    fn finds_inline_backtick_with_matching_prefix() {
        let pf = prefixes(&["src/", "docs/"]);
        let found = tokens("see `src/foo.ts` and `npm` and `docs/x.md`", &pf);
        assert_eq!(found, vec!["src/foo.ts", "docs/x.md"]);
    }

    #[test]
    fn skips_fenced_code_blocks() {
        let pf = prefixes(&["src/"]);
        let md = [
            "before",
            "```yaml",
            "example: `src/should-not-fire.ts`",
            "```",
            "after `src/should-fire.ts`",
        ]
        .join("\n");
        assert_eq!(tokens(&md, &pf), vec!["src/should-fire.ts"]);
    }

    #[test]
    fn skips_indented_code_blocks() {
        let pf = prefixes(&["src/"]);
        let md = "normal `src/a.ts` line\n\n    indented `src/should-not-fire.ts`\n";
        assert_eq!(tokens(md, &pf), vec!["src/a.ts"]);
    }

    #[test]
    fn handles_tilde_fences() {
        let pf = prefixes(&["src/"]);
        let md = "before `src/yes.ts`\n~~~\nin code: `src/no.ts`\n~~~\nafter `src/yes2.ts`";
        assert_eq!(tokens(md, &pf), vec!["src/yes.ts", "src/yes2.ts"]);
    }

    #[test]
    fn line_and_column_are_correct() {
        let pf = prefixes(&["src/"]);
        let found = scan_markdown_paths("first line\nsecond `src/foo.ts` here", &pf);
        // "second " is 7 chars, so the opening backtick sits at col 8.
        let expected = Candidate {
            token: "src/foo.ts".to_string(),
            line: 2,
            column: 8,
        };
        assert_eq!(found, vec![expected]);
    }

    #[test]
    fn template_vars_detected() {
        for templated in ["src/{{user_id}}.json", "src/${name}.ts", "src/<placeholder>.ts"] {
            assert!(has_template_vars(templated), "{templated}");
        }
        // Square brackets alone are not a template marker.
        for concrete in ["src/concrete.ts", "src/foo[0].ts"] {
            assert!(!has_template_vars(concrete), "{concrete}");
        }
    }

    #[test]
    fn path_decoration_stripped() {
        let cases = [
            ("src/foo.ts", "src/foo.ts"),
            ("src/foo.ts.", "src/foo.ts"),
            ("src/foo.ts,", "src/foo.ts"),
            ("src/foo.ts:42", "src/foo.ts"),
            ("src/foo.ts#L42", "src/foo.ts"),
            ("src/foo.ts:42#L1", "src/foo.ts"),
            ("src/foo/", "src/foo"),
        ];
        for (decorated, bare) in cases {
            assert_eq!(strip_path_decoration(decorated), bare, "{decorated}");
        }
    }

    #[test]
    fn prefix_matching() {
        let pf = prefixes(&["src/", "crates/"]);
        for hit in ["src/foo.ts", "crates/alint"] {
            assert!(starts_with_any_prefix(hit, &pf), "{hit}");
        }
        for miss in ["docs/x.md", "README.md"] {
            assert!(!starts_with_any_prefix(miss, &pf), "{miss}");
        }
    }

    #[test]
    fn unmatched_backticks_do_not_explode() {
        let pf = prefixes(&["src/"]);
        assert!(scan_markdown_paths("`src/foo.ts unmatched", &pf).is_empty());
    }

    #[test]
    fn double_backticks_can_contain_single() {
        let pf = prefixes(&["src/"]);
        let found = tokens("double `` ` `` then `src/foo.ts`", &pf);
        assert_eq!(found, vec!["src/foo.ts"]);
    }
}
423}