rumdl_lib/rules/
md051_link_fragments.rs

1use crate::rule::{LintError, LintResult, LintWarning, Rule, Severity};
2use crate::utils::anchor_styles::AnchorStyle;
3use pulldown_cmark::LinkType;
4use regex::Regex;
5use std::collections::HashSet;
6use std::sync::LazyLock;
// Matches an `id` or `name` attribute with a quoted, non-empty value and captures that
// value in group 1 (works for any HTML element, not just <a>).
// The pattern itself matches every such attribute in its haystack; a caller that wants
// only the first id/name of a tag has to take the first match.
// NOTE(review): `\b` is also satisfied after `-`, so `data-id="x"` matches as well, and
// the opening and closing quote characters are not required to be the same kind.
static HTML_ANCHOR_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"\b(?:id|name)\s*=\s*["']([^"']+)["']"#).unwrap());
11
/// Rule MD051: Link fragments
///
/// See [docs/md051.md](../../docs/md051.md) for full documentation, configuration, and examples.
///
/// This rule validates that link anchors (the part after `#`) exist in the current document.
/// It only applies to internal document links (like `#heading`), not to external URLs or
/// cross-file links.
#[derive(Clone)]
pub struct MD051LinkFragments {
    /// Anchor style used to generate the expected fragment for each heading
    anchor_style: AnchorStyle,
}
23
impl Default for MD051LinkFragments {
    /// Same as [`MD051LinkFragments::new`].
    fn default() -> Self {
        Self::new()
    }
}
29
30impl MD051LinkFragments {
31    pub fn new() -> Self {
32        Self {
33            anchor_style: AnchorStyle::GitHub,
34        }
35    }
36
    /// Create the rule with a specific anchor style.
    ///
    /// `style` is stored as-is and later used to generate the expected fragment
    /// for every heading in the document.
    pub fn with_anchor_style(style: AnchorStyle) -> Self {
        Self { anchor_style: style }
    }
41
42    /// Extract all valid heading anchors from the document
43    /// Returns (markdown_anchors, html_anchors) where markdown_anchors are lowercased
44    /// for case-insensitive matching, and html_anchors are case-sensitive
45    fn extract_headings_from_context(
46        &self,
47        ctx: &crate::lint_context::LintContext,
48    ) -> (HashSet<String>, HashSet<String>) {
49        let mut markdown_headings = HashSet::with_capacity(32);
50        let mut html_anchors = HashSet::with_capacity(16);
51        let mut fragment_counts = std::collections::HashMap::new();
52
53        for line_info in &ctx.lines {
54            if line_info.in_front_matter {
55                continue;
56            }
57
58            // Extract HTML anchor tags with id/name attributes
59            if !line_info.in_code_block {
60                let content = line_info.content(ctx.content);
61                let bytes = content.as_bytes();
62
63                // Skip lines without HTML tags or id/name attributes
64                if bytes.contains(&b'<') && (content.contains("id=") || content.contains("name=")) {
65                    // HTML spec: only the first id attribute per element is valid
66                    // Process element by element to handle multiple id attributes correctly
67                    let mut pos = 0;
68                    while pos < content.len() {
69                        if let Some(start) = content[pos..].find('<') {
70                            let tag_start = pos + start;
71                            if let Some(end) = content[tag_start..].find('>') {
72                                let tag_end = tag_start + end + 1;
73                                let tag = &content[tag_start..tag_end];
74
75                                // Extract first id or name attribute from this tag
76                                if let Some(caps) = HTML_ANCHOR_PATTERN.find(tag) {
77                                    let matched_text = caps.as_str();
78                                    if let Some(caps) = HTML_ANCHOR_PATTERN.captures(matched_text)
79                                        && let Some(id_match) = caps.get(1)
80                                    {
81                                        let id = id_match.as_str();
82                                        if !id.is_empty() {
83                                            html_anchors.insert(id.to_string());
84                                        }
85                                    }
86                                }
87                                pos = tag_end;
88                            } else {
89                                break;
90                            }
91                        } else {
92                            break;
93                        }
94                    }
95                }
96            }
97
98            // Extract markdown heading anchors
99            if let Some(heading) = &line_info.heading {
100                // Custom ID from {#custom-id} syntax
101                if let Some(custom_id) = &heading.custom_id {
102                    markdown_headings.insert(custom_id.to_lowercase());
103                }
104
105                // Generate anchor from heading text
106                // The anchor generation algorithm handles markdown formatting and HTML tags correctly
107                let fragment = self.anchor_style.generate_fragment(&heading.text);
108
109                if !fragment.is_empty() {
110                    // Handle duplicate headings by appending -1, -2, etc.
111                    let final_fragment = if let Some(count) = fragment_counts.get_mut(&fragment) {
112                        let suffix = *count;
113                        *count += 1;
114                        format!("{fragment}-{suffix}")
115                    } else {
116                        fragment_counts.insert(fragment.clone(), 1);
117                        fragment
118                    };
119                    markdown_headings.insert(final_fragment);
120                }
121            }
122        }
123
124        (markdown_headings, html_anchors)
125    }
126
127    /// Fast check if URL is external (doesn't need to be validated)
128    #[inline]
129    fn is_external_url_fast(url: &str) -> bool {
130        // Quick prefix checks for common protocols
131        url.starts_with("http://")
132            || url.starts_with("https://")
133            || url.starts_with("ftp://")
134            || url.starts_with("mailto:")
135            || url.starts_with("tel:")
136            || url.starts_with("//")
137    }
138
139    /// Check if URL is a cross-file link (contains a file path before #)
140    #[inline]
141    fn is_cross_file_link(url: &str) -> bool {
142        if let Some(fragment_pos) = url.find('#') {
143            let path_part = &url[..fragment_pos];
144
145            // If there's no path part, it's just a fragment (#heading)
146            if path_part.is_empty() {
147                return false;
148            }
149
150            // Check for Liquid syntax used by Jekyll and other static site generators
151            // Liquid tags: {% ... %} for control flow and includes
152            // Liquid variables: {{ ... }} for outputting values
153            // These are template directives that reference external content and should be skipped
154            // We check for proper bracket order to avoid false positives
155            if let Some(tag_start) = path_part.find("{%")
156                && path_part[tag_start + 2..].contains("%}")
157            {
158                return true;
159            }
160            if let Some(var_start) = path_part.find("{{")
161                && path_part[var_start + 2..].contains("}}")
162            {
163                return true;
164            }
165
166            // Check if it's an absolute path (starts with /)
167            // These are links to other pages on the same site
168            if path_part.starts_with('/') {
169                return true;
170            }
171
172            // Check if it looks like a file path:
173            // - Contains a file extension (dot followed by letters)
174            // - Contains path separators
175            // - Contains relative path indicators
176            path_part.contains('.')
177                && (
178                    // Has file extension pattern (handle query parameters by splitting on them first)
179                    {
180                    let clean_path = path_part.split('?').next().unwrap_or(path_part);
181                    // Handle files starting with dot
182                    if let Some(after_dot) = clean_path.strip_prefix('.') {
183                        let dots_count = clean_path.matches('.').count();
184                        if dots_count == 1 {
185                            // Could be ".ext" (just extension) or ".hidden" (hidden file)
186                            // If it's a known file extension, treat as cross-file link
187                            !after_dot.is_empty() && after_dot.len() <= 10 &&
188                            after_dot.chars().all(|c| c.is_ascii_alphanumeric()) &&
189                            // Additional check: common file extensions are likely cross-file
190                            (after_dot.len() <= 4 || matches!(after_dot, "html" | "json" | "yaml" | "toml"))
191                        } else {
192                            // Hidden file with extension like ".hidden.txt"
193                            clean_path.split('.').next_back().is_some_and(|ext| {
194                                !ext.is_empty() && ext.len() <= 10 && ext.chars().all(|c| c.is_ascii_alphanumeric())
195                            })
196                        }
197                    } else {
198                        // Regular file path
199                        clean_path.split('.').next_back().is_some_and(|ext| {
200                            !ext.is_empty() && ext.len() <= 10 && ext.chars().all(|c| c.is_ascii_alphanumeric())
201                        })
202                    }
203                } ||
204                // Or contains path separators
205                path_part.contains('/') || path_part.contains('\\') ||
206                // Or starts with relative path indicators
207                path_part.starts_with("./") || path_part.starts_with("../")
208                )
209        } else {
210            false
211        }
212    }
213}
214
215impl Rule for MD051LinkFragments {
    /// Rule identifier; also used as the `rule_name` of emitted warnings.
    fn name(&self) -> &'static str {
        "MD051"
    }
219
    /// One-line summary of what the rule enforces.
    fn description(&self) -> &'static str {
        "Link fragments should reference valid headings"
    }
223
224    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
225        // Skip if no link fragments present
226        if !ctx.likely_has_links_or_images() {
227            return true;
228        }
229        // Check for # character (fragments)
230        !ctx.has_char('#')
231    }
232
233    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
234        let mut warnings = Vec::new();
235
236        if ctx.content.is_empty() || ctx.links.is_empty() || self.should_skip(ctx) {
237            return Ok(warnings);
238        }
239
240        let (markdown_headings, html_anchors) = self.extract_headings_from_context(ctx);
241
242        for link in &ctx.links {
243            if link.is_reference {
244                continue;
245            }
246
247            // Skip wiki-links - they reference other files and may have their own fragment validation
248            if matches!(link.link_type, LinkType::WikiLink { .. }) {
249                continue;
250            }
251
252            // Skip links inside Jinja templates
253            if ctx.is_in_jinja_range(link.byte_offset) {
254                continue;
255            }
256
257            let url = &link.url;
258
259            // Skip links without fragments or external URLs
260            if !url.contains('#') || Self::is_external_url_fast(url) {
261                continue;
262            }
263
264            // Skip mdbook template placeholders ({{#VARIABLE}})
265            // mdbook uses {{#VARIABLE}} syntax where # is part of the template, not a fragment
266            if url.contains("{{#") && url.contains("}}") {
267                continue;
268            }
269
270            // Skip Quarto/RMarkdown cross-references (@fig-, @tbl-, @sec-, @eq-, etc.)
271            // These are special cross-reference syntax, not HTML anchors
272            // Format: @prefix-identifier or just @identifier
273            if url.starts_with('@') {
274                continue;
275            }
276
277            // Cross-file links are valid if the file exists (not checked here)
278            if Self::is_cross_file_link(url) {
279                continue;
280            }
281
282            let Some(fragment_pos) = url.find('#') else {
283                continue;
284            };
285
286            let fragment = &url[fragment_pos + 1..];
287
288            // Skip Liquid template variables and filters
289            if (url.contains("{{") && fragment.contains('|')) || fragment.ends_with("}}") || fragment.ends_with("%}") {
290                continue;
291            }
292
293            if fragment.is_empty() {
294                continue;
295            }
296
297            // Validate fragment against document headings
298            // HTML anchors are case-sensitive, markdown anchors are case-insensitive
299            let found = if html_anchors.contains(fragment) {
300                true
301            } else {
302                let fragment_lower = fragment.to_lowercase();
303                markdown_headings.contains(&fragment_lower)
304            };
305
306            if !found {
307                warnings.push(LintWarning {
308                    rule_name: Some(self.name().to_string()),
309                    message: format!("Link anchor '#{fragment}' does not exist in document headings"),
310                    line: link.line,
311                    column: link.start_col + 1,
312                    end_line: link.line,
313                    end_column: link.end_col + 1,
314                    severity: Severity::Warning,
315                    fix: None,
316                });
317            }
318        }
319
320        Ok(warnings)
321    }
322
    /// Returns the document content unchanged.
    ///
    /// MD051 does not provide auto-fix: link fragment corrections require human
    /// judgment to avoid incorrect fixes.
    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
        Ok(ctx.content.to_string())
    }
328
    /// Expose the rule as `&dyn Any` so callers can downcast to the concrete type.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
332
333    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
334    where
335        Self: Sized,
336    {
337        // Config keys are normalized to kebab-case by the config system
338        let anchor_style = if let Some(rule_config) = config.rules.get("MD051") {
339            if let Some(style_str) = rule_config.values.get("anchor-style").and_then(|v| v.as_str()) {
340                match style_str.to_lowercase().as_str() {
341                    "kramdown" => AnchorStyle::Kramdown,
342                    "kramdown-gfm" => AnchorStyle::KramdownGfm,
343                    "jekyll" => AnchorStyle::KramdownGfm, // Backward compatibility alias
344                    _ => AnchorStyle::GitHub,
345                }
346            } else {
347                AnchorStyle::GitHub
348            }
349        } else {
350            AnchorStyle::GitHub
351        };
352
353        Box::new(MD051LinkFragments::with_anchor_style(anchor_style))
354    }
355
    /// Default configuration section for this rule.
    ///
    /// Returns the section name `"MD051"` paired with the TOML value parsed from the
    /// embedded snippet below (`anchor-style = "github"`). Returns `None` if that
    /// snippet fails to parse.
    fn default_config_section(&self) -> Option<(String, toml::Value)> {
        let value: toml::Value = toml::from_str(
            r#"
# Anchor generation style to match your target platform
# Options: "github" (default), "kramdown-gfm", "kramdown"
# Note: "jekyll" is accepted as an alias for "kramdown-gfm" (backward compatibility)
anchor-style = "github"
"#,
        )
        // A parse failure is turned into `None` rather than a panic
        .ok()?;
        Some(("MD051".to_string(), value))
    }
368}
369
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lint_context::LintContext;

    /// With the Quarto flavor: `[@...]` cross-references produce no warnings, a link to
    /// an existing heading produces none, and a link to a missing heading produces one.
    #[test]
    fn test_quarto_cross_references() {
        let rule = MD051LinkFragments::new();

        // Test that Quarto cross-references are skipped
        let content = r#"# Test Document

## Figures

See [@fig-plot] for the visualization.

More details in [@tbl-results] and [@sec-methods].

The equation [@eq-regression] shows the relationship.

Reference to [@lst-code] for implementation."#;
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Quarto);
        let result = rule.check(&ctx).unwrap();
        assert!(
            result.is_empty(),
            "Quarto cross-references (@fig-, @tbl-, @sec-, @eq-) should not trigger MD051 warnings. Got {} warnings",
            result.len()
        );

        // Test that normal anchors still work
        let content_with_anchor = r#"# Test

See [link](#test) for details."#;
        let ctx_anchor = LintContext::new(content_with_anchor, crate::config::MarkdownFlavor::Quarto);
        let result_anchor = rule.check(&ctx_anchor).unwrap();
        assert!(result_anchor.is_empty(), "Valid anchor should not trigger warning");

        // Test that invalid anchors are still flagged
        let content_invalid = r#"# Test

See [link](#nonexistent) for details."#;
        let ctx_invalid = LintContext::new(content_invalid, crate::config::MarkdownFlavor::Quarto);
        let result_invalid = rule.check(&ctx_invalid).unwrap();
        assert_eq!(result_invalid.len(), 1, "Invalid anchor should still trigger warning");
    }
}