rumdl_lib/rules/
md051_link_fragments.rs

1use crate::rule::{CrossFileScope, FixCapability, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
2use crate::utils::anchor_styles::AnchorStyle;
3use crate::workspace_index::{CrossFileLinkIndex, FileIndex, HeadingIndex};
4use pulldown_cmark::LinkType;
5use regex::Regex;
6use std::collections::{HashMap, HashSet};
7use std::path::{Component, Path, PathBuf};
8use std::sync::LazyLock;
9// HTML tags with id or name attributes (supports any HTML element, not just <a>)
10// This pattern only captures the first id/name attribute in a tag
11static HTML_ANCHOR_PATTERN: LazyLock<Regex> =
12    LazyLock::new(|| Regex::new(r#"\b(?:id|name)\s*=\s*["']([^"']+)["']"#).unwrap());
13
14// Attribute anchor pattern for kramdown/MkDocs { #id } syntax
15// Matches {#id} or { #id } with optional spaces, supports multiple anchors
16// Also supports classes and attributes: { #id .class key=value }
17static ATTR_ANCHOR_PATTERN: LazyLock<Regex> =
18    LazyLock::new(|| Regex::new(r#"\{\s*#([a-zA-Z0-9_][a-zA-Z0-9_-]*)[^}]*\}"#).unwrap());
19
20// Material for MkDocs setting anchor pattern: <!-- md:setting NAME -->
21// Used in headings to generate anchors for configuration option references
22static MD_SETTING_PATTERN: LazyLock<Regex> =
23    LazyLock::new(|| Regex::new(r"<!--\s*md:setting\s+([^\s]+)\s*-->").unwrap());
24
/// Lexically normalize a path by resolving `.` and `..` components.
///
/// The file system is never consulted, so symlinks are not followed.
///
/// * `.` components are dropped.
/// * `..` removes the preceding normal component when there is one.
/// * `..` directly under a root (or Windows prefix) is ignored, because there is nothing
///   above the root.
/// * `..` with nothing left to cancel is kept, so a relative path that escapes its
///   starting directory stays distinguishable: `../a` normalizes to `../a`, not `a`.
fn normalize_path(path: &Path) -> PathBuf {
    let mut result = PathBuf::new();
    for component in path.components() {
        match component {
            Component::CurDir => {} // Skip .
            Component::ParentDir => {
                let ends_with_normal = matches!(result.components().next_back(), Some(Component::Normal(_)));
                let ends_with_anchor = matches!(
                    result.components().next_back(),
                    Some(Component::RootDir | Component::Prefix(_))
                );
                if ends_with_normal {
                    // `dir/..` cancels out.
                    result.pop();
                } else if !ends_with_anchor {
                    // Empty so far, or already climbing upward: keep the `..` so the
                    // result does not silently point at a different location.
                    result.push("..");
                }
            }
            other => result.push(other.as_os_str()),
        }
    }
    result
}
39
40/// Rule MD051: Link fragments
41///
42/// See [docs/md051.md](../../docs/md051.md) for full documentation, configuration, and examples.
43///
44/// This rule validates that link anchors (the part after #) point to existing headings.
45/// Supports both same-document anchors and cross-file fragment links when linting a workspace.
46#[derive(Clone)]
47pub struct MD051LinkFragments {
48    /// Anchor style to use for validation
49    anchor_style: AnchorStyle,
50}
51
52impl Default for MD051LinkFragments {
53    fn default() -> Self {
54        Self::new()
55    }
56}
57
58impl MD051LinkFragments {
59    pub fn new() -> Self {
60        Self {
61            anchor_style: AnchorStyle::GitHub,
62        }
63    }
64
65    /// Create with specific anchor style
66    pub fn with_anchor_style(style: AnchorStyle) -> Self {
67        Self { anchor_style: style }
68    }
69
70    /// Parse ATX heading content from blockquote inner text.
71    /// Strips the leading `# ` marker, optional closing hash sequence, and extracts custom IDs.
72    /// Returns `(clean_text, custom_id)` or None if not a heading.
73    fn parse_blockquote_heading(bq_content: &str) -> Option<(String, Option<String>)> {
74        static BQ_ATX_HEADING_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(#{1,6})\s+(.*)$").unwrap());
75
76        let trimmed = bq_content.trim();
77        let caps = BQ_ATX_HEADING_RE.captures(trimmed)?;
78        let mut rest = caps.get(2).map_or("", |m| m.as_str()).to_string();
79
80        // Strip optional closing hash sequence (CommonMark: trailing `#`s preceded by a space)
81        let rest_trimmed = rest.trim_end();
82        if let Some(last_hash_pos) = rest_trimmed.rfind('#') {
83            let after_hashes = &rest_trimmed[last_hash_pos..];
84            if after_hashes.chars().all(|c| c == '#') {
85                // Find where the consecutive trailing hashes start
86                let mut hash_start = last_hash_pos;
87                while hash_start > 0 && rest_trimmed.as_bytes()[hash_start - 1] == b'#' {
88                    hash_start -= 1;
89                }
90                // Must be preceded by whitespace (or be the entire content)
91                if hash_start == 0
92                    || rest_trimmed
93                        .as_bytes()
94                        .get(hash_start - 1)
95                        .is_some_and(|b| b.is_ascii_whitespace())
96                {
97                    rest = rest_trimmed[..hash_start].trim_end().to_string();
98                }
99            }
100        }
101
102        let (clean_text, custom_id) = crate::utils::header_id_utils::extract_header_id(&rest);
103        Some((clean_text, custom_id))
104    }
105
106    /// Insert a heading fragment with deduplication.
107    /// When `use_underscore_dedup` is true (Python-Markdown/MkDocs), the primary suffix
108    /// uses `_N` and `-N` is registered as a fallback. Otherwise, only `-N` is used.
109    ///
110    /// Empty fragments (from CJK-only headings) are handled specially for Python-Markdown:
111    /// the first empty slug gets `_1`, the second `_2`, etc. (matching Python-Markdown's
112    /// `unique()` function which always enters the dedup loop for falsy IDs).
113    fn insert_deduplicated_fragment(
114        fragment: String,
115        fragment_counts: &mut HashMap<String, usize>,
116        markdown_headings: &mut HashSet<String>,
117        use_underscore_dedup: bool,
118    ) {
119        if fragment.is_empty() {
120            if !use_underscore_dedup {
121                return;
122            }
123            // Python-Markdown: empty slug → _1, _2, _3, ...
124            let count = fragment_counts.entry(fragment).or_insert(0);
125            *count += 1;
126            markdown_headings.insert(format!("_{count}"));
127            return;
128        }
129        if let Some(count) = fragment_counts.get_mut(&fragment) {
130            let suffix = *count;
131            *count += 1;
132            if use_underscore_dedup {
133                // Python-Markdown primary: heading_1, heading_2
134                markdown_headings.insert(format!("{fragment}_{suffix}"));
135                // Also accept GitHub-style for compatibility
136                markdown_headings.insert(format!("{fragment}-{suffix}"));
137            } else {
138                // GitHub-style: heading-1, heading-2
139                markdown_headings.insert(format!("{fragment}-{suffix}"));
140            }
141        } else {
142            fragment_counts.insert(fragment.clone(), 1);
143            markdown_headings.insert(fragment);
144        }
145    }
146
147    /// Add a heading to the cross-file index with proper deduplication.
148    /// When `use_underscore_dedup` is true (Python-Markdown/MkDocs), the primary anchor
149    /// uses `_N` and `-N` is registered as a fallback alias.
150    ///
151    /// Empty fragments (from CJK-only headings) get `_1`, `_2`, etc. in Python-Markdown mode.
152    fn add_heading_to_index(
153        fragment: &str,
154        text: &str,
155        custom_anchor: Option<String>,
156        line: usize,
157        fragment_counts: &mut HashMap<String, usize>,
158        file_index: &mut FileIndex,
159        use_underscore_dedup: bool,
160    ) {
161        if fragment.is_empty() {
162            if !use_underscore_dedup {
163                return;
164            }
165            // Python-Markdown: empty slug → _1, _2, _3, ...
166            let count = fragment_counts.entry(fragment.to_string()).or_insert(0);
167            *count += 1;
168            file_index.add_heading(HeadingIndex {
169                text: text.to_string(),
170                auto_anchor: format!("_{count}"),
171                custom_anchor,
172                line,
173                is_setext: false,
174            });
175            return;
176        }
177        if let Some(count) = fragment_counts.get_mut(fragment) {
178            let suffix = *count;
179            *count += 1;
180            let (primary, alias) = if use_underscore_dedup {
181                // Python-Markdown primary: heading_1; GitHub fallback: heading-1
182                (format!("{fragment}_{suffix}"), Some(format!("{fragment}-{suffix}")))
183            } else {
184                // GitHub-style primary: heading-1
185                (format!("{fragment}-{suffix}"), None)
186            };
187            file_index.add_heading(HeadingIndex {
188                text: text.to_string(),
189                auto_anchor: primary,
190                custom_anchor,
191                line,
192                is_setext: false,
193            });
194            if let Some(alias_anchor) = alias {
195                let heading_idx = file_index.headings.len() - 1;
196                file_index.add_anchor_alias(alias_anchor, heading_idx);
197            }
198        } else {
199            fragment_counts.insert(fragment.to_string(), 1);
200            file_index.add_heading(HeadingIndex {
201                text: text.to_string(),
202                auto_anchor: fragment.to_string(),
203                custom_anchor,
204                line,
205                is_setext: false,
206            });
207        }
208    }
209
210    /// Extract all valid heading anchors from the document
211    /// Returns (markdown_anchors, html_anchors) where markdown_anchors are lowercased
212    /// for case-insensitive matching, and html_anchors are case-sensitive
213    fn extract_headings_from_context(
214        &self,
215        ctx: &crate::lint_context::LintContext,
216    ) -> (HashSet<String>, HashSet<String>) {
217        let mut markdown_headings = HashSet::with_capacity(32);
218        let mut html_anchors = HashSet::with_capacity(16);
219        let mut fragment_counts = std::collections::HashMap::new();
220        let use_underscore_dedup = self.anchor_style == AnchorStyle::PythonMarkdown;
221
222        for line_info in &ctx.lines {
223            if line_info.in_front_matter {
224                continue;
225            }
226
227            // Skip code blocks for anchor extraction
228            if line_info.in_code_block {
229                continue;
230            }
231
232            let content = line_info.content(ctx.content);
233            let bytes = content.as_bytes();
234
235            // Extract HTML anchor tags with id/name attributes
236            if bytes.contains(&b'<') && (content.contains("id=") || content.contains("name=")) {
237                // HTML spec: only the first id attribute per element is valid
238                // Process element by element to handle multiple id attributes correctly
239                let mut pos = 0;
240                while pos < content.len() {
241                    if let Some(start) = content[pos..].find('<') {
242                        let tag_start = pos + start;
243                        if let Some(end) = content[tag_start..].find('>') {
244                            let tag_end = tag_start + end + 1;
245                            let tag = &content[tag_start..tag_end];
246
247                            // Extract first id or name attribute from this tag
248                            if let Some(caps) = HTML_ANCHOR_PATTERN.find(tag) {
249                                let matched_text = caps.as_str();
250                                if let Some(caps) = HTML_ANCHOR_PATTERN.captures(matched_text)
251                                    && let Some(id_match) = caps.get(1)
252                                {
253                                    let id = id_match.as_str();
254                                    if !id.is_empty() {
255                                        html_anchors.insert(id.to_string());
256                                    }
257                                }
258                            }
259                            pos = tag_end;
260                        } else {
261                            break;
262                        }
263                    } else {
264                        break;
265                    }
266                }
267            }
268
269            // Extract attribute anchors { #id } from non-heading lines
270            // Headings already have custom_id extracted below
271            if line_info.heading.is_none() && content.contains('{') && content.contains('#') {
272                for caps in ATTR_ANCHOR_PATTERN.captures_iter(content) {
273                    if let Some(id_match) = caps.get(1) {
274                        // Add to markdown_headings (lowercased for case-insensitive matching)
275                        markdown_headings.insert(id_match.as_str().to_lowercase());
276                    }
277                }
278            }
279
280            // Extract heading anchors from blockquote content
281            // Blockquote headings (e.g., "> ## Heading") are not detected by the main heading parser
282            // because the regex operates on the full line, but they still generate valid anchors
283            if line_info.heading.is_none()
284                && let Some(bq) = &line_info.blockquote
285                && let Some((clean_text, custom_id)) = Self::parse_blockquote_heading(&bq.content)
286            {
287                if let Some(id) = custom_id {
288                    markdown_headings.insert(id.to_lowercase());
289                }
290                let fragment = self.anchor_style.generate_fragment(&clean_text);
291                Self::insert_deduplicated_fragment(
292                    fragment,
293                    &mut fragment_counts,
294                    &mut markdown_headings,
295                    use_underscore_dedup,
296                );
297            }
298
299            // Extract markdown heading anchors
300            if let Some(heading) = &line_info.heading {
301                // Custom ID from {#custom-id} syntax
302                if let Some(custom_id) = &heading.custom_id {
303                    markdown_headings.insert(custom_id.to_lowercase());
304                }
305
306                // Generate fragment directly from heading text
307                // Note: HTML stripping was removed because it interfered with arrow patterns
308                // like <-> and placeholders like <FILE>. The anchor styles handle these correctly.
309                let fragment = self.anchor_style.generate_fragment(&heading.text);
310
311                Self::insert_deduplicated_fragment(
312                    fragment,
313                    &mut fragment_counts,
314                    &mut markdown_headings,
315                    use_underscore_dedup,
316                );
317            }
318        }
319
320        (markdown_headings, html_anchors)
321    }
322
323    /// Fast check if URL is external (doesn't need to be validated)
324    #[inline]
325    fn is_external_url_fast(url: &str) -> bool {
326        // Quick prefix checks for common protocols
327        url.starts_with("http://")
328            || url.starts_with("https://")
329            || url.starts_with("ftp://")
330            || url.starts_with("mailto:")
331            || url.starts_with("tel:")
332            || url.starts_with("//")
333    }
334
335    /// Resolve a path by trying markdown extensions if it has no extension
336    ///
337    /// For extension-less paths (e.g., `page`), returns a list of paths to try:
338    /// 1. The original path (in case it's already in the index)
339    /// 2. The path with each markdown extension (e.g., `page.md`, `page.markdown`, etc.)
340    ///
341    /// For paths with extensions, returns just the original path.
342    #[inline]
343    fn resolve_path_with_extensions(path: &Path, extensions: &[&str]) -> Vec<PathBuf> {
344        if path.extension().is_none() {
345            // Extension-less path - try with markdown extensions
346            let mut paths = Vec::with_capacity(extensions.len() + 1);
347            // First try the exact path (in case it's already in the index)
348            paths.push(path.to_path_buf());
349            // Then try with each markdown extension
350            for ext in extensions {
351                let path_with_ext = path.with_extension(&ext[1..]); // Remove leading dot
352                paths.push(path_with_ext);
353            }
354            paths
355        } else {
356            // Path has extension - use as-is
357            vec![path.to_path_buf()]
358        }
359    }
360
361    /// Check if a path part (without fragment) is an extension-less path
362    ///
363    /// Extension-less paths are potential cross-file links that need resolution
364    /// with markdown extensions (e.g., `page#section` -> `page.md#section`).
365    ///
366    /// We recognize them as extension-less if:
367    /// 1. Path has no extension (no dot)
368    /// 2. Path is not empty
369    /// 3. Path doesn't look like a query parameter or special syntax
370    /// 4. Path contains at least one alphanumeric character (valid filename)
371    /// 5. Path contains only valid path characters (alphanumeric, slashes, hyphens, underscores)
372    ///
373    /// Optimized: single pass through characters to check both conditions.
374    #[inline]
375    fn is_extensionless_path(path_part: &str) -> bool {
376        // Quick rejections for common non-extension-less cases
377        if path_part.is_empty()
378            || path_part.contains('.')
379            || path_part.contains('?')
380            || path_part.contains('&')
381            || path_part.contains('=')
382        {
383            return false;
384        }
385
386        // Single pass: check for alphanumeric and validate all characters
387        let mut has_alphanumeric = false;
388        for c in path_part.chars() {
389            if c.is_alphanumeric() {
390                has_alphanumeric = true;
391            } else if !matches!(c, '/' | '\\' | '-' | '_') {
392                // Invalid character found - early exit
393                return false;
394            }
395        }
396
397        // Must have at least one alphanumeric character to be a valid filename
398        has_alphanumeric
399    }
400
401    /// Check if URL is a cross-file link (contains a file path before #)
402    #[inline]
403    fn is_cross_file_link(url: &str) -> bool {
404        if let Some(fragment_pos) = url.find('#') {
405            let path_part = &url[..fragment_pos];
406
407            // If there's no path part, it's just a fragment (#heading)
408            if path_part.is_empty() {
409                return false;
410            }
411
412            // Check for Liquid syntax used by Jekyll and other static site generators
413            // Liquid tags: {% ... %} for control flow and includes
414            // Liquid variables: {{ ... }} for outputting values
415            // These are template directives that reference external content and should be skipped
416            // We check for proper bracket order to avoid false positives
417            if let Some(tag_start) = path_part.find("{%")
418                && path_part[tag_start + 2..].contains("%}")
419            {
420                return true;
421            }
422            if let Some(var_start) = path_part.find("{{")
423                && path_part[var_start + 2..].contains("}}")
424            {
425                return true;
426            }
427
428            // Check if it's an absolute path (starts with /)
429            // These are links to other pages on the same site
430            if path_part.starts_with('/') {
431                return true;
432            }
433
434            // Check if it looks like a file path:
435            // - Contains a file extension (dot followed by letters)
436            // - Contains path separators
437            // - Contains relative path indicators
438            // - OR is an extension-less path with a fragment (GitHub-style: page#section)
439            let has_extension = path_part.contains('.')
440                && (
441                    // Has file extension pattern (handle query parameters by splitting on them first)
442                    {
443                    let clean_path = path_part.split('?').next().unwrap_or(path_part);
444                    // Handle files starting with dot
445                    if let Some(after_dot) = clean_path.strip_prefix('.') {
446                        let dots_count = clean_path.matches('.').count();
447                        if dots_count == 1 {
448                            // Could be ".ext" (file extension) or ".hidden" (hidden file)
449                            // Treat short alphanumeric suffixes as file extensions
450                            !after_dot.is_empty() && after_dot.len() <= 10 &&
451                            after_dot.chars().all(|c| c.is_ascii_alphanumeric())
452                        } else {
453                            // Hidden file with extension like ".hidden.txt"
454                            clean_path.split('.').next_back().is_some_and(|ext| {
455                                !ext.is_empty() && ext.len() <= 10 && ext.chars().all(|c| c.is_ascii_alphanumeric())
456                            })
457                        }
458                    } else {
459                        // Regular file path
460                        clean_path.split('.').next_back().is_some_and(|ext| {
461                            !ext.is_empty() && ext.len() <= 10 && ext.chars().all(|c| c.is_ascii_alphanumeric())
462                        })
463                    }
464                } ||
465                // Or contains path separators
466                path_part.contains('/') || path_part.contains('\\') ||
467                // Or starts with relative path indicators
468                path_part.starts_with("./") || path_part.starts_with("../")
469                );
470
471            // Extension-less paths with fragments are potential cross-file links
472            // This supports GitHub-style links like [link](page#section) that resolve to page.md#section
473            let is_extensionless = Self::is_extensionless_path(path_part);
474
475            has_extension || is_extensionless
476        } else {
477            false
478        }
479    }
480}
481
482impl Rule for MD051LinkFragments {
483    fn name(&self) -> &'static str {
484        "MD051"
485    }
486
487    fn description(&self) -> &'static str {
488        "Link fragments should reference valid headings"
489    }
490
491    fn fix_capability(&self) -> FixCapability {
492        FixCapability::Unfixable
493    }
494
495    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
496        // Skip if no link fragments present
497        if !ctx.likely_has_links_or_images() {
498            return true;
499        }
500        // Check for # character (fragments)
501        !ctx.has_char('#')
502    }
503
504    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
505        let mut warnings = Vec::new();
506
507        if ctx.content.is_empty() || ctx.links.is_empty() || self.should_skip(ctx) {
508            return Ok(warnings);
509        }
510
511        let (markdown_headings, html_anchors) = self.extract_headings_from_context(ctx);
512
513        for link in &ctx.links {
514            if link.is_reference {
515                continue;
516            }
517
518            // Skip links inside PyMdown blocks (MkDocs flavor)
519            if ctx.line_info(link.line).is_some_and(|info| info.in_pymdown_block) {
520                continue;
521            }
522
523            // Skip wiki-links - they reference other files and may have their own fragment validation
524            if matches!(link.link_type, LinkType::WikiLink { .. }) {
525                continue;
526            }
527
528            // Skip links inside Jinja templates
529            if ctx.is_in_jinja_range(link.byte_offset) {
530                continue;
531            }
532
533            // Skip Quarto/Pandoc citations ([@citation], @citation)
534            // Citations are bibliography references, not link fragments
535            if ctx.flavor == crate::config::MarkdownFlavor::Quarto && ctx.is_in_citation(link.byte_offset) {
536                continue;
537            }
538
539            // Skip links inside shortcodes ({{< ... >}} or {{% ... %}})
540            // Shortcodes may contain template syntax that looks like fragment links
541            if ctx.is_in_shortcode(link.byte_offset) {
542                continue;
543            }
544
545            let url = &link.url;
546
547            // Skip links without fragments or external URLs
548            if !url.contains('#') || Self::is_external_url_fast(url) {
549                continue;
550            }
551
552            // Skip mdbook template placeholders ({{#VARIABLE}})
553            // mdbook uses {{#VARIABLE}} syntax where # is part of the template, not a fragment
554            if url.contains("{{#") && url.contains("}}") {
555                continue;
556            }
557
558            // Skip Quarto/RMarkdown cross-references (@fig-, @tbl-, @sec-, @eq-, etc.)
559            // These are special cross-reference syntax, not HTML anchors
560            // Format: @prefix-identifier or just @identifier
561            if url.starts_with('@') {
562                continue;
563            }
564
565            // Cross-file links are valid if the file exists (not checked here)
566            if Self::is_cross_file_link(url) {
567                continue;
568            }
569
570            let Some(fragment_pos) = url.find('#') else {
571                continue;
572            };
573
574            let fragment = &url[fragment_pos + 1..];
575
576            // Skip Liquid template variables and filters
577            if (url.contains("{{") && fragment.contains('|')) || fragment.ends_with("}}") || fragment.ends_with("%}") {
578                continue;
579            }
580
581            if fragment.is_empty() {
582                continue;
583            }
584
585            // Skip MkDocs runtime-generated anchors:
586            // - #fn:NAME, #fnref:NAME from the footnotes extension
587            // - #+key.path or #+key:value from Material for MkDocs option references
588            //   (e.g., #+type:abstract, #+toc.slugify, #+pymdownx.highlight.anchor_linenums)
589            if ctx.flavor == crate::config::MarkdownFlavor::MkDocs
590                && (fragment.starts_with("fn:")
591                    || fragment.starts_with("fnref:")
592                    || (fragment.starts_with('+') && (fragment.contains('.') || fragment.contains(':'))))
593            {
594                continue;
595            }
596
597            // Validate fragment against document headings
598            // HTML anchors are case-sensitive, markdown anchors are case-insensitive
599            let found = if html_anchors.contains(fragment) {
600                true
601            } else {
602                let fragment_lower = fragment.to_lowercase();
603                markdown_headings.contains(&fragment_lower)
604            };
605
606            if !found {
607                warnings.push(LintWarning {
608                    rule_name: Some(self.name().to_string()),
609                    message: format!("Link anchor '#{fragment}' does not exist in document headings"),
610                    line: link.line,
611                    column: link.start_col + 1,
612                    end_line: link.line,
613                    end_column: link.end_col + 1,
614                    severity: Severity::Error,
615                    fix: None,
616                });
617            }
618        }
619
620        Ok(warnings)
621    }
622
623    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
624        // MD051 does not provide auto-fix
625        // Link fragment corrections require human judgment to avoid incorrect fixes
626        Ok(ctx.content.to_string())
627    }
628
629    fn as_any(&self) -> &dyn std::any::Any {
630        self
631    }
632
633    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
634    where
635        Self: Sized,
636    {
637        // Config keys are normalized to kebab-case by the config system
638        let explicit_style = config
639            .rules
640            .get("MD051")
641            .and_then(|rc| rc.values.get("anchor-style"))
642            .and_then(|v| v.as_str())
643            .map(|style_str| match style_str.to_lowercase().as_str() {
644                "kramdown" => AnchorStyle::Kramdown,
645                "kramdown-gfm" | "jekyll" => AnchorStyle::KramdownGfm,
646                "python-markdown" | "python_markdown" | "mkdocs" => AnchorStyle::PythonMarkdown,
647                _ => AnchorStyle::GitHub,
648            });
649
650        // When a flavor is active and no explicit anchor style is configured,
651        // default to the flavor's native anchor generation
652        let anchor_style = explicit_style.unwrap_or(match config.global.flavor {
653            crate::config::MarkdownFlavor::MkDocs => AnchorStyle::PythonMarkdown,
654            crate::config::MarkdownFlavor::Kramdown => AnchorStyle::KramdownGfm,
655            _ => AnchorStyle::GitHub,
656        });
657
658        Box::new(MD051LinkFragments::with_anchor_style(anchor_style))
659    }
660
661    fn category(&self) -> RuleCategory {
662        RuleCategory::Link
663    }
664
665    fn cross_file_scope(&self) -> CrossFileScope {
666        CrossFileScope::Workspace
667    }
668
669    fn contribute_to_index(&self, ctx: &crate::lint_context::LintContext, file_index: &mut FileIndex) {
670        let mut fragment_counts = HashMap::new();
671        let use_underscore_dedup = self.anchor_style == AnchorStyle::PythonMarkdown;
672
673        // Extract headings, HTML anchors, and attribute anchors (for other files to reference)
674        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
675            if line_info.in_front_matter {
676                continue;
677            }
678
679            // Skip code blocks for anchor extraction
680            if line_info.in_code_block {
681                continue;
682            }
683
684            let content = line_info.content(ctx.content);
685
686            // Extract HTML anchors (id or name attributes on any element)
687            if content.contains('<') && (content.contains("id=") || content.contains("name=")) {
688                let mut pos = 0;
689                while pos < content.len() {
690                    if let Some(start) = content[pos..].find('<') {
691                        let tag_start = pos + start;
692                        if let Some(end) = content[tag_start..].find('>') {
693                            let tag_end = tag_start + end + 1;
694                            let tag = &content[tag_start..tag_end];
695
696                            if let Some(caps) = HTML_ANCHOR_PATTERN.captures(tag)
697                                && let Some(id_match) = caps.get(1)
698                            {
699                                file_index.add_html_anchor(id_match.as_str().to_string());
700                            }
701                            pos = tag_end;
702                        } else {
703                            break;
704                        }
705                    } else {
706                        break;
707                    }
708                }
709            }
710
711            // Extract attribute anchors { #id } on non-heading lines
712            // Headings already have custom_id extracted via heading.custom_id
713            if line_info.heading.is_none() && content.contains("{") && content.contains("#") {
714                for caps in ATTR_ANCHOR_PATTERN.captures_iter(content) {
715                    if let Some(id_match) = caps.get(1) {
716                        file_index.add_attribute_anchor(id_match.as_str().to_string());
717                    }
718                }
719            }
720
721            // Extract heading anchors from blockquote content
722            if line_info.heading.is_none()
723                && let Some(bq) = &line_info.blockquote
724                && let Some((clean_text, custom_id)) = Self::parse_blockquote_heading(&bq.content)
725            {
726                let fragment = self.anchor_style.generate_fragment(&clean_text);
727                Self::add_heading_to_index(
728                    &fragment,
729                    &clean_text,
730                    custom_id,
731                    line_idx + 1,
732                    &mut fragment_counts,
733                    file_index,
734                    use_underscore_dedup,
735                );
736            }
737
738            // Extract heading anchors
739            if let Some(heading) = &line_info.heading {
740                let fragment = self.anchor_style.generate_fragment(&heading.text);
741
742                Self::add_heading_to_index(
743                    &fragment,
744                    &heading.text,
745                    heading.custom_id.clone(),
746                    line_idx + 1,
747                    &mut fragment_counts,
748                    file_index,
749                    use_underscore_dedup,
750                );
751
752                // Extract Material for MkDocs setting anchors from headings.
753                // These are rendered as anchors at build time by Material's JS.
754                // Most references use #+key.path format (handled by the skip logic in check()),
755                // but this extraction enables cross-file validation for direct #key.path references.
756                if ctx.flavor == crate::config::MarkdownFlavor::MkDocs
757                    && let Some(caps) = MD_SETTING_PATTERN.captures(content)
758                    && let Some(name) = caps.get(1)
759                {
760                    file_index.add_html_anchor(name.as_str().to_string());
761                }
762            }
763        }
764
765        // Extract cross-file links (for validation against other files)
766        for link in &ctx.links {
767            if link.is_reference {
768                continue;
769            }
770
771            // Skip links inside PyMdown blocks (MkDocs flavor)
772            if ctx.line_info(link.line).is_some_and(|info| info.in_pymdown_block) {
773                continue;
774            }
775
776            // Skip wiki-links - they use a different linking system and are not validated
777            // as relative file paths
778            if matches!(link.link_type, LinkType::WikiLink { .. }) {
779                continue;
780            }
781
782            let url = &link.url;
783
784            // Skip external URLs
785            if Self::is_external_url_fast(url) {
786                continue;
787            }
788
789            // Only process cross-file links with fragments
790            if Self::is_cross_file_link(url)
791                && let Some(fragment_pos) = url.find('#')
792            {
793                let path_part = &url[..fragment_pos];
794                let fragment = &url[fragment_pos + 1..];
795
796                // Skip empty fragments or template syntax
797                if fragment.is_empty() || fragment.contains("{{") || fragment.contains("{%") {
798                    continue;
799                }
800
801                file_index.add_cross_file_link(CrossFileLinkIndex {
802                    target_path: path_part.to_string(),
803                    fragment: fragment.to_string(),
804                    line: link.line,
805                    column: link.start_col + 1,
806                });
807            }
808        }
809    }
810
811    fn cross_file_check(
812        &self,
813        file_path: &Path,
814        file_index: &FileIndex,
815        workspace_index: &crate::workspace_index::WorkspaceIndex,
816    ) -> LintResult {
817        let mut warnings = Vec::new();
818
819        // Supported markdown file extensions (with leading dot, matching MD057)
820        const MARKDOWN_EXTENSIONS: &[&str] = &[
821            ".md",
822            ".markdown",
823            ".mdx",
824            ".mkd",
825            ".mkdn",
826            ".mdown",
827            ".mdwn",
828            ".qmd",
829            ".rmd",
830        ];
831
832        // Check each cross-file link in this file
833        for cross_link in &file_index.cross_file_links {
834            // Skip cross-file links without fragments - nothing to validate
835            if cross_link.fragment.is_empty() {
836                continue;
837            }
838
839            // Resolve the target file path relative to the current file
840            let base_target_path = if let Some(parent) = file_path.parent() {
841                parent.join(&cross_link.target_path)
842            } else {
843                Path::new(&cross_link.target_path).to_path_buf()
844            };
845
846            // Normalize the path (remove . and ..)
847            let base_target_path = normalize_path(&base_target_path);
848
849            // For extension-less paths, try resolving with markdown extensions
850            // This handles GitHub-style links like [link](page#section) -> page.md#section
851            let target_paths_to_try = Self::resolve_path_with_extensions(&base_target_path, MARKDOWN_EXTENSIONS);
852
853            // Try to find the target file in the workspace index
854            let mut target_file_index = None;
855
856            for target_path in &target_paths_to_try {
857                if let Some(index) = workspace_index.get_file(target_path) {
858                    target_file_index = Some(index);
859                    break;
860                }
861            }
862
863            if let Some(target_file_index) = target_file_index {
864                // Check if the fragment matches any heading in the target file (O(1) lookup)
865                if !target_file_index.has_anchor(&cross_link.fragment) {
866                    warnings.push(LintWarning {
867                        rule_name: Some(self.name().to_string()),
868                        line: cross_link.line,
869                        column: cross_link.column,
870                        end_line: cross_link.line,
871                        end_column: cross_link.column + cross_link.target_path.len() + 1 + cross_link.fragment.len(),
872                        message: format!(
873                            "Link fragment '{}' not found in '{}'",
874                            cross_link.fragment, cross_link.target_path
875                        ),
876                        severity: Severity::Error,
877                        fix: None,
878                    });
879                }
880            }
881            // If target file not in index, skip (could be external file or not in workspace)
882        }
883
884        Ok(warnings)
885    }
886
887    fn default_config_section(&self) -> Option<(String, toml::Value)> {
888        let value: toml::Value = toml::from_str(
889            r#"
890# Anchor generation style to match your target platform
891# Options: "github" (default), "kramdown-gfm", "kramdown"
892# Note: "jekyll" is accepted as an alias for "kramdown-gfm" (backward compatibility)
893anchor-style = "github"
894"#,
895        )
896        .ok()?;
897        Some(("MD051".to_string(), value))
898    }
899}
900
901#[cfg(test)]
902mod tests {
903    use super::*;
904    use crate::lint_context::LintContext;
905
906    #[test]
907    fn test_quarto_cross_references() {
908        let rule = MD051LinkFragments::new();
909
910        // Test that Quarto cross-references are skipped
911        let content = r#"# Test Document
912
913## Figures
914
915See [@fig-plot] for the visualization.
916
917More details in [@tbl-results] and [@sec-methods].
918
919The equation [@eq-regression] shows the relationship.
920
921Reference to [@lst-code] for implementation."#;
922        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Quarto, None);
923        let result = rule.check(&ctx).unwrap();
924        assert!(
925            result.is_empty(),
926            "Quarto cross-references (@fig-, @tbl-, @sec-, @eq-) should not trigger MD051 warnings. Got {} warnings",
927            result.len()
928        );
929
930        // Test that normal anchors still work
931        let content_with_anchor = r#"# Test
932
933See [link](#test) for details."#;
934        let ctx_anchor = LintContext::new(content_with_anchor, crate::config::MarkdownFlavor::Quarto, None);
935        let result_anchor = rule.check(&ctx_anchor).unwrap();
936        assert!(result_anchor.is_empty(), "Valid anchor should not trigger warning");
937
938        // Test that invalid anchors are still flagged
939        let content_invalid = r#"# Test
940
941See [link](#nonexistent) for details."#;
942        let ctx_invalid = LintContext::new(content_invalid, crate::config::MarkdownFlavor::Quarto, None);
943        let result_invalid = rule.check(&ctx_invalid).unwrap();
944        assert_eq!(result_invalid.len(), 1, "Invalid anchor should still trigger warning");
945    }
946
947    #[test]
948    fn test_jsx_in_heading_anchor() {
949        // Issue #510: JSX/HTML tags in headings should be stripped for anchor generation
950        let rule = MD051LinkFragments::new();
951
952        // Self-closing JSX tag
953        let content = "# Test\n\n### `retentionPolicy`<Component />\n\n[link](#retentionpolicy)\n";
954        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
955        let result = rule.check(&ctx).unwrap();
956        assert!(
957            result.is_empty(),
958            "JSX self-closing tag should be stripped from anchor: got {result:?}"
959        );
960
961        // JSX with attributes
962        let content2 =
963            "### retentionPolicy<HeaderTag type=\"danger\" text=\"required\" />\n\n[link](#retentionpolicy)\n";
964        let ctx2 = LintContext::new(content2, crate::config::MarkdownFlavor::Standard, None);
965        let result2 = rule.check(&ctx2).unwrap();
966        assert!(
967            result2.is_empty(),
968            "JSX tag with attributes should be stripped from anchor: got {result2:?}"
969        );
970
971        // HTML tags with inner text preserved
972        let content3 = "### Test <span>extra</span>\n\n[link](#test-extra)\n";
973        let ctx3 = LintContext::new(content3, crate::config::MarkdownFlavor::Standard, None);
974        let result3 = rule.check(&ctx3).unwrap();
975        assert!(
976            result3.is_empty(),
977            "HTML tag content should be preserved in anchor: got {result3:?}"
978        );
979    }
980
981    // Cross-file validation tests
982    #[test]
983    fn test_cross_file_scope() {
984        let rule = MD051LinkFragments::new();
985        assert_eq!(rule.cross_file_scope(), CrossFileScope::Workspace);
986    }
987
988    #[test]
989    fn test_contribute_to_index_extracts_headings() {
990        let rule = MD051LinkFragments::new();
991        let content = "# First Heading\n\n# Second { #custom }\n\n## Third";
992        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
993
994        let mut file_index = FileIndex::new();
995        rule.contribute_to_index(&ctx, &mut file_index);
996
997        assert_eq!(file_index.headings.len(), 3);
998        assert_eq!(file_index.headings[0].text, "First Heading");
999        assert_eq!(file_index.headings[0].auto_anchor, "first-heading");
1000        assert!(file_index.headings[0].custom_anchor.is_none());
1001
1002        assert_eq!(file_index.headings[1].text, "Second");
1003        assert_eq!(file_index.headings[1].custom_anchor, Some("custom".to_string()));
1004
1005        assert_eq!(file_index.headings[2].text, "Third");
1006    }
1007
1008    #[test]
1009    fn test_contribute_to_index_extracts_cross_file_links() {
1010        let rule = MD051LinkFragments::new();
1011        let content = "See [docs](other.md#installation) and [more](../guide.md#getting-started)";
1012        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
1013
1014        let mut file_index = FileIndex::new();
1015        rule.contribute_to_index(&ctx, &mut file_index);
1016
1017        assert_eq!(file_index.cross_file_links.len(), 2);
1018        assert_eq!(file_index.cross_file_links[0].target_path, "other.md");
1019        assert_eq!(file_index.cross_file_links[0].fragment, "installation");
1020        assert_eq!(file_index.cross_file_links[1].target_path, "../guide.md");
1021        assert_eq!(file_index.cross_file_links[1].fragment, "getting-started");
1022    }
1023
1024    #[test]
1025    fn test_cross_file_check_valid_fragment() {
1026        use crate::workspace_index::WorkspaceIndex;
1027
1028        let rule = MD051LinkFragments::new();
1029
1030        // Build workspace index with target file
1031        let mut workspace_index = WorkspaceIndex::new();
1032        let mut target_file_index = FileIndex::new();
1033        target_file_index.add_heading(HeadingIndex {
1034            text: "Installation Guide".to_string(),
1035            auto_anchor: "installation-guide".to_string(),
1036            custom_anchor: None,
1037            line: 1,
1038            is_setext: false,
1039        });
1040        workspace_index.insert_file(PathBuf::from("docs/install.md"), target_file_index);
1041
1042        // Create a FileIndex for the file being checked
1043        let mut current_file_index = FileIndex::new();
1044        current_file_index.add_cross_file_link(CrossFileLinkIndex {
1045            target_path: "install.md".to_string(),
1046            fragment: "installation-guide".to_string(),
1047            line: 3,
1048            column: 5,
1049        });
1050
1051        let warnings = rule
1052            .cross_file_check(Path::new("docs/readme.md"), &current_file_index, &workspace_index)
1053            .unwrap();
1054
1055        // Should find no warnings since fragment exists
1056        assert!(warnings.is_empty());
1057    }
1058
1059    #[test]
1060    fn test_cross_file_check_invalid_fragment() {
1061        use crate::workspace_index::WorkspaceIndex;
1062
1063        let rule = MD051LinkFragments::new();
1064
1065        // Build workspace index with target file
1066        let mut workspace_index = WorkspaceIndex::new();
1067        let mut target_file_index = FileIndex::new();
1068        target_file_index.add_heading(HeadingIndex {
1069            text: "Installation Guide".to_string(),
1070            auto_anchor: "installation-guide".to_string(),
1071            custom_anchor: None,
1072            line: 1,
1073            is_setext: false,
1074        });
1075        workspace_index.insert_file(PathBuf::from("docs/install.md"), target_file_index);
1076
1077        // Create a FileIndex with a cross-file link pointing to non-existent fragment
1078        let mut current_file_index = FileIndex::new();
1079        current_file_index.add_cross_file_link(CrossFileLinkIndex {
1080            target_path: "install.md".to_string(),
1081            fragment: "nonexistent".to_string(),
1082            line: 3,
1083            column: 5,
1084        });
1085
1086        let warnings = rule
1087            .cross_file_check(Path::new("docs/readme.md"), &current_file_index, &workspace_index)
1088            .unwrap();
1089
1090        // Should find one warning since fragment doesn't exist
1091        assert_eq!(warnings.len(), 1);
1092        assert!(warnings[0].message.contains("nonexistent"));
1093        assert!(warnings[0].message.contains("install.md"));
1094    }
1095
1096    #[test]
1097    fn test_cross_file_check_custom_anchor_match() {
1098        use crate::workspace_index::WorkspaceIndex;
1099
1100        let rule = MD051LinkFragments::new();
1101
1102        // Build workspace index with target file that has custom anchor
1103        let mut workspace_index = WorkspaceIndex::new();
1104        let mut target_file_index = FileIndex::new();
1105        target_file_index.add_heading(HeadingIndex {
1106            text: "Installation Guide".to_string(),
1107            auto_anchor: "installation-guide".to_string(),
1108            custom_anchor: Some("install".to_string()),
1109            line: 1,
1110            is_setext: false,
1111        });
1112        workspace_index.insert_file(PathBuf::from("docs/install.md"), target_file_index);
1113
1114        // Link uses custom anchor
1115        let mut current_file_index = FileIndex::new();
1116        current_file_index.add_cross_file_link(CrossFileLinkIndex {
1117            target_path: "install.md".to_string(),
1118            fragment: "install".to_string(),
1119            line: 3,
1120            column: 5,
1121        });
1122
1123        let warnings = rule
1124            .cross_file_check(Path::new("docs/readme.md"), &current_file_index, &workspace_index)
1125            .unwrap();
1126
1127        // Should find no warnings since custom anchor matches
1128        assert!(warnings.is_empty());
1129    }
1130
1131    #[test]
1132    fn test_cross_file_check_target_not_in_workspace() {
1133        use crate::workspace_index::WorkspaceIndex;
1134
1135        let rule = MD051LinkFragments::new();
1136
1137        // Empty workspace index
1138        let workspace_index = WorkspaceIndex::new();
1139
1140        // Link to file not in workspace
1141        let mut current_file_index = FileIndex::new();
1142        current_file_index.add_cross_file_link(CrossFileLinkIndex {
1143            target_path: "external.md".to_string(),
1144            fragment: "heading".to_string(),
1145            line: 3,
1146            column: 5,
1147        });
1148
1149        let warnings = rule
1150            .cross_file_check(Path::new("docs/readme.md"), &current_file_index, &workspace_index)
1151            .unwrap();
1152
1153        // Should not warn about files not in workspace
1154        assert!(warnings.is_empty());
1155    }
1156
1157    #[test]
1158    fn test_wikilinks_skipped_in_check() {
1159        // Wikilinks should not trigger MD051 warnings for missing fragments
1160        let rule = MD051LinkFragments::new();
1161
1162        let content = r#"# Test Document
1163
1164## Valid Heading
1165
1166[[Microsoft#Windows OS]]
1167[[SomePage#section]]
1168[[page|Display Text]]
1169[[path/to/page#section]]
1170"#;
1171        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
1172        let result = rule.check(&ctx).unwrap();
1173
1174        assert!(
1175            result.is_empty(),
1176            "Wikilinks should not trigger MD051 warnings. Got: {result:?}"
1177        );
1178    }
1179
1180    #[test]
1181    fn test_wikilinks_not_added_to_cross_file_index() {
1182        // Wikilinks should not be added to the cross-file link index
1183        let rule = MD051LinkFragments::new();
1184
1185        let content = r#"# Test Document
1186
1187[[Microsoft#Windows OS]]
1188[[SomePage#section]]
1189[Regular Link](other.md#section)
1190"#;
1191        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
1192
1193        let mut file_index = FileIndex::new();
1194        rule.contribute_to_index(&ctx, &mut file_index);
1195
1196        // Should only have one cross-file link (the regular markdown link)
1197        // Wikilinks should not be added
1198        let cross_file_links = &file_index.cross_file_links;
1199        assert_eq!(
1200            cross_file_links.len(),
1201            1,
1202            "Only regular markdown links should be indexed, not wikilinks. Got: {cross_file_links:?}"
1203        );
1204        assert_eq!(file_index.cross_file_links[0].target_path, "other.md");
1205        assert_eq!(file_index.cross_file_links[0].fragment, "section");
1206    }
1207}
rumdl_lib/rules/md051_link_fragments.rs

rumdl_lib/rules/
md051_link_fragments.rs