Skip to main content

rumdl_lib/rules/
md051_link_fragments.rs

1use crate::rule::{CrossFileScope, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
2use crate::utils::anchor_styles::AnchorStyle;
3use crate::workspace_index::{CrossFileLinkIndex, FileIndex, HeadingIndex};
4use pulldown_cmark::LinkType;
5use regex::Regex;
6use std::collections::{HashMap, HashSet};
7use std::path::{Component, Path, PathBuf};
8use std::sync::LazyLock;
// Matches an `id="..."` or `name="..."` attribute (single or double quotes, optional
// whitespace around `=`) on any HTML element, not just <a>. Group 1 is the attribute value.
// Callers apply it to one tag at a time and use only the first match in that tag.
// NOTE(review): `\b` also holds right after a `-`, so attributes such as `data-id="..."`
// look like they match too - confirm whether that is intended.
static HTML_ANCHOR_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"\b(?:id|name)\s*=\s*["']([^"']+)["']"#).unwrap());

// Attribute anchor pattern for kramdown/MkDocs { #id } syntax.
// Matches {#id} or { #id } with optional spaces; group 1 is the id, which must start
// with an ASCII letter. Anything after the id up to the closing brace is ignored, so
// classes and attributes are accepted: { #id .class key=value }.
// Several brace groups on one line are found by iterating over all matches.
static ATTR_ANCHOR_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"\{\s*#([a-zA-Z][a-zA-Z0-9_-]*)[^}]*\}"#).unwrap());

// Material for MkDocs setting anchor pattern: <!-- md:setting NAME -->
// Used in headings to generate anchors for configuration option references.
// Group 1 is NAME (a run of non-whitespace characters).
static MD_SETTING_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"<!--\s*md:setting\s+([^\s]+)\s*-->").unwrap());
24
/// Lexically normalize a path by resolving `.` and `..` components.
///
/// No filesystem access is performed, so symlinks are not followed.
///
/// - `.` components are dropped.
/// - `..` removes the preceding normal component when there is one.
/// - `..` directly after a root (or prefix) is dropped: there is nothing above the root.
/// - `..` that would climb above the start of a relative path is kept (`../a` stays
///   `../a`). Discarding it would make the result point at a different file.
fn normalize_path(path: &Path) -> PathBuf {
    let mut result = PathBuf::new();
    for component in path.components() {
        match component {
            Component::CurDir => {} // Skip .
            Component::ParentDir => {
                // Inspect what the `..` would cancel before mutating `result`.
                let last = result.components().next_back();
                let can_pop = matches!(last, Some(Component::Normal(_)));
                let at_root = matches!(last, Some(Component::RootDir | Component::Prefix(_)));
                if can_pop {
                    result.pop();
                } else if !at_root {
                    // Empty so far, or already a run of `..`: keep climbing.
                    result.push("..");
                }
            }
            other => result.push(other.as_os_str()),
        }
    }
    result
}
39
/// Rule MD051: Link fragments
///
/// See [docs/md051.md](../../docs/md051.md) for full documentation, configuration, and examples.
///
/// This rule validates that link anchors (the part after #) point to existing headings.
/// Supports both same-document anchors and cross-file fragment links when linting a workspace.
#[derive(Clone)]
pub struct MD051LinkFragments {
    /// Anchor style used to turn heading text into fragment slugs.
    /// `AnchorStyle::PythonMarkdown` additionally switches duplicate-heading
    /// suffixes to the `_N` form.
    anchor_style: AnchorStyle,
}
51
impl Default for MD051LinkFragments {
    /// Equivalent to [`MD051LinkFragments::new`].
    fn default() -> Self {
        Self::new()
    }
}
57
58impl MD051LinkFragments {
59    pub fn new() -> Self {
60        Self {
61            anchor_style: AnchorStyle::GitHub,
62        }
63    }
64
    /// Create the rule with a specific anchor style.
    ///
    /// `style` is stored as-is and used for every fragment generated by this instance.
    pub fn with_anchor_style(style: AnchorStyle) -> Self {
        Self { anchor_style: style }
    }
69
70    /// Parse ATX heading content from blockquote inner text.
71    /// Strips the leading `# ` marker, optional closing hash sequence, and extracts custom IDs.
72    /// Returns `(clean_text, custom_id)` or None if not a heading.
73    fn parse_blockquote_heading(bq_content: &str) -> Option<(String, Option<String>)> {
74        static BQ_ATX_HEADING_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(#{1,6})\s+(.*)$").unwrap());
75
76        let trimmed = bq_content.trim();
77        let caps = BQ_ATX_HEADING_RE.captures(trimmed)?;
78        let mut rest = caps.get(2).map_or("", |m| m.as_str()).to_string();
79
80        // Strip optional closing hash sequence (CommonMark: trailing `#`s preceded by a space)
81        let rest_trimmed = rest.trim_end();
82        if let Some(last_hash_pos) = rest_trimmed.rfind('#') {
83            let after_hashes = &rest_trimmed[last_hash_pos..];
84            if after_hashes.chars().all(|c| c == '#') {
85                // Find where the consecutive trailing hashes start
86                let mut hash_start = last_hash_pos;
87                while hash_start > 0 && rest_trimmed.as_bytes()[hash_start - 1] == b'#' {
88                    hash_start -= 1;
89                }
90                // Must be preceded by whitespace (or be the entire content)
91                if hash_start == 0
92                    || rest_trimmed
93                        .as_bytes()
94                        .get(hash_start - 1)
95                        .is_some_and(|b| b.is_ascii_whitespace())
96                {
97                    rest = rest_trimmed[..hash_start].trim_end().to_string();
98                }
99            }
100        }
101
102        let (clean_text, custom_id) = crate::utils::header_id_utils::extract_header_id(&rest);
103        Some((clean_text, custom_id))
104    }
105
106    /// Insert a heading fragment with deduplication.
107    /// When `use_underscore_dedup` is true (Python-Markdown/MkDocs), the primary suffix
108    /// uses `_N` and `-N` is registered as a fallback. Otherwise, only `-N` is used.
109    ///
110    /// Empty fragments (from CJK-only headings) are handled specially for Python-Markdown:
111    /// the first empty slug gets `_1`, the second `_2`, etc. (matching Python-Markdown's
112    /// `unique()` function which always enters the dedup loop for falsy IDs).
113    fn insert_deduplicated_fragment(
114        fragment: String,
115        fragment_counts: &mut HashMap<String, usize>,
116        markdown_headings: &mut HashSet<String>,
117        use_underscore_dedup: bool,
118    ) {
119        if fragment.is_empty() {
120            if !use_underscore_dedup {
121                return;
122            }
123            // Python-Markdown: empty slug → _1, _2, _3, ...
124            let count = fragment_counts.entry(fragment).or_insert(0);
125            *count += 1;
126            markdown_headings.insert(format!("_{count}"));
127            return;
128        }
129        if let Some(count) = fragment_counts.get_mut(&fragment) {
130            let suffix = *count;
131            *count += 1;
132            if use_underscore_dedup {
133                // Python-Markdown primary: heading_1, heading_2
134                markdown_headings.insert(format!("{fragment}_{suffix}"));
135                // Also accept GitHub-style for compatibility
136                markdown_headings.insert(format!("{fragment}-{suffix}"));
137            } else {
138                // GitHub-style: heading-1, heading-2
139                markdown_headings.insert(format!("{fragment}-{suffix}"));
140            }
141        } else {
142            fragment_counts.insert(fragment.clone(), 1);
143            markdown_headings.insert(fragment);
144        }
145    }
146
147    /// Add a heading to the cross-file index with proper deduplication.
148    /// When `use_underscore_dedup` is true (Python-Markdown/MkDocs), the primary anchor
149    /// uses `_N` and `-N` is registered as a fallback alias.
150    ///
151    /// Empty fragments (from CJK-only headings) get `_1`, `_2`, etc. in Python-Markdown mode.
152    fn add_heading_to_index(
153        fragment: &str,
154        text: &str,
155        custom_anchor: Option<String>,
156        line: usize,
157        fragment_counts: &mut HashMap<String, usize>,
158        file_index: &mut FileIndex,
159        use_underscore_dedup: bool,
160    ) {
161        if fragment.is_empty() {
162            if !use_underscore_dedup {
163                return;
164            }
165            // Python-Markdown: empty slug → _1, _2, _3, ...
166            let count = fragment_counts.entry(fragment.to_string()).or_insert(0);
167            *count += 1;
168            file_index.add_heading(HeadingIndex {
169                text: text.to_string(),
170                auto_anchor: format!("_{count}"),
171                custom_anchor,
172                line,
173            });
174            return;
175        }
176        if let Some(count) = fragment_counts.get_mut(fragment) {
177            let suffix = *count;
178            *count += 1;
179            let (primary, alias) = if use_underscore_dedup {
180                // Python-Markdown primary: heading_1; GitHub fallback: heading-1
181                (format!("{fragment}_{suffix}"), Some(format!("{fragment}-{suffix}")))
182            } else {
183                // GitHub-style primary: heading-1
184                (format!("{fragment}-{suffix}"), None)
185            };
186            file_index.add_heading(HeadingIndex {
187                text: text.to_string(),
188                auto_anchor: primary,
189                custom_anchor,
190                line,
191            });
192            if let Some(alias_anchor) = alias {
193                let heading_idx = file_index.headings.len() - 1;
194                file_index.add_anchor_alias(alias_anchor, heading_idx);
195            }
196        } else {
197            fragment_counts.insert(fragment.to_string(), 1);
198            file_index.add_heading(HeadingIndex {
199                text: text.to_string(),
200                auto_anchor: fragment.to_string(),
201                custom_anchor,
202                line,
203            });
204        }
205    }
206
207    /// Extract all valid heading anchors from the document
208    /// Returns (markdown_anchors, html_anchors) where markdown_anchors are lowercased
209    /// for case-insensitive matching, and html_anchors are case-sensitive
210    fn extract_headings_from_context(
211        &self,
212        ctx: &crate::lint_context::LintContext,
213    ) -> (HashSet<String>, HashSet<String>) {
214        let mut markdown_headings = HashSet::with_capacity(32);
215        let mut html_anchors = HashSet::with_capacity(16);
216        let mut fragment_counts = std::collections::HashMap::new();
217        let use_underscore_dedup = self.anchor_style == AnchorStyle::PythonMarkdown;
218
219        for line_info in &ctx.lines {
220            if line_info.in_front_matter {
221                continue;
222            }
223
224            // Skip code blocks for anchor extraction
225            if line_info.in_code_block {
226                continue;
227            }
228
229            let content = line_info.content(ctx.content);
230            let bytes = content.as_bytes();
231
232            // Extract HTML anchor tags with id/name attributes
233            if bytes.contains(&b'<') && (content.contains("id=") || content.contains("name=")) {
234                // HTML spec: only the first id attribute per element is valid
235                // Process element by element to handle multiple id attributes correctly
236                let mut pos = 0;
237                while pos < content.len() {
238                    if let Some(start) = content[pos..].find('<') {
239                        let tag_start = pos + start;
240                        if let Some(end) = content[tag_start..].find('>') {
241                            let tag_end = tag_start + end + 1;
242                            let tag = &content[tag_start..tag_end];
243
244                            // Extract first id or name attribute from this tag
245                            if let Some(caps) = HTML_ANCHOR_PATTERN.find(tag) {
246                                let matched_text = caps.as_str();
247                                if let Some(caps) = HTML_ANCHOR_PATTERN.captures(matched_text)
248                                    && let Some(id_match) = caps.get(1)
249                                {
250                                    let id = id_match.as_str();
251                                    if !id.is_empty() {
252                                        html_anchors.insert(id.to_string());
253                                    }
254                                }
255                            }
256                            pos = tag_end;
257                        } else {
258                            break;
259                        }
260                    } else {
261                        break;
262                    }
263                }
264            }
265
266            // Extract attribute anchors { #id } from non-heading lines
267            // Headings already have custom_id extracted below
268            if line_info.heading.is_none() && content.contains('{') && content.contains('#') {
269                for caps in ATTR_ANCHOR_PATTERN.captures_iter(content) {
270                    if let Some(id_match) = caps.get(1) {
271                        // Add to markdown_headings (lowercased for case-insensitive matching)
272                        markdown_headings.insert(id_match.as_str().to_lowercase());
273                    }
274                }
275            }
276
277            // Extract heading anchors from blockquote content
278            // Blockquote headings (e.g., "> ## Heading") are not detected by the main heading parser
279            // because the regex operates on the full line, but they still generate valid anchors
280            if line_info.heading.is_none()
281                && let Some(bq) = &line_info.blockquote
282                && let Some((clean_text, custom_id)) = Self::parse_blockquote_heading(&bq.content)
283            {
284                if let Some(id) = custom_id {
285                    markdown_headings.insert(id.to_lowercase());
286                }
287                let fragment = self.anchor_style.generate_fragment(&clean_text);
288                Self::insert_deduplicated_fragment(
289                    fragment,
290                    &mut fragment_counts,
291                    &mut markdown_headings,
292                    use_underscore_dedup,
293                );
294            }
295
296            // Extract markdown heading anchors
297            if let Some(heading) = &line_info.heading {
298                // Custom ID from {#custom-id} syntax
299                if let Some(custom_id) = &heading.custom_id {
300                    markdown_headings.insert(custom_id.to_lowercase());
301                }
302
303                // Generate fragment directly from heading text
304                // Note: HTML stripping was removed because it interfered with arrow patterns
305                // like <-> and placeholders like <FILE>. The anchor styles handle these correctly.
306                let fragment = self.anchor_style.generate_fragment(&heading.text);
307
308                Self::insert_deduplicated_fragment(
309                    fragment,
310                    &mut fragment_counts,
311                    &mut markdown_headings,
312                    use_underscore_dedup,
313                );
314            }
315        }
316
317        (markdown_headings, html_anchors)
318    }
319
320    /// Fast check if URL is external (doesn't need to be validated)
321    #[inline]
322    fn is_external_url_fast(url: &str) -> bool {
323        // Quick prefix checks for common protocols
324        url.starts_with("http://")
325            || url.starts_with("https://")
326            || url.starts_with("ftp://")
327            || url.starts_with("mailto:")
328            || url.starts_with("tel:")
329            || url.starts_with("//")
330    }
331
332    /// Resolve a path by trying markdown extensions if it has no extension
333    ///
334    /// For extension-less paths (e.g., `page`), returns a list of paths to try:
335    /// 1. The original path (in case it's already in the index)
336    /// 2. The path with each markdown extension (e.g., `page.md`, `page.markdown`, etc.)
337    ///
338    /// For paths with extensions, returns just the original path.
339    #[inline]
340    fn resolve_path_with_extensions(path: &Path, extensions: &[&str]) -> Vec<PathBuf> {
341        if path.extension().is_none() {
342            // Extension-less path - try with markdown extensions
343            let mut paths = Vec::with_capacity(extensions.len() + 1);
344            // First try the exact path (in case it's already in the index)
345            paths.push(path.to_path_buf());
346            // Then try with each markdown extension
347            for ext in extensions {
348                let path_with_ext = path.with_extension(&ext[1..]); // Remove leading dot
349                paths.push(path_with_ext);
350            }
351            paths
352        } else {
353            // Path has extension - use as-is
354            vec![path.to_path_buf()]
355        }
356    }
357
358    /// Check if a path part (without fragment) is an extension-less path
359    ///
360    /// Extension-less paths are potential cross-file links that need resolution
361    /// with markdown extensions (e.g., `page#section` -> `page.md#section`).
362    ///
363    /// We recognize them as extension-less if:
364    /// 1. Path has no extension (no dot)
365    /// 2. Path is not empty
366    /// 3. Path doesn't look like a query parameter or special syntax
367    /// 4. Path contains at least one alphanumeric character (valid filename)
368    /// 5. Path contains only valid path characters (alphanumeric, slashes, hyphens, underscores)
369    ///
370    /// Optimized: single pass through characters to check both conditions.
371    #[inline]
372    fn is_extensionless_path(path_part: &str) -> bool {
373        // Quick rejections for common non-extension-less cases
374        if path_part.is_empty()
375            || path_part.contains('.')
376            || path_part.contains('?')
377            || path_part.contains('&')
378            || path_part.contains('=')
379        {
380            return false;
381        }
382
383        // Single pass: check for alphanumeric and validate all characters
384        let mut has_alphanumeric = false;
385        for c in path_part.chars() {
386            if c.is_alphanumeric() {
387                has_alphanumeric = true;
388            } else if !matches!(c, '/' | '\\' | '-' | '_') {
389                // Invalid character found - early exit
390                return false;
391            }
392        }
393
394        // Must have at least one alphanumeric character to be a valid filename
395        has_alphanumeric
396    }
397
398    /// Check if URL is a cross-file link (contains a file path before #)
399    #[inline]
400    fn is_cross_file_link(url: &str) -> bool {
401        if let Some(fragment_pos) = url.find('#') {
402            let path_part = &url[..fragment_pos];
403
404            // If there's no path part, it's just a fragment (#heading)
405            if path_part.is_empty() {
406                return false;
407            }
408
409            // Check for Liquid syntax used by Jekyll and other static site generators
410            // Liquid tags: {% ... %} for control flow and includes
411            // Liquid variables: {{ ... }} for outputting values
412            // These are template directives that reference external content and should be skipped
413            // We check for proper bracket order to avoid false positives
414            if let Some(tag_start) = path_part.find("{%")
415                && path_part[tag_start + 2..].contains("%}")
416            {
417                return true;
418            }
419            if let Some(var_start) = path_part.find("{{")
420                && path_part[var_start + 2..].contains("}}")
421            {
422                return true;
423            }
424
425            // Check if it's an absolute path (starts with /)
426            // These are links to other pages on the same site
427            if path_part.starts_with('/') {
428                return true;
429            }
430
431            // Check if it looks like a file path:
432            // - Contains a file extension (dot followed by letters)
433            // - Contains path separators
434            // - Contains relative path indicators
435            // - OR is an extension-less path with a fragment (GitHub-style: page#section)
436            let has_extension = path_part.contains('.')
437                && (
438                    // Has file extension pattern (handle query parameters by splitting on them first)
439                    {
440                    let clean_path = path_part.split('?').next().unwrap_or(path_part);
441                    // Handle files starting with dot
442                    if let Some(after_dot) = clean_path.strip_prefix('.') {
443                        let dots_count = clean_path.matches('.').count();
444                        if dots_count == 1 {
445                            // Could be ".ext" (file extension) or ".hidden" (hidden file)
446                            // Treat short alphanumeric suffixes as file extensions
447                            !after_dot.is_empty() && after_dot.len() <= 10 &&
448                            after_dot.chars().all(|c| c.is_ascii_alphanumeric())
449                        } else {
450                            // Hidden file with extension like ".hidden.txt"
451                            clean_path.split('.').next_back().is_some_and(|ext| {
452                                !ext.is_empty() && ext.len() <= 10 && ext.chars().all(|c| c.is_ascii_alphanumeric())
453                            })
454                        }
455                    } else {
456                        // Regular file path
457                        clean_path.split('.').next_back().is_some_and(|ext| {
458                            !ext.is_empty() && ext.len() <= 10 && ext.chars().all(|c| c.is_ascii_alphanumeric())
459                        })
460                    }
461                } ||
462                // Or contains path separators
463                path_part.contains('/') || path_part.contains('\\') ||
464                // Or starts with relative path indicators
465                path_part.starts_with("./") || path_part.starts_with("../")
466                );
467
468            // Extension-less paths with fragments are potential cross-file links
469            // This supports GitHub-style links like [link](page#section) that resolve to page.md#section
470            let is_extensionless = Self::is_extensionless_path(path_part);
471
472            has_extension || is_extensionless
473        } else {
474            false
475        }
476    }
477}
478
479impl Rule for MD051LinkFragments {
    /// Rule identifier: `"MD051"`.
    fn name(&self) -> &'static str {
        "MD051"
    }
483
    /// One-line, human-readable summary of what the rule enforces.
    fn description(&self) -> &'static str {
        "Link fragments should reference valid headings"
    }
487
488    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
489        // Skip if no link fragments present
490        if !ctx.likely_has_links_or_images() {
491            return true;
492        }
493        // Check for # character (fragments)
494        !ctx.has_char('#')
495    }
496
497    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
498        let mut warnings = Vec::new();
499
500        if ctx.content.is_empty() || ctx.links.is_empty() || self.should_skip(ctx) {
501            return Ok(warnings);
502        }
503
504        let (markdown_headings, html_anchors) = self.extract_headings_from_context(ctx);
505
506        for link in &ctx.links {
507            if link.is_reference {
508                continue;
509            }
510
511            // Skip links inside PyMdown blocks (MkDocs flavor)
512            if ctx.line_info(link.line).is_some_and(|info| info.in_pymdown_block) {
513                continue;
514            }
515
516            // Skip wiki-links - they reference other files and may have their own fragment validation
517            if matches!(link.link_type, LinkType::WikiLink { .. }) {
518                continue;
519            }
520
521            // Skip links inside Jinja templates
522            if ctx.is_in_jinja_range(link.byte_offset) {
523                continue;
524            }
525
526            // Skip Quarto/Pandoc citations ([@citation], @citation)
527            // Citations are bibliography references, not link fragments
528            if ctx.flavor == crate::config::MarkdownFlavor::Quarto && ctx.is_in_citation(link.byte_offset) {
529                continue;
530            }
531
532            // Skip links inside shortcodes ({{< ... >}} or {{% ... %}})
533            // Shortcodes may contain template syntax that looks like fragment links
534            if ctx.is_in_shortcode(link.byte_offset) {
535                continue;
536            }
537
538            let url = &link.url;
539
540            // Skip links without fragments or external URLs
541            if !url.contains('#') || Self::is_external_url_fast(url) {
542                continue;
543            }
544
545            // Skip mdbook template placeholders ({{#VARIABLE}})
546            // mdbook uses {{#VARIABLE}} syntax where # is part of the template, not a fragment
547            if url.contains("{{#") && url.contains("}}") {
548                continue;
549            }
550
551            // Skip Quarto/RMarkdown cross-references (@fig-, @tbl-, @sec-, @eq-, etc.)
552            // These are special cross-reference syntax, not HTML anchors
553            // Format: @prefix-identifier or just @identifier
554            if url.starts_with('@') {
555                continue;
556            }
557
558            // Cross-file links are valid if the file exists (not checked here)
559            if Self::is_cross_file_link(url) {
560                continue;
561            }
562
563            let Some(fragment_pos) = url.find('#') else {
564                continue;
565            };
566
567            let fragment = &url[fragment_pos + 1..];
568
569            // Skip Liquid template variables and filters
570            if (url.contains("{{") && fragment.contains('|')) || fragment.ends_with("}}") || fragment.ends_with("%}") {
571                continue;
572            }
573
574            if fragment.is_empty() {
575                continue;
576            }
577
578            // Skip MkDocs runtime-generated anchors:
579            // - #fn:NAME, #fnref:NAME from the footnotes extension
580            // - #+key.path or #+key:value from Material for MkDocs option references
581            //   (e.g., #+type:abstract, #+toc.slugify, #+pymdownx.highlight.anchor_linenums)
582            if ctx.flavor == crate::config::MarkdownFlavor::MkDocs
583                && (fragment.starts_with("fn:")
584                    || fragment.starts_with("fnref:")
585                    || (fragment.starts_with('+') && (fragment.contains('.') || fragment.contains(':'))))
586            {
587                continue;
588            }
589
590            // Validate fragment against document headings
591            // HTML anchors are case-sensitive, markdown anchors are case-insensitive
592            let found = if html_anchors.contains(fragment) {
593                true
594            } else {
595                let fragment_lower = fragment.to_lowercase();
596                markdown_headings.contains(&fragment_lower)
597            };
598
599            if !found {
600                warnings.push(LintWarning {
601                    rule_name: Some(self.name().to_string()),
602                    message: format!("Link anchor '#{fragment}' does not exist in document headings"),
603                    line: link.line,
604                    column: link.start_col + 1,
605                    end_line: link.line,
606                    end_column: link.end_col + 1,
607                    severity: Severity::Error,
608                    fix: None,
609                });
610            }
611        }
612
613        Ok(warnings)
614    }
615
616    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
617        // MD051 does not provide auto-fix
618        // Link fragment corrections require human judgment to avoid incorrect fixes
619        Ok(ctx.content.to_string())
620    }
621
    /// Expose the rule as `&dyn Any` so callers can downcast to the concrete type.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
625
626    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
627    where
628        Self: Sized,
629    {
630        // Config keys are normalized to kebab-case by the config system
631        let explicit_style = config
632            .rules
633            .get("MD051")
634            .and_then(|rc| rc.values.get("anchor-style"))
635            .and_then(|v| v.as_str())
636            .map(|style_str| match style_str.to_lowercase().as_str() {
637                "kramdown" => AnchorStyle::Kramdown,
638                "kramdown-gfm" | "jekyll" => AnchorStyle::KramdownGfm,
639                "python-markdown" | "python_markdown" | "mkdocs" => AnchorStyle::PythonMarkdown,
640                _ => AnchorStyle::GitHub,
641            });
642
643        // When MkDocs flavor is active and no explicit anchor style is configured,
644        // default to PythonMarkdown (since MkDocs uses Python-Markdown's toc extension)
645        let anchor_style = explicit_style.unwrap_or_else(|| {
646            if config.global.flavor == crate::config::MarkdownFlavor::MkDocs {
647                AnchorStyle::PythonMarkdown
648            } else {
649                AnchorStyle::GitHub
650            }
651        });
652
653        Box::new(MD051LinkFragments::with_anchor_style(anchor_style))
654    }
655
    /// This rule belongs to the link category.
    fn category(&self) -> RuleCategory {
        RuleCategory::Link
    }
659
    /// Declares workspace-wide cross-file scope for this rule.
    fn cross_file_scope(&self) -> CrossFileScope {
        CrossFileScope::Workspace
    }
663
664    fn contribute_to_index(&self, ctx: &crate::lint_context::LintContext, file_index: &mut FileIndex) {
665        let mut fragment_counts = HashMap::new();
666        let use_underscore_dedup = self.anchor_style == AnchorStyle::PythonMarkdown;
667
668        // Extract headings, HTML anchors, and attribute anchors (for other files to reference)
669        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
670            if line_info.in_front_matter {
671                continue;
672            }
673
674            // Skip code blocks for anchor extraction
675            if line_info.in_code_block {
676                continue;
677            }
678
679            let content = line_info.content(ctx.content);
680
681            // Extract HTML anchors (id or name attributes on any element)
682            if content.contains('<') && (content.contains("id=") || content.contains("name=")) {
683                let mut pos = 0;
684                while pos < content.len() {
685                    if let Some(start) = content[pos..].find('<') {
686                        let tag_start = pos + start;
687                        if let Some(end) = content[tag_start..].find('>') {
688                            let tag_end = tag_start + end + 1;
689                            let tag = &content[tag_start..tag_end];
690
691                            if let Some(caps) = HTML_ANCHOR_PATTERN.captures(tag)
692                                && let Some(id_match) = caps.get(1)
693                            {
694                                file_index.add_html_anchor(id_match.as_str().to_string());
695                            }
696                            pos = tag_end;
697                        } else {
698                            break;
699                        }
700                    } else {
701                        break;
702                    }
703                }
704            }
705
706            // Extract attribute anchors { #id } on non-heading lines
707            // Headings already have custom_id extracted via heading.custom_id
708            if line_info.heading.is_none() && content.contains("{") && content.contains("#") {
709                for caps in ATTR_ANCHOR_PATTERN.captures_iter(content) {
710                    if let Some(id_match) = caps.get(1) {
711                        file_index.add_attribute_anchor(id_match.as_str().to_string());
712                    }
713                }
714            }
715
716            // Extract heading anchors from blockquote content
717            if line_info.heading.is_none()
718                && let Some(bq) = &line_info.blockquote
719                && let Some((clean_text, custom_id)) = Self::parse_blockquote_heading(&bq.content)
720            {
721                let fragment = self.anchor_style.generate_fragment(&clean_text);
722                Self::add_heading_to_index(
723                    &fragment,
724                    &clean_text,
725                    custom_id,
726                    line_idx + 1,
727                    &mut fragment_counts,
728                    file_index,
729                    use_underscore_dedup,
730                );
731            }
732
733            // Extract heading anchors
734            if let Some(heading) = &line_info.heading {
735                let fragment = self.anchor_style.generate_fragment(&heading.text);
736
737                Self::add_heading_to_index(
738                    &fragment,
739                    &heading.text,
740                    heading.custom_id.clone(),
741                    line_idx + 1,
742                    &mut fragment_counts,
743                    file_index,
744                    use_underscore_dedup,
745                );
746
747                // Extract Material for MkDocs setting anchors from headings.
748                // These are rendered as anchors at build time by Material's JS.
749                // Most references use #+key.path format (handled by the skip logic in check()),
750                // but this extraction enables cross-file validation for direct #key.path references.
751                if ctx.flavor == crate::config::MarkdownFlavor::MkDocs
752                    && let Some(caps) = MD_SETTING_PATTERN.captures(content)
753                    && let Some(name) = caps.get(1)
754                {
755                    file_index.add_html_anchor(name.as_str().to_string());
756                }
757            }
758        }
759
760        // Extract cross-file links (for validation against other files)
761        for link in &ctx.links {
762            if link.is_reference {
763                continue;
764            }
765
766            // Skip links inside PyMdown blocks (MkDocs flavor)
767            if ctx.line_info(link.line).is_some_and(|info| info.in_pymdown_block) {
768                continue;
769            }
770
771            // Skip wiki-links - they use a different linking system and are not validated
772            // as relative file paths
773            if matches!(link.link_type, LinkType::WikiLink { .. }) {
774                continue;
775            }
776
777            let url = &link.url;
778
779            // Skip external URLs
780            if Self::is_external_url_fast(url) {
781                continue;
782            }
783
784            // Only process cross-file links with fragments
785            if Self::is_cross_file_link(url)
786                && let Some(fragment_pos) = url.find('#')
787            {
788                let path_part = &url[..fragment_pos];
789                let fragment = &url[fragment_pos + 1..];
790
791                // Skip empty fragments or template syntax
792                if fragment.is_empty() || fragment.contains("{{") || fragment.contains("{%") {
793                    continue;
794                }
795
796                file_index.add_cross_file_link(CrossFileLinkIndex {
797                    target_path: path_part.to_string(),
798                    fragment: fragment.to_string(),
799                    line: link.line,
800                    column: link.start_col + 1,
801                });
802            }
803        }
804    }
805
806    fn cross_file_check(
807        &self,
808        file_path: &Path,
809        file_index: &FileIndex,
810        workspace_index: &crate::workspace_index::WorkspaceIndex,
811    ) -> LintResult {
812        let mut warnings = Vec::new();
813
814        // Supported markdown file extensions (with leading dot, matching MD057)
815        const MARKDOWN_EXTENSIONS: &[&str] = &[
816            ".md",
817            ".markdown",
818            ".mdx",
819            ".mkd",
820            ".mkdn",
821            ".mdown",
822            ".mdwn",
823            ".qmd",
824            ".rmd",
825        ];
826
827        // Check each cross-file link in this file
828        for cross_link in &file_index.cross_file_links {
829            // Skip cross-file links without fragments - nothing to validate
830            if cross_link.fragment.is_empty() {
831                continue;
832            }
833
834            // Resolve the target file path relative to the current file
835            let base_target_path = if let Some(parent) = file_path.parent() {
836                parent.join(&cross_link.target_path)
837            } else {
838                Path::new(&cross_link.target_path).to_path_buf()
839            };
840
841            // Normalize the path (remove . and ..)
842            let base_target_path = normalize_path(&base_target_path);
843
844            // For extension-less paths, try resolving with markdown extensions
845            // This handles GitHub-style links like [link](page#section) -> page.md#section
846            let target_paths_to_try = Self::resolve_path_with_extensions(&base_target_path, MARKDOWN_EXTENSIONS);
847
848            // Try to find the target file in the workspace index
849            let mut target_file_index = None;
850
851            for target_path in &target_paths_to_try {
852                if let Some(index) = workspace_index.get_file(target_path) {
853                    target_file_index = Some(index);
854                    break;
855                }
856            }
857
858            if let Some(target_file_index) = target_file_index {
859                // Check if the fragment matches any heading in the target file (O(1) lookup)
860                if !target_file_index.has_anchor(&cross_link.fragment) {
861                    warnings.push(LintWarning {
862                        rule_name: Some(self.name().to_string()),
863                        line: cross_link.line,
864                        column: cross_link.column,
865                        end_line: cross_link.line,
866                        end_column: cross_link.column + cross_link.target_path.len() + 1 + cross_link.fragment.len(),
867                        message: format!(
868                            "Link fragment '{}' not found in '{}'",
869                            cross_link.fragment, cross_link.target_path
870                        ),
871                        severity: Severity::Error,
872                        fix: None,
873                    });
874                }
875            }
876            // If target file not in index, skip (could be external file or not in workspace)
877        }
878
879        Ok(warnings)
880    }
881
882    fn default_config_section(&self) -> Option<(String, toml::Value)> {
883        let value: toml::Value = toml::from_str(
884            r#"
885# Anchor generation style to match your target platform
886# Options: "github" (default), "kramdown-gfm", "kramdown"
887# Note: "jekyll" is accepted as an alias for "kramdown-gfm" (backward compatibility)
888anchor-style = "github"
889"#,
890        )
891        .ok()?;
892        Some(("MD051".to_string(), value))
893    }
894}
895
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::MarkdownFlavor;
    use crate::lint_context::LintContext;
    use crate::workspace_index::WorkspaceIndex;

    /// Lint `content` with `rule` under `flavor` and return the produced warnings.
    fn run_check(rule: &MD051LinkFragments, content: &str, flavor: MarkdownFlavor) -> Vec<LintWarning> {
        let ctx = LintContext::new(content, flavor, None);
        rule.check(&ctx).unwrap()
    }

    /// Build the `FileIndex` that a default rule contributes for `content` (Standard flavor).
    fn build_index(content: &str) -> FileIndex {
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
        let mut file_index = FileIndex::new();
        MD051LinkFragments::new().contribute_to_index(&ctx, &mut file_index);
        file_index
    }

    /// Workspace containing only `docs/install.md`, which has a single
    /// "Installation Guide" heading on line 1 with the given optional custom anchor.
    fn workspace_with_install_doc(custom_anchor: Option<&str>) -> WorkspaceIndex {
        let mut target_file_index = FileIndex::new();
        target_file_index.add_heading(HeadingIndex {
            text: "Installation Guide".to_string(),
            auto_anchor: "installation-guide".to_string(),
            custom_anchor: custom_anchor.map(String::from),
            line: 1,
        });

        let mut workspace_index = WorkspaceIndex::new();
        workspace_index.insert_file(PathBuf::from("docs/install.md"), target_file_index);
        workspace_index
    }

    /// Run the cross-file check for `docs/readme.md` holding exactly one link
    /// `<target_path>#<fragment>` at line 3, column 5.
    fn check_link_from_readme(workspace_index: &WorkspaceIndex, target_path: &str, fragment: &str) -> Vec<LintWarning> {
        let mut current_file_index = FileIndex::new();
        current_file_index.add_cross_file_link(CrossFileLinkIndex {
            target_path: target_path.to_string(),
            fragment: fragment.to_string(),
            line: 3,
            column: 5,
        });

        MD051LinkFragments::new()
            .cross_file_check(Path::new("docs/readme.md"), &current_file_index, workspace_index)
            .unwrap()
    }

    #[test]
    fn test_quarto_cross_references() {
        let rule = MD051LinkFragments::new();

        // Quarto cross-references must be skipped rather than treated as fragments.
        let content = r#"# Test Document

## Figures

See [@fig-plot] for the visualization.

More details in [@tbl-results] and [@sec-methods].

The equation [@eq-regression] shows the relationship.

Reference to [@lst-code] for implementation."#;
        let result = run_check(&rule, content, MarkdownFlavor::Quarto);
        assert!(
            result.is_empty(),
            "Quarto cross-references (@fig-, @tbl-, @sec-, @eq-) should not trigger MD051 warnings. Got {} warnings",
            result.len()
        );

        // An ordinary anchor that exists is still accepted under the Quarto flavor.
        let content_with_anchor = "# Test\n\nSee [link](#test) for details.";
        let result_anchor = run_check(&rule, content_with_anchor, MarkdownFlavor::Quarto);
        assert!(result_anchor.is_empty(), "Valid anchor should not trigger warning");

        // An ordinary anchor that does not exist is still reported.
        let content_invalid = "# Test\n\nSee [link](#nonexistent) for details.";
        let result_invalid = run_check(&rule, content_invalid, MarkdownFlavor::Quarto);
        assert_eq!(result_invalid.len(), 1, "Invalid anchor should still trigger warning");
    }

    // Cross-file validation tests
    #[test]
    fn test_cross_file_scope() {
        assert_eq!(MD051LinkFragments::new().cross_file_scope(), CrossFileScope::Workspace);
    }

    #[test]
    fn test_contribute_to_index_extracts_headings() {
        let file_index = build_index("# First Heading\n\n# Second { #custom }\n\n## Third");
        let headings = &file_index.headings;

        assert_eq!(headings.len(), 3);

        // Plain heading: auto anchor only.
        assert_eq!(headings[0].text, "First Heading");
        assert_eq!(headings[0].auto_anchor, "first-heading");
        assert_eq!(headings[0].custom_anchor, None);

        // Heading with `{ #custom }`: the attribute is removed from the text and kept as the custom anchor.
        assert_eq!(headings[1].text, "Second");
        assert_eq!(headings[1].custom_anchor.as_deref(), Some("custom"));

        assert_eq!(headings[2].text, "Third");
    }

    #[test]
    fn test_contribute_to_index_extracts_cross_file_links() {
        let file_index = build_index("See [docs](other.md#installation) and [more](../guide.md#getting-started)");

        // Compare (target_path, fragment) pairs in document order.
        let indexed: Vec<(&str, &str)> = file_index
            .cross_file_links
            .iter()
            .map(|link| (link.target_path.as_str(), link.fragment.as_str()))
            .collect();

        assert_eq!(
            indexed,
            [("other.md", "installation"), ("../guide.md", "getting-started")]
        );
    }

    #[test]
    fn test_cross_file_check_valid_fragment() {
        let workspace_index = workspace_with_install_doc(None);

        let warnings = check_link_from_readme(&workspace_index, "install.md", "installation-guide");

        // The fragment matches the heading's auto anchor, so nothing is reported.
        assert!(warnings.is_empty());
    }

    #[test]
    fn test_cross_file_check_invalid_fragment() {
        let workspace_index = workspace_with_install_doc(None);

        let warnings = check_link_from_readme(&workspace_index, "install.md", "nonexistent");

        // Exactly one warning, naming both the missing fragment and the target file.
        assert_eq!(warnings.len(), 1);
        let message = &warnings[0].message;
        assert!(message.contains("nonexistent"));
        assert!(message.contains("install.md"));
    }

    #[test]
    fn test_cross_file_check_custom_anchor_match() {
        let workspace_index = workspace_with_install_doc(Some("install"));

        let warnings = check_link_from_readme(&workspace_index, "install.md", "install");

        // The fragment matches the heading's custom anchor, so nothing is reported.
        assert!(warnings.is_empty());
    }

    #[test]
    fn test_cross_file_check_target_not_in_workspace() {
        // Nothing is indexed, so the link target cannot be found.
        let workspace_index = WorkspaceIndex::new();

        let warnings = check_link_from_readme(&workspace_index, "external.md", "heading");

        // Targets outside the workspace index are not reported.
        assert!(warnings.is_empty());
    }

    #[test]
    fn test_wikilinks_skipped_in_check() {
        // Wikilinks with fragments must not produce missing-fragment warnings.
        let content = r#"# Test Document

## Valid Heading

[[Microsoft#Windows OS]]
[[SomePage#section]]
[[page|Display Text]]
[[path/to/page#section]]
"#;
        let rule = MD051LinkFragments::new();
        let result = run_check(&rule, content, MarkdownFlavor::Standard);

        assert!(
            result.is_empty(),
            "Wikilinks should not trigger MD051 warnings. Got: {result:?}"
        );
    }

    #[test]
    fn test_wikilinks_not_added_to_cross_file_index() {
        // Only the regular markdown link may end up in the cross-file link index.
        let content = r#"# Test Document

[[Microsoft#Windows OS]]
[[SomePage#section]]
[Regular Link](other.md#section)
"#;
        let file_index = build_index(content);
        let cross_file_links = &file_index.cross_file_links;

        assert_eq!(
            cross_file_links.len(),
            1,
            "Only regular markdown links should be indexed, not wikilinks. Got: {cross_file_links:?}"
        );

        let only_link = &cross_file_links[0];
        assert_eq!(only_link.target_path, "other.md");
        assert_eq!(only_link.fragment, "section");
    }
}