rumdl_lib/rules/
md052_reference_links_images.rs

1use crate::rule::{LintError, LintResult, LintWarning, Rule, Severity};
2use crate::utils::mkdocs_patterns::is_mkdocs_auto_reference;
3use crate::utils::range_utils::calculate_match_range;
4use crate::utils::regex_cache::{HTML_COMMENT_PATTERN, SHORTCUT_REF_REGEX};
5use crate::utils::skip_context::{is_in_front_matter, is_in_math_context, is_in_table_cell};
6use lazy_static::lazy_static;
7use regex::Regex;
8use std::collections::{HashMap, HashSet};
9
10lazy_static! {
11    // Pattern to match reference definitions [ref]: url
12    // Note: \S* instead of \S+ to allow empty definitions like [ref]:
13    // The capturing group handles nested brackets to support cases like [`union[t, none]`]:
14    static ref REF_REGEX: Regex = Regex::new(r"^\s*\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]:\s*.*").unwrap();
15
16    // Pattern for list items to exclude from reference checks (standard regex is fine)
17    static ref LIST_ITEM_REGEX: Regex = Regex::new(r"^\s*[-*+]\s+(?:\[[xX\s]\]\s+)?").unwrap();
18
19    // Pattern for code blocks (standard regex is fine)
20    static ref FENCED_CODE_START: Regex = Regex::new(r"^(\s*)(`{3,}|~{3,})").unwrap();
21
22    // Pattern for output example sections (standard regex is fine)
23    static ref OUTPUT_EXAMPLE_START: Regex = Regex::new(r"^#+\s*(?:Output|Example|Output Style|Output Format)\s*$").unwrap();
24
25    // Pattern for GitHub alerts/callouts in blockquotes (e.g., > [!NOTE], > [!TIP], etc.)
26    // Extended to include additional common alert types
27    static ref GITHUB_ALERT_REGEX: Regex = Regex::new(r"^\s*>\s*\[!(NOTE|TIP|IMPORTANT|WARNING|CAUTION|INFO|SUCCESS|FAILURE|DANGER|BUG|EXAMPLE|QUOTE)\]").unwrap();
28
29    // Pattern to detect URLs that may contain brackets (IPv6, API endpoints, etc.)
30    // This pattern specifically looks for:
31    // - IPv6 addresses: https://[::1] or https://[2001:db8::1]
32    // - IPv6 with zone IDs: https://[fe80::1%eth0]
33    // - IPv6 mixed notation: https://[::ffff:192.0.2.1]
34    // - API paths with array notation: https://api.example.com/users[0]
35    // But NOT markdown reference links that happen to follow URLs
36    static ref URL_WITH_BRACKETS: Regex = Regex::new(
37        r"https?://(?:\[[0-9a-fA-F:.%]+\]|[^\s\[\]]+/[^\s]*\[\d+\])"
38    ).unwrap();
39}
40
/// Rule MD052: Reference links and images should use a reference that is defined
///
/// See [docs/md052.md](../../docs/md052.md) for full documentation, configuration, and examples.
///
/// This rule is triggered when a reference link or image uses a reference that isn't defined.
/// The rule itself holds no configuration or state.
#[derive(Debug, Clone, Default)]
pub struct MD052ReferenceLinkImages {}
48
49impl MD052ReferenceLinkImages {
50    pub fn new() -> Self {
51        Self {}
52    }
53
54    /// Check if a pattern is likely NOT a markdown reference
55    /// Returns true if this pattern should be skipped
56    fn is_likely_not_reference(text: &str) -> bool {
57        // Skip numeric patterns (array indices, ranges)
58        if text.chars().all(|c| c.is_ascii_digit()) {
59            return true;
60        }
61
62        // Skip numeric ranges like [1:3], [0:10], etc.
63        if text.contains(':') && text.chars().all(|c| c.is_ascii_digit() || c == ':') {
64            return true;
65        }
66
67        // Skip patterns that look like config sections [tool.something], [section.subsection]
68        // But not if they contain other non-alphanumeric chars like hyphens or underscores
69        if text.contains('.') && !text.contains(' ') && !text.contains('-') && !text.contains('_') {
70            // Config sections typically have dots, no spaces, and only alphanumeric + dots
71            return true;
72        }
73
74        // Skip glob/wildcard patterns like [*], [...], [**]
75        if text == "*" || text == "..." || text == "**" {
76            return true;
77        }
78
79        // Skip patterns that look like file paths [dir/file], [src/utils]
80        if text.contains('/') && !text.contains(' ') && !text.starts_with("http") {
81            return true;
82        }
83
84        // Skip programming type annotations like [int, str], [Dict[str, Any]]
85        // These typically have commas and/or nested brackets
86        if text.contains(',') || text.contains('[') || text.contains(']') {
87            // Check if it looks like a type annotation pattern
88            return true;
89        }
90
91        // Note: We don't filter out patterns with backticks because backticks in reference names
92        // are valid markdown syntax, e.g., [`dataclasses.InitVar`] is a valid reference name
93
94        // Skip patterns that look like module/class paths ONLY if they don't have backticks
95        // Backticks indicate intentional code formatting in a reference name
96        // e.g., skip [dataclasses.initvar] but allow [`typing.ClassVar`]
97        if !text.contains('`')
98            && text.contains('.')
99            && !text.contains(' ')
100            && !text.contains('-')
101            && !text.contains('_')
102        {
103            return true;
104        }
105
106        // Note: We don't filter based on word count anymore because legitimate references
107        // can have many words, like "python language reference for import statements"
108        // Word count filtering was causing false positives where valid references were
109        // being incorrectly flagged as unused
110
111        // Skip patterns that are just punctuation or operators
112        if text.chars().all(|c| !c.is_alphanumeric() && c != ' ') {
113            return true;
114        }
115
116        // Skip very short non-word patterns (likely operators or syntax)
117        if text.len() <= 2 && !text.chars().all(|c| c.is_alphabetic()) {
118            return true;
119        }
120
121        // Skip quoted patterns like ["E501"], ["ALL"], ["E", "F"]
122        if (text.starts_with('"') && text.ends_with('"'))
123            || (text.starts_with('\'') && text.ends_with('\''))
124            || text.contains('"')
125            || text.contains('\'')
126        {
127            return true;
128        }
129
130        // Skip descriptive patterns with colon like [default: the project root]
131        // But allow simple numeric ranges which are handled above
132        if text.contains(':') && text.contains(' ') {
133            return true;
134        }
135
136        // Skip alert/admonition patterns like [!WARN], [!NOTE], etc.
137        if text.starts_with('!') {
138            return true;
139        }
140
141        // Skip single uppercase letters (likely type parameters) like [T], [U], [K], [V]
142        if text.len() == 1 && text.chars().all(|c| c.is_ascii_uppercase()) {
143            return true;
144        }
145
146        // Skip common programming type names and short identifiers
147        // that are likely not markdown references
148        let common_non_refs = [
149            "object", "Object", "any", "Any", "inv", "void", "bool", "int", "float", "str", "char", "i8", "i16", "i32",
150            "i64", "i128", "isize", "u8", "u16", "u32", "u64", "u128", "usize", "f32", "f64",
151        ];
152
153        if common_non_refs.contains(&text) {
154            return true;
155        }
156
157        false
158    }
159
160    /// Check if a position is inside any code span
161    fn is_in_code_span(line: usize, col: usize, code_spans: &[crate::lint_context::CodeSpan]) -> bool {
162        code_spans
163            .iter()
164            .any(|span| span.line == line && col >= span.start_col && col < span.end_col)
165    }
166
167    /// Check if a byte position is within an HTML comment
168    fn is_in_html_comment(content: &str, byte_pos: usize) -> bool {
169        for m in HTML_COMMENT_PATTERN.find_iter(content) {
170            if m.start() <= byte_pos && byte_pos < m.end() {
171                return true;
172            }
173        }
174        false
175    }
176
177    /// Check if a byte position is within an HTML tag
178    fn is_in_html_tag(ctx: &crate::lint_context::LintContext, byte_pos: usize) -> bool {
179        // Check HTML tags
180        for html_tag in ctx.html_tags().iter() {
181            if html_tag.byte_offset <= byte_pos && byte_pos < html_tag.byte_end {
182                return true;
183            }
184        }
185        false
186    }
187
188    fn extract_references(&self, content: &str, mkdocs_mode: bool) -> HashSet<String> {
189        use crate::config::MarkdownFlavor;
190        use crate::utils::skip_context::is_mkdocs_snippet_line;
191
192        let mut references = HashSet::new();
193        let mut in_code_block = false;
194        let mut code_fence_marker = String::new();
195
196        for line in content.lines() {
197            // Skip lines that look like MkDocs snippet markers (only in MkDocs mode)
198            if is_mkdocs_snippet_line(
199                line,
200                if mkdocs_mode {
201                    MarkdownFlavor::MkDocs
202                } else {
203                    MarkdownFlavor::Standard
204                },
205            ) {
206                continue;
207            }
208            // Handle code block boundaries
209            if let Some(cap) = FENCED_CODE_START.captures(line) {
210                if let Some(fence) = cap.get(2) {
211                    // Get the fence marker (``` or ~~~) without the indentation
212                    let fence_str = fence.as_str();
213                    if !in_code_block {
214                        in_code_block = true;
215                        code_fence_marker = fence_str.to_string();
216                    } else if line.trim_start().starts_with(&code_fence_marker) {
217                        // Check if this could be a closing fence
218                        let trimmed = line.trim_start();
219                        // A closing fence should be just the fence characters, possibly with trailing whitespace
220                        if trimmed.starts_with(&code_fence_marker) {
221                            let after_fence = &trimmed[code_fence_marker.len()..];
222                            if after_fence.trim().is_empty() {
223                                in_code_block = false;
224                                code_fence_marker.clear();
225                            }
226                        }
227                    }
228                }
229                continue;
230            }
231
232            // Skip lines in code blocks
233            if in_code_block {
234                continue;
235            }
236
237            // Check for abbreviation syntax (*[ABBR]: Definition) and skip it
238            // Abbreviations are not reference links and should not be tracked
239            if line.trim_start().starts_with("*[") {
240                continue;
241            }
242
243            if let Some(cap) = REF_REGEX.captures(line) {
244                // Store references in lowercase for case-insensitive comparison
245                if let Some(reference) = cap.get(1) {
246                    references.insert(reference.as_str().to_lowercase());
247                }
248            }
249        }
250
251        references
252    }
253
254    fn find_undefined_references(
255        &self,
256        content: &str,
257        references: &HashSet<String>,
258        ctx: &crate::lint_context::LintContext,
259        mkdocs_mode: bool,
260    ) -> Vec<(usize, usize, usize, String)> {
261        let mut undefined = Vec::new();
262        let mut reported_refs = HashMap::new();
263        let mut in_code_block = false;
264        let mut code_fence_marker = String::new();
265        let mut in_example_section = false;
266
267        // Get code spans once for the entire function
268        let code_spans = ctx.code_spans();
269
270        // Use cached data for reference links and images
271        for link in &ctx.links {
272            if !link.is_reference {
273                continue; // Skip inline links
274            }
275
276            // Skip links inside code spans
277            if Self::is_in_code_span(link.line, link.start_col, &code_spans) {
278                continue;
279            }
280
281            // Skip links inside HTML comments
282            if Self::is_in_html_comment(content, link.byte_offset) {
283                continue;
284            }
285
286            // Skip links inside HTML tags
287            if Self::is_in_html_tag(ctx, link.byte_offset) {
288                continue;
289            }
290
291            // Skip links inside math contexts
292            if is_in_math_context(ctx, link.byte_offset) {
293                continue;
294            }
295
296            // Skip links inside table cells
297            if is_in_table_cell(ctx, link.line, link.start_col) {
298                continue;
299            }
300
301            // Skip links inside frontmatter (convert from 1-based to 0-based line numbers)
302            if is_in_front_matter(content, link.line.saturating_sub(1)) {
303                continue;
304            }
305
306            if let Some(ref_id) = &link.reference_id {
307                let reference_lower = ref_id.to_lowercase();
308
309                // Skip MkDocs auto-references if in MkDocs mode
310                // Check both the reference_id and the link text for shorthand references
311                if mkdocs_mode && (is_mkdocs_auto_reference(ref_id) || is_mkdocs_auto_reference(&link.text)) {
312                    continue;
313                }
314
315                // Check if reference is defined
316                if !references.contains(&reference_lower) && !reported_refs.contains_key(&reference_lower) {
317                    // Check if the line is in an example section or list item
318                    if let Some(line_info) = ctx.line_info(link.line) {
319                        if OUTPUT_EXAMPLE_START.is_match(&line_info.content) {
320                            in_example_section = true;
321                            continue;
322                        }
323
324                        if in_example_section {
325                            continue;
326                        }
327
328                        // Skip list items
329                        if LIST_ITEM_REGEX.is_match(&line_info.content) {
330                            continue;
331                        }
332
333                        // Skip lines that are HTML content
334                        let trimmed = line_info.content.trim_start();
335                        if trimmed.starts_with('<') {
336                            continue;
337                        }
338                    }
339
340                    let match_len = link.byte_end - link.byte_offset;
341                    undefined.push((link.line - 1, link.start_col, match_len, ref_id.clone()));
342                    reported_refs.insert(reference_lower, true);
343                }
344            }
345        }
346
347        // Use cached data for reference images
348        for image in &ctx.images {
349            if !image.is_reference {
350                continue; // Skip inline images
351            }
352
353            // Skip images inside code spans
354            if Self::is_in_code_span(image.line, image.start_col, &code_spans) {
355                continue;
356            }
357
358            // Skip images inside HTML comments
359            if Self::is_in_html_comment(content, image.byte_offset) {
360                continue;
361            }
362
363            // Skip images inside HTML tags
364            if Self::is_in_html_tag(ctx, image.byte_offset) {
365                continue;
366            }
367
368            // Skip images inside math contexts
369            if is_in_math_context(ctx, image.byte_offset) {
370                continue;
371            }
372
373            // Skip images inside table cells
374            if is_in_table_cell(ctx, image.line, image.start_col) {
375                continue;
376            }
377
378            // Skip images inside frontmatter (convert from 1-based to 0-based line numbers)
379            if is_in_front_matter(content, image.line.saturating_sub(1)) {
380                continue;
381            }
382
383            if let Some(ref_id) = &image.reference_id {
384                let reference_lower = ref_id.to_lowercase();
385
386                // Skip MkDocs auto-references if in MkDocs mode
387                // Check both the reference_id and the alt text for shorthand references
388                if mkdocs_mode && (is_mkdocs_auto_reference(ref_id) || is_mkdocs_auto_reference(&image.alt_text)) {
389                    continue;
390                }
391
392                // Check if reference is defined
393                if !references.contains(&reference_lower) && !reported_refs.contains_key(&reference_lower) {
394                    // Check if the line is in an example section or list item
395                    if let Some(line_info) = ctx.line_info(image.line) {
396                        if OUTPUT_EXAMPLE_START.is_match(&line_info.content) {
397                            in_example_section = true;
398                            continue;
399                        }
400
401                        if in_example_section {
402                            continue;
403                        }
404
405                        // Skip list items
406                        if LIST_ITEM_REGEX.is_match(&line_info.content) {
407                            continue;
408                        }
409
410                        // Skip lines that are HTML content
411                        let trimmed = line_info.content.trim_start();
412                        if trimmed.starts_with('<') {
413                            continue;
414                        }
415                    }
416
417                    let match_len = image.byte_end - image.byte_offset;
418                    undefined.push((image.line - 1, image.start_col, match_len, ref_id.clone()));
419                    reported_refs.insert(reference_lower, true);
420                }
421            }
422        }
423
424        // Build a set of byte ranges that are already covered by parsed links/images
425        let mut covered_ranges: Vec<(usize, usize)> = Vec::new();
426
427        // Add ranges from parsed links
428        for link in &ctx.links {
429            covered_ranges.push((link.byte_offset, link.byte_end));
430        }
431
432        // Add ranges from parsed images
433        for image in &ctx.images {
434            covered_ranges.push((image.byte_offset, image.byte_end));
435        }
436
437        // Sort ranges by start position
438        covered_ranges.sort_by_key(|&(start, _)| start);
439
440        // Handle shortcut references [text] which aren't captured in ctx.links
441        // Need to use regex for these
442        let lines: Vec<&str> = content.lines().collect();
443        in_example_section = false; // Reset for line-by-line processing
444
445        for (line_num, line) in lines.iter().enumerate() {
446            // Skip lines in frontmatter (line_num is already 0-based)
447            if is_in_front_matter(content, line_num) {
448                continue;
449            }
450
451            // Handle code blocks
452            if let Some(cap) = FENCED_CODE_START.captures(line) {
453                if let Some(fence) = cap.get(2) {
454                    // Get the fence marker (``` or ~~~) without the indentation
455                    let fence_str = fence.as_str();
456                    if !in_code_block {
457                        in_code_block = true;
458                        code_fence_marker = fence_str.to_string();
459                    } else if line.trim_start().starts_with(&code_fence_marker) {
460                        // Check if this could be a closing fence
461                        let trimmed = line.trim_start();
462                        // A closing fence should be just the fence characters, possibly with trailing whitespace
463                        if trimmed.starts_with(&code_fence_marker) {
464                            let after_fence = &trimmed[code_fence_marker.len()..];
465                            if after_fence.trim().is_empty() {
466                                in_code_block = false;
467                                code_fence_marker.clear();
468                            }
469                        }
470                    }
471                }
472                continue;
473            }
474
475            if in_code_block {
476                continue;
477            }
478
479            // Check for example sections
480            if OUTPUT_EXAMPLE_START.is_match(line) {
481                in_example_section = true;
482                continue;
483            }
484
485            if in_example_section {
486                // Check if we're exiting the example section (another heading)
487                if line.starts_with('#') && !OUTPUT_EXAMPLE_START.is_match(line) {
488                    in_example_section = false;
489                } else {
490                    continue;
491                }
492            }
493
494            // Skip list items
495            if LIST_ITEM_REGEX.is_match(line) {
496                continue;
497            }
498
499            // Skip lines that are HTML content
500            let trimmed_line = line.trim_start();
501            if trimmed_line.starts_with('<') {
502                continue;
503            }
504
505            // Skip GitHub alerts/callouts (e.g., > [!TIP])
506            if GITHUB_ALERT_REGEX.is_match(line) {
507                continue;
508            }
509
510            // Skip abbreviation definitions (*[ABBR]: Definition)
511            // These are not reference links and should not be checked
512            if trimmed_line.starts_with("*[") {
513                continue;
514            }
515
516            // Collect positions of brackets that are part of URLs (IPv6, etc.)
517            // so we can exclude them from reference checking
518            let mut url_bracket_ranges: Vec<(usize, usize)> = Vec::new();
519            for mat in URL_WITH_BRACKETS.find_iter(line) {
520                // Find all bracket pairs within this URL match
521                let url_str = mat.as_str();
522                let url_start = mat.start();
523
524                // Find brackets within the URL (e.g., in https://[::1]:8080)
525                let mut idx = 0;
526                while idx < url_str.len() {
527                    if let Some(bracket_start) = url_str[idx..].find('[') {
528                        let bracket_start_abs = url_start + idx + bracket_start;
529                        if let Some(bracket_end) = url_str[idx + bracket_start + 1..].find(']') {
530                            let bracket_end_abs = url_start + idx + bracket_start + 1 + bracket_end + 1;
531                            url_bracket_ranges.push((bracket_start_abs, bracket_end_abs));
532                            idx += bracket_start + bracket_end + 2;
533                        } else {
534                            break;
535                        }
536                    } else {
537                        break;
538                    }
539                }
540            }
541
542            // Check shortcut references: [reference]
543            if let Ok(captures) = SHORTCUT_REF_REGEX.captures_iter(line).collect::<Result<Vec<_>, _>>() {
544                for cap in captures {
545                    if let Some(ref_match) = cap.get(1) {
546                        // Check if this bracket is part of a URL (IPv6, etc.)
547                        let bracket_start = cap.get(0).unwrap().start();
548                        let bracket_end = cap.get(0).unwrap().end();
549
550                        // Skip if this bracket pair is within any URL bracket range
551                        let is_in_url = url_bracket_ranges
552                            .iter()
553                            .any(|&(url_start, url_end)| bracket_start >= url_start && bracket_end <= url_end);
554
555                        if is_in_url {
556                            continue;
557                        }
558
559                        let reference = ref_match.as_str();
560                        let reference_lower = reference.to_lowercase();
561
562                        // Skip patterns that are likely not markdown references
563                        if Self::is_likely_not_reference(reference) {
564                            continue;
565                        }
566
567                        // Skip GitHub alerts (including extended types)
568                        if let Some(alert_type) = reference.strip_prefix('!')
569                            && matches!(
570                                alert_type,
571                                "NOTE"
572                                    | "TIP"
573                                    | "WARNING"
574                                    | "IMPORTANT"
575                                    | "CAUTION"
576                                    | "INFO"
577                                    | "SUCCESS"
578                                    | "FAILURE"
579                                    | "DANGER"
580                                    | "BUG"
581                                    | "EXAMPLE"
582                                    | "QUOTE"
583                            )
584                        {
585                            continue;
586                        }
587
588                        // Skip MkDocs snippet section markers like [start:section] or [end:section]
589                        // when they appear as part of snippet syntax (e.g., # -8<- [start:section])
590                        if mkdocs_mode
591                            && (reference.starts_with("start:") || reference.starts_with("end:"))
592                            && (crate::utils::mkdocs_snippets::is_snippet_section_start(line)
593                                || crate::utils::mkdocs_snippets::is_snippet_section_end(line))
594                        {
595                            continue;
596                        }
597
598                        // Skip MkDocs auto-references if in MkDocs mode
599                        if mkdocs_mode && is_mkdocs_auto_reference(reference) {
600                            continue;
601                        }
602
603                        if !references.contains(&reference_lower) && !reported_refs.contains_key(&reference_lower) {
604                            let full_match = cap.get(0).unwrap();
605                            let col = full_match.start();
606
607                            // Skip if inside code span
608                            let code_spans = ctx.code_spans();
609                            if Self::is_in_code_span(line_num + 1, col, &code_spans) {
610                                continue;
611                            }
612
613                            // Check if this position is within a covered range
614                            let line_start_byte = ctx.line_offsets[line_num];
615                            let byte_pos = line_start_byte + col;
616
617                            // Skip if inside HTML comment
618                            if Self::is_in_html_comment(content, byte_pos) {
619                                continue;
620                            }
621
622                            // Skip if inside HTML tag
623                            if Self::is_in_html_tag(ctx, byte_pos) {
624                                continue;
625                            }
626
627                            // Skip if inside math context
628                            if is_in_math_context(ctx, byte_pos) {
629                                continue;
630                            }
631
632                            // Skip if inside table cell
633                            if is_in_table_cell(ctx, line_num + 1, col) {
634                                continue;
635                            }
636
637                            let byte_end = byte_pos + (full_match.end() - full_match.start());
638
639                            // Check if this shortcut ref overlaps with any parsed link/image
640                            let mut is_covered = false;
641                            for &(range_start, range_end) in &covered_ranges {
642                                if range_start <= byte_pos && byte_end <= range_end {
643                                    // This shortcut ref is completely within a parsed link/image
644                                    is_covered = true;
645                                    break;
646                                }
647                                if range_start > byte_end {
648                                    // No need to check further (ranges are sorted)
649                                    break;
650                                }
651                            }
652
653                            if is_covered {
654                                continue;
655                            }
656
657                            // More sophisticated checks to avoid false positives
658
659                            // Check 1: If preceded by ], this might be part of [text][ref]
660                            // Look for the pattern ...][ref] and check if there's a matching [ before
661                            let line_chars: Vec<char> = line.chars().collect();
662                            if col > 0 && col <= line_chars.len() && line_chars.get(col - 1) == Some(&']') {
663                                // Look backwards for a [ that would make this [text][ref]
664                                let mut bracket_count = 1; // We already saw one ]
665                                let mut check_pos = col.saturating_sub(2);
666                                let mut found_opening = false;
667
668                                while check_pos > 0 && check_pos < line_chars.len() {
669                                    match line_chars.get(check_pos) {
670                                        Some(&']') => bracket_count += 1,
671                                        Some(&'[') => {
672                                            bracket_count -= 1;
673                                            if bracket_count == 0 {
674                                                // Check if this [ is escaped
675                                                if check_pos == 0 || line_chars.get(check_pos - 1) != Some(&'\\') {
676                                                    found_opening = true;
677                                                }
678                                                break;
679                                            }
680                                        }
681                                        _ => {}
682                                    }
683                                    if check_pos == 0 {
684                                        break;
685                                    }
686                                    check_pos = check_pos.saturating_sub(1);
687                                }
688
689                                if found_opening {
690                                    // This is part of [text][ref], skip it
691                                    continue;
692                                }
693                            }
694
695                            // Check 2: If there's an escaped bracket pattern before this
696                            // e.g., \[text\][ref], the [ref] shouldn't be treated as a shortcut
697                            let before_text = &line[..col];
698                            if before_text.contains("\\]") {
699                                // Check if there's a \[ before the \]
700                                if let Some(escaped_close_pos) = before_text.rfind("\\]") {
701                                    let search_text = &before_text[..escaped_close_pos];
702                                    if search_text.contains("\\[") {
703                                        // This looks like \[...\][ref], skip it
704                                        continue;
705                                    }
706                                }
707                            }
708
709                            let match_len = full_match.end() - full_match.start();
710                            undefined.push((line_num, col, match_len, reference.to_string()));
711                            reported_refs.insert(reference_lower, true);
712                        }
713                    }
714                }
715            }
716        }
717
718        undefined
719    }
720}
721
722impl Rule for MD052ReferenceLinkImages {
723    fn name(&self) -> &'static str {
724        "MD052"
725    }
726
727    fn description(&self) -> &'static str {
728        "Reference links and images should use a reference that exists"
729    }
730
731    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
732        let content = ctx.content;
733        let mut warnings = Vec::new();
734
735        // Check if we're in MkDocs mode from the context
736        let mkdocs_mode = ctx.flavor == crate::config::MarkdownFlavor::MkDocs;
737
738        let references = self.extract_references(content, mkdocs_mode);
739
740        // Use optimized detection method with cached link/image data
741        for (line_num, col, match_len, reference) in
742            self.find_undefined_references(content, &references, ctx, mkdocs_mode)
743        {
744            let lines: Vec<&str> = content.lines().collect();
745            let line_content = lines.get(line_num).unwrap_or(&"");
746
747            // Calculate precise character range for the entire undefined reference
748            let (start_line, start_col, end_line, end_col) =
749                calculate_match_range(line_num + 1, line_content, col, match_len);
750
751            warnings.push(LintWarning {
752                rule_name: Some(self.name()),
753                line: start_line,
754                column: start_col,
755                end_line,
756                end_column: end_col,
757                message: format!("Reference '{reference}' not found"),
758                severity: Severity::Warning,
759                fix: None,
760            });
761        }
762
763        Ok(warnings)
764    }
765
766    /// Check if this rule should be skipped for performance
767    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
768        // Skip if content is empty or has no reference-style links/images
769        ctx.content.is_empty() || (!ctx.content.contains("](") && !ctx.content.contains("]["))
770    }
771
772    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
773        let content = ctx.content;
774        // No automatic fix available for undefined references
775        Ok(content.to_string())
776    }
777
778    fn as_any(&self) -> &dyn std::any::Any {
779        self
780    }
781
782    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
783    where
784        Self: Sized,
785    {
786        // Flavor is now accessed from LintContext during check
787        Box::new(MD052ReferenceLinkImages::new())
788    }
789}
790
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lint_context::LintContext;

    /// Runs a fresh MD052 rule over `content` using the given flavor and
    /// returns the warnings, panicking if the check itself fails.
    fn lint_as(content: &str, flavor: crate::config::MarkdownFlavor) -> Vec<crate::rule::LintWarning> {
        let ctx = LintContext::new(content, flavor);
        MD052ReferenceLinkImages::new().check(&ctx).unwrap()
    }

    /// Shorthand for `lint_as` with the standard Markdown flavor.
    fn lint(content: &str) -> Vec<crate::rule::LintWarning> {
        lint_as(content, crate::config::MarkdownFlavor::Standard)
    }

    #[test]
    fn test_valid_reference_link() {
        let warnings = lint("[text][ref]\n\n[ref]: https://example.com");

        assert!(warnings.is_empty());
    }

    #[test]
    fn test_undefined_reference_link() {
        let warnings = lint("[text][undefined]");

        assert_eq!(warnings.len(), 1);
        assert!(warnings[0].message.contains("Reference 'undefined' not found"));
    }

    #[test]
    fn test_valid_reference_image() {
        let warnings = lint("![alt][img]\n\n[img]: image.jpg");

        assert!(warnings.is_empty());
    }

    #[test]
    fn test_undefined_reference_image() {
        let warnings = lint("![alt][missing]");

        assert_eq!(warnings.len(), 1);
        assert!(warnings[0].message.contains("Reference 'missing' not found"));
    }

    #[test]
    fn test_case_insensitive_references() {
        let warnings = lint("[Text][REF]\n\n[ref]: https://example.com");

        assert!(warnings.is_empty());
    }

    #[test]
    fn test_shortcut_reference_valid() {
        let warnings = lint("[ref]\n\n[ref]: https://example.com");

        assert!(warnings.is_empty());
    }

    #[test]
    fn test_shortcut_reference_undefined() {
        let warnings = lint("[undefined]");

        assert_eq!(warnings.len(), 1);
        assert!(warnings[0].message.contains("Reference 'undefined' not found"));
    }

    #[test]
    fn test_inline_links_ignored() {
        let warnings = lint("[text](https://example.com)");

        assert!(warnings.is_empty());
    }

    #[test]
    fn test_inline_images_ignored() {
        let warnings = lint("![alt](image.jpg)");

        assert!(warnings.is_empty());
    }

    #[test]
    fn test_references_in_code_blocks_ignored() {
        let warnings = lint("```\n[undefined]\n```\n\n[ref]: https://example.com");

        assert!(warnings.is_empty());
    }

    #[test]
    fn test_references_in_inline_code_ignored() {
        let warnings = lint("`[undefined]`");

        // References inside inline code spans should be ignored
        assert!(warnings.is_empty());
    }

    #[test]
    fn test_comprehensive_inline_code_detection() {
        let document = r#"# Test

This `[inside]` should be ignored.
This [outside] should be flagged.
Reference links `[text][ref]` in code are ignored.
Regular reference [text][missing] should be flagged.
Images `![alt][img]` in code are ignored.
Regular image ![alt][badimg] should be flagged.

Multiple `[one]` and `[two]` in code ignored, but [three] is not.

```
[code block content] should be ignored
```

`Multiple [refs] in [same] code span` ignored."#;

        let warnings = lint(document);

        // Should only flag: outside, missing, badimg, three (4 total)
        assert_eq!(warnings.len(), 4);

        let messages: Vec<&str> = warnings.iter().map(|w| w.message.as_str()).collect();

        for expected in &["outside", "missing", "badimg", "three"] {
            assert!(messages.iter().any(|m| m.contains(*expected)));
        }

        // Should NOT flag any references inside code spans
        for unexpected in &["inside", "one", "two", "refs", "same"] {
            assert!(!messages.iter().any(|m| m.contains(*unexpected)));
        }
    }

    #[test]
    fn test_multiple_undefined_references() {
        let warnings = lint("[link1][ref1] [link2][ref2] [link3][ref3]");

        assert_eq!(warnings.len(), 3);
        assert!(warnings[0].message.contains("ref1"));
        assert!(warnings[1].message.contains("ref2"));
        assert!(warnings[2].message.contains("ref3"));
    }

    #[test]
    fn test_mixed_valid_and_undefined() {
        let warnings = lint("[valid][ref] [invalid][missing]\n\n[ref]: https://example.com");

        assert_eq!(warnings.len(), 1);
        assert!(warnings[0].message.contains("missing"));
    }

    #[test]
    fn test_empty_reference() {
        let warnings = lint("[text][]\n\n[ref]: https://example.com");

        // Empty reference should use the link text as reference
        assert_eq!(warnings.len(), 1);
    }

    #[test]
    fn test_escaped_brackets_ignored() {
        let warnings = lint("\\[not a link\\]");

        assert!(warnings.is_empty());
    }

    #[test]
    fn test_list_items_ignored() {
        let warnings = lint("- [undefined]\n* [another]\n+ [third]");

        // List items that look like shortcut references should be ignored
        assert!(warnings.is_empty());
    }

    #[test]
    fn test_output_example_section_ignored() {
        let warnings = lint("## Output\n\n[undefined]\n\n## Normal Section\n\n[missing]");

        // Only the reference outside the Output section should be flagged
        assert_eq!(warnings.len(), 1);
        assert!(warnings[0].message.contains("missing"));
    }

    #[test]
    fn test_reference_definitions_in_code_blocks_ignored() {
        let warnings = lint("[link][ref]\n\n```\n[ref]: https://example.com\n```");

        // Reference defined in code block should not count
        assert_eq!(warnings.len(), 1);
        assert!(warnings[0].message.contains("ref"));
    }

    #[test]
    fn test_multiple_references_to_same_undefined() {
        let warnings = lint("[first][missing] [second][missing] [third][missing]");

        // Should only report once per unique reference
        assert_eq!(warnings.len(), 1);
        assert!(warnings[0].message.contains("missing"));
    }

    #[test]
    fn test_reference_with_special_characters() {
        let warnings = lint("[text][ref-with-hyphens]\n\n[ref-with-hyphens]: https://example.com");

        assert!(warnings.is_empty());
    }

    #[test]
    fn test_issue_51_html_attribute_not_reference() {
        // Test for issue #51 - HTML attributes with square brackets shouldn't be treated as references
        let document = r#"# Example

## Test

Want to fill out this form?

<form method="post">
    <input type="email" name="fields[email]" id="drip-email" placeholder="email@domain.com">
</form>"#;

        let warnings = lint(document);

        assert!(
            warnings.is_empty(),
            "HTML attributes with square brackets should not be flagged as undefined references"
        );
    }

    #[test]
    fn test_extract_references() {
        let rule = MD052ReferenceLinkImages::new();
        let definitions = "[ref1]: url1\n[Ref2]: url2\n[REF3]: url3";
        let found = rule.extract_references(definitions, false);

        assert_eq!(found.len(), 3);
        assert!(found.contains("ref1"));
        assert!(found.contains("ref2"));
        assert!(found.contains("ref3"));
    }

    #[test]
    fn test_inline_code_not_flagged() {
        // Test that arrays in inline code are not flagged as references
        let document = r#"# Test

Configure with `["JavaScript", "GitHub", "Node.js"]` in your settings.

Also, `[todo]` is not a reference link.

But this [reference] should be flagged.

And this `[inline code]` should not be flagged.
"#;

        let warnings = lint(document);

        // Should only flag [reference], not the ones in backticks
        assert_eq!(warnings.len(), 1, "Should only flag one undefined reference");
        assert!(warnings[0].message.contains("'reference'"));
    }

    #[test]
    fn test_code_block_references_ignored() {
        let document = r#"# Test

```markdown
[undefined] reference in code block
![undefined] image in code block
```

[real-undefined] reference outside
"#;

        let warnings = lint(document);

        // Should only flag [real-undefined], not the ones in code block
        assert_eq!(warnings.len(), 1);
        assert!(warnings[0].message.contains("'real-undefined'"));
    }

    #[test]
    fn test_html_comments_ignored() {
        // Test for issue #20 - MD052 should not flag content inside HTML comments

        // Test the exact case from issue #20
        let issue_case = r#"<!--- write fake_editor.py 'import sys\nopen(*sys.argv[1:], mode="wt").write("2 3 4 4 2 3 2")' -->
<!--- set_env EDITOR 'python3 fake_editor.py' -->

```bash
$ python3 vote.py
3 votes for: 2
2 votes for: 3, 4
```"#;
        let warnings = lint(issue_case);
        assert!(warnings.is_empty(), "Should not flag [1:] inside HTML comments");

        // Test various reference patterns inside HTML comments
        let single_line_comments = r#"<!-- This is [ref1] and [ref2][ref3] -->
Normal [text][undefined]
<!-- Another [comment][with] references -->"#;
        let warnings = lint(single_line_comments);
        assert_eq!(
            warnings.len(),
            1,
            "Should only flag the undefined reference outside comments"
        );
        assert!(warnings[0].message.contains("undefined"));

        // Test multi-line HTML comments
        let multi_line_comment = r#"<!--
[ref1]
[ref2][ref3]
-->
[actual][undefined]"#;
        let warnings = lint(multi_line_comment);
        assert_eq!(
            warnings.len(),
            1,
            "Should not flag references in multi-line HTML comments"
        );
        assert!(warnings[0].message.contains("undefined"));

        // Test mixed scenarios
        let mixed = r#"<!-- Comment with [1:] pattern -->
Valid [link][ref]
<!-- More [refs][in][comments] -->
![image][missing]

[ref]: https://example.com"#;
        let warnings = lint(mixed);
        assert_eq!(warnings.len(), 1, "Should only flag missing image reference");
        assert!(warnings[0].message.contains("missing"));
    }

    #[test]
    fn test_frontmatter_ignored() {
        // Test for issue #24 - MD052 should not flag content inside frontmatter

        // Test YAML frontmatter with arrays and references
        let yaml_document = r#"---
layout: post
title: "My Jekyll Post"
date: 2023-01-01
categories: blog
tags: ["test", "example"]
author: John Doe
---

# My Blog Post

This is the actual markdown content that should be linted.

[undefined] reference should be flagged.

## Section 1

Some content here."#;
        let warnings = lint(yaml_document);

        // Should only flag [undefined] in the content, not the ["test", "example"] array in frontmatter
        assert_eq!(
            warnings.len(),
            1,
            "Should only flag the undefined reference outside frontmatter"
        );
        assert!(warnings[0].message.contains("undefined"));

        // Test TOML frontmatter
        let toml_document = r#"+++
title = "My Post"
tags = ["example", "test"]
+++

# Content

[missing] reference should be flagged."#;
        let warnings = lint(toml_document);
        assert_eq!(
            warnings.len(),
            1,
            "Should only flag the undefined reference outside TOML frontmatter"
        );
        assert!(warnings[0].message.contains("missing"));
    }

    #[test]
    fn test_mkdocs_snippet_markers_not_flagged() {
        // Test for issue #68 - MkDocs snippet selection markers should not be flagged as undefined references

        // Test snippet section markers
        let markers_only = r#"# Document with MkDocs Snippets

Some content here.

# -8<- [start:remote-content]

This is the remote content section.

# -8<- [end:remote-content]

More content here.

<!-- --8<-- [start:another-section] -->
Content in another section
<!-- --8<-- [end:another-section] -->"#;
        let warnings = lint_as(markers_only, crate::config::MarkdownFlavor::MkDocs);

        // Should not flag any snippet markers as undefined references
        assert!(
            warnings.is_empty(),
            "Should not flag MkDocs snippet markers as undefined references"
        );

        // Test that the snippet marker lines are properly skipped
        // but regular undefined references on other lines are still caught
        let markers_with_refs = r#"# Document

# -8<- [start:section]
Content with [reference] inside snippet section
# -8<- [end:section]

Regular [undefined] reference outside snippet markers."#;
        let warnings = lint_as(markers_with_refs, crate::config::MarkdownFlavor::MkDocs);

        assert_eq!(
            warnings.len(),
            2,
            "Should flag undefined references but skip snippet marker lines"
        );
        // The references inside the content should be flagged, but not start: and end:
        assert!(warnings[0].message.contains("reference"));
        assert!(warnings[1].message.contains("undefined"));

        // Test in standard mode - should flag the markers as undefined
        let standard_markers = r#"# Document

# -8<- [start:section]
# -8<- [end:section]"#;
        let warnings = lint(standard_markers);

        assert_eq!(
            warnings.len(),
            2,
            "In standard mode, snippet markers should be flagged as undefined references"
        );
    }

    #[test]
    fn test_github_alerts_not_flagged() {
        // Test for issue #60 - GitHub alerts should not be flagged as undefined references

        // Test various GitHub alert types
        let alert_types = r#"# Document with GitHub Alerts

> [!NOTE]
> This is a note alert.

> [!TIP]
> This is a tip alert.

> [!IMPORTANT]
> This is an important alert.

> [!WARNING]
> This is a warning alert.

> [!CAUTION]
> This is a caution alert.

Regular content with [undefined] reference."#;
        let warnings = lint(alert_types);

        // Should only flag the undefined reference, not the GitHub alerts
        assert_eq!(
            warnings.len(),
            1,
            "Should only flag the undefined reference, not GitHub alerts"
        );
        assert!(warnings[0].message.contains("undefined"));
        assert_eq!(warnings[0].line, 18); // Line with [undefined]

        // Test GitHub alerts with additional content
        let alert_with_reference = r#"> [!TIP]
> Here's a useful tip about [something].
> Multiple lines are allowed.

[something] is mentioned but not defined."#;
        let warnings = lint(alert_with_reference);

        // Should flag only the [something] outside blockquotes
        // The test shows we're only catching one, which might be correct behavior
        // matching markdownlint's approach
        assert_eq!(warnings.len(), 1, "Should flag undefined reference");
        assert!(warnings[0].message.contains("something"));

        // Test GitHub alerts with proper references
        let alert_with_definition = r#"> [!NOTE]
> See [reference] for more details.

[reference]: https://example.com"#;
        let warnings = lint(alert_with_definition);

        // Should not flag anything - [!NOTE] is GitHub alert and [reference] is defined
        assert!(warnings.is_empty(), "Should not flag GitHub alerts or defined references");
    }
}