mdbook_lint_core/rules/standard/
md052.rs

1//! MD052 - Reference links and images should use a label that is defined
2//!
3//! This rule checks for reference links and images that use undefined labels.
4//! Uses byte-by-byte parsing for accurate context-aware detection.
5//!
6//! ## Correct
7//!
8//! ```markdown
9//! [Link text][label]
10//! [Collapsed][]
11//!
12//! [label]: https://example.com
13//! [collapsed]: https://example.com
14//! ```
15//!
16//! ## Incorrect
17//!
18//! ```markdown
19//! [Link text][undefined-label]
20//! [Collapsed][]
21//!
22//! [defined]: https://example.com
23//! ```
24
25use crate::error::Result;
26use crate::{
27    Document, Violation,
28    rule::{Rule, RuleCategory, RuleMetadata},
29    violation::Severity,
30};
31use comrak::nodes::AstNode;
32use std::collections::HashSet;
33
/// MD052 - Reference links and images should use a label that is defined
#[derive(Debug, Clone)]
pub struct MD052 {
    /// Labels that are never reported, even when no definition exists for them.
    ignored_labels: Vec<String>,
    /// Whether shortcut syntax (`[label]`) should be included.
    // The flag is stored by the constructor and its setter but is not read by
    // any check in this file, hence the lint exemption.
    #[allow(dead_code)]
    shortcut_syntax: bool,
}
40
41impl Default for MD052 {
42    fn default() -> Self {
43        Self::new()
44    }
45}
46
47impl MD052 {
48    /// Create a new MD052 rule instance
49    pub fn new() -> Self {
50        Self {
51            ignored_labels: vec!["x".to_string()], // Default ignores checkbox syntax
52            shortcut_syntax: false,
53        }
54    }
55
56    /// Set the list of ignored labels
57    #[allow(dead_code)]
58    pub fn ignored_labels(mut self, labels: Vec<String>) -> Self {
59        self.ignored_labels = labels;
60        self
61    }
62
63    /// Set whether to include shortcut syntax
64    #[allow(dead_code)]
65    pub fn shortcut_syntax(mut self, include: bool) -> Self {
66        self.shortcut_syntax = include;
67        self
68    }
69
70    /// Parse reference definitions from document content
71    fn collect_defined_labels(&self, document: &Document) -> HashSet<String> {
72        let mut definitions = HashSet::new();
73        let mut parser = RefDefParser::new(document.content.as_bytes());
74
75        while let Some(def) = parser.next_definition() {
76            definitions.insert(def.label.to_lowercase());
77        }
78
79        definitions
80    }
81
82    /// Check for undefined reference labels using byte parsing
83    fn check_reference_labels(&self, document: &Document) -> Vec<Violation> {
84        let mut violations = Vec::new();
85        let defined_labels = self.collect_defined_labels(document);
86        let mut parser = LinkParser::new(document.content.as_bytes());
87
88        while let Some(link) = parser.next_link() {
89            match link {
90                LinkType::Reference {
91                    label,
92                    line,
93                    column,
94                } => {
95                    let label_lower = label.to_lowercase();
96                    if !self.ignored_labels.contains(&label_lower)
97                        && !defined_labels.contains(&label_lower)
98                    {
99                        violations.push(self.create_violation(
100                            format!("Reference link uses undefined label '{label}'"),
101                            line,
102                            column,
103                            Severity::Error,
104                        ));
105                    }
106                }
107                LinkType::Image {
108                    label,
109                    line,
110                    column,
111                } => {
112                    let label_lower = label.to_lowercase();
113                    if !self.ignored_labels.contains(&label_lower)
114                        && !defined_labels.contains(&label_lower)
115                    {
116                        violations.push(self.create_violation(
117                            format!("Reference image uses undefined label '{label}'"),
118                            line,
119                            column,
120                            Severity::Error,
121                        ));
122                    }
123                }
124                _ => {} // Ignore inline links
125            }
126        }
127
128        violations
129    }
130}
131
132impl Rule for MD052 {
133    fn id(&self) -> &'static str {
134        "MD052"
135    }
136
137    fn name(&self) -> &'static str {
138        "reference-links-images"
139    }
140
141    fn description(&self) -> &'static str {
142        "Reference links and images should use a label that is defined"
143    }
144
145    fn metadata(&self) -> RuleMetadata {
146        RuleMetadata::stable(RuleCategory::Links)
147    }
148
149    fn check_with_ast<'a>(
150        &self,
151        document: &Document,
152        _ast: Option<&'a AstNode<'a>>,
153    ) -> Result<Vec<Violation>> {
154        // This rule doesn't need AST - works entirely with byte parsing
155        let violations = self.check_reference_labels(document);
156        Ok(violations)
157    }
158}
159
/// Reference definition found in the document
#[derive(Debug)]
struct RefDefinition {
    /// Label text between the brackets, exactly as written (no case folding).
    label: String,
}

/// Parser for reference definitions like `[label]: url`
///
/// The input is scanned line by line. A definition is only recognised when
/// `[` is the first non-blank byte of a line, and lines inside backtick-fenced
/// code blocks are skipped.
struct RefDefParser<'a> {
    /// Raw document bytes.
    input: &'a [u8],
    /// Offset of the next byte to examine.
    pos: usize,
    /// Bumped for every line consumed by `skip_to_next_line`, starting at 1.
    line: usize,
    /// True while the scan is between an opening and a closing backtick fence.
    in_code_block: bool,
}

impl<'a> RefDefParser<'a> {
    /// Start parsing at the beginning of `input`.
    fn new(input: &'a [u8]) -> Self {
        Self {
            input,
            pos: 0,
            line: 1,
            in_code_block: false,
        }
    }

    /// Return the next reference definition, or `None` once the input is
    /// exhausted.
    ///
    /// Every loop iteration starts at the beginning of a line and consumes
    /// that whole line, so text in the middle of a line is never mistaken for
    /// a definition after a failed parse.
    fn next_definition(&mut self) -> Option<RefDefinition> {
        while self.pos < self.input.len() {
            // Skip whitespace at beginning of line
            self.skip_whitespace();

            let definition = if self.at_code_fence() {
                // Same fence notion as LinkParser::is_code_fence: a line whose
                // first non-blank bytes are three or more backticks toggles
                // the code-block state.
                self.in_code_block = !self.in_code_block;
                None
            } else if !self.in_code_block && self.current_byte() == Some(b'[') {
                self.try_parse_definition()
            } else {
                None
            };

            // Whatever happened, the rest of this line holds nothing of interest.
            self.skip_to_next_line();

            if definition.is_some() {
                return definition;
            }
        }
        None
    }

    /// Try to parse `[label]:` at the current position.
    ///
    /// On success the position is left just after the colon. On failure the
    /// position is restored to where it was on entry.
    fn try_parse_definition(&mut self) -> Option<RefDefinition> {
        let start_pos = self.pos;
        let definition = self.parse_definition_head();
        if definition.is_none() {
            self.pos = start_pos;
        }
        definition
    }

    /// Parse `[label]:` followed by whitespace, a line ending or end of input.
    /// Leaves the position wherever parsing stopped.
    fn parse_definition_head(&mut self) -> Option<RefDefinition> {
        // Skip '['
        self.pos += 1;

        let label = self.parse_ref_label()?;

        // Expect "]:" right after the label
        for expected in [b']', b':'] {
            if self.current_byte() != Some(expected) {
                return None;
            }
            self.pos += 1;
        }

        // Must have whitespace or end of line/input after ':'
        match self.current_byte() {
            None | Some(b' ') | Some(b'\t') | Some(b'\n') | Some(b'\r') => {
                Some(RefDefinition { label })
            }
            Some(_) => None,
        }
    }

    /// Read label bytes up to, but not including, the closing `]`.
    ///
    /// Returns `None` for an empty label, a label containing a line ending,
    /// or when the input ends first.
    fn parse_ref_label(&mut self) -> Option<String> {
        let mut label = String::new();

        while let Some(byte) = self.current_byte() {
            match byte {
                b']' => return if label.is_empty() { None } else { Some(label) },
                b'\n' | b'\r' => return None, // Newline in label
                _ => {
                    // NOTE(review): each byte is widened to a char on its own,
                    // so non-ASCII text is not decoded as UTF-8. LinkParser
                    // builds its labels the same way, which keeps lookups
                    // consistent; changing this requires changing both.
                    label.push(byte as char);
                    self.pos += 1;
                }
            }
        }
        None
    }

    /// Advance past the next `\n`, or to the end of input if there is none.
    fn skip_to_next_line(&mut self) {
        while let Some(byte) = self.current_byte() {
            self.pos += 1;
            if byte == b'\n' {
                self.line += 1;
                break;
            }
        }
    }

    /// Advance over spaces and tabs (never over line endings).
    fn skip_whitespace(&mut self) {
        while let Some(b' ') | Some(b'\t') = self.current_byte() {
            self.pos += 1;
        }
    }

    /// True when at least three backticks start at the current position.
    fn at_code_fence(&self) -> bool {
        self.input
            .get(self.pos..)
            .map_or(false, |rest| rest.iter().take_while(|&&b| b == b'`').count() >= 3)
    }

    /// Byte at the current position, if any.
    fn current_byte(&self) -> Option<u8> {
        self.input.get(self.pos).copied()
    }
}
291
/// Link types found in the document
#[derive(Debug, PartialEq, Eq)]
enum LinkType {
    /// Reference link: `[text][label]`, collapsed `[text][]` or shortcut `[label]`.
    Reference {
        /// Label to look up; the link text for collapsed and shortcut forms.
        label: String,
        /// 1-based line of the opening `[`.
        line: usize,
        /// 1-based byte column of the opening `[`.
        column: usize,
    },
    /// Reference image: `![alt][label]`, collapsed `![alt][]` or shortcut `![alt]`.
    Image {
        /// Label to look up; the alt text for collapsed and shortcut forms.
        label: String,
        /// 1-based line of the leading `!`.
        line: usize,
        /// 1-based byte column of the leading `!`.
        column: usize,
    },
    Inline, // We don't care about inline links for this rule
}

/// Parser for links in markdown content
///
/// Scans the raw bytes once, skipping code spans and backtick-fenced code
/// blocks, and reports every link or image it recognises.
///
/// Invariant kept by every method: `line_start <= pos`, and `line` /
/// `line_start` always describe the line that contains `pos`.
struct LinkParser<'a> {
    /// Raw document bytes.
    input: &'a [u8],
    /// Offset of the next byte to examine.
    pos: usize,
    /// 1-based number of the line containing `pos`.
    line: usize,
    /// Offset of the first byte of the line containing `pos`.
    line_start: usize,
    /// True while the scan is between an opening and a closing backtick fence.
    in_code_block: bool,
}

impl<'a> LinkParser<'a> {
    /// Start parsing at the beginning of `input`.
    fn new(input: &'a [u8]) -> Self {
        Self {
            input,
            pos: 0,
            line: 1,
            line_start: 0,
            in_code_block: false,
        }
    }

    /// Return the next link or image, or `None` once the input is exhausted.
    ///
    /// `try_parse_link` and `try_parse_image` always move `pos` forward, even
    /// when they return `None`, so this loop never adds an extra step after a
    /// failed attempt (doing so used to skip newlines and code-span openers).
    fn next_link(&mut self) -> Option<LinkType> {
        while let Some(byte) = self.current_byte() {
            match byte {
                b'`' => {
                    if self.is_code_fence() {
                        self.toggle_code_block();
                    } else if self.in_code_block {
                        // Inside a fenced block a backtick is plain content.
                        // Treating it as a code-span opener could swallow the
                        // closing fence and hide every later link.
                        self.pos += 1;
                    } else {
                        self.skip_code_span();
                    }
                }
                b'\\' if !self.in_code_block => {
                    // A backslash escapes a following ASCII punctuation byte,
                    // so `\[`, `\!` and `` \` `` do not start any syntax.
                    // Line endings are not punctuation and stay visible to
                    // the newline arm below.
                    let escapes_next =
                        matches!(self.peek_byte(1), Some(next) if next.is_ascii_punctuation());
                    self.pos += if escapes_next { 2 } else { 1 };
                }
                b'[' if !self.in_code_block => {
                    if let Some(link) = self.try_parse_link() {
                        return Some(link);
                    }
                }
                b'!' if !self.in_code_block && self.peek_byte(1) == Some(b'[') => {
                    if let Some(image) = self.try_parse_image() {
                        return Some(image);
                    }
                }
                b'\n' => {
                    self.pos += 1;
                    self.line += 1;
                    self.line_start = self.pos;
                }
                _ => self.pos += 1,
            }
        }
        None
    }

    /// Parse a link whose `[` is at the current position.
    ///
    /// Returns `Inline` for `[text](...)`, `Reference` for `[text][label]`,
    /// `[text][]` and for `[text]` when `is_likely_reference` accepts what
    /// follows. On `None` the position has still advanced: to just after `[`
    /// when the text has no closing bracket on this line, or to just after
    /// `]` when the bracket pair is not taken as a reference.
    fn try_parse_link(&mut self) -> Option<LinkType> {
        let start_pos = self.pos;
        let line = self.line;
        let column = self.column_at(start_pos);

        // Skip '['
        self.pos = start_pos + 1;

        let text = match self.parse_link_text() {
            Some(text) => text,
            None => {
                // Resume right after '[' so the rest of the line, including
                // its newline, is scanned normally.
                self.pos = start_pos + 1;
                return None;
            }
        };

        // parse_link_text only succeeds when positioned on the closing ']'.
        self.pos += 1;

        match self.current_byte() {
            Some(b'(') => {
                // Inline link [text](url) - skip it
                self.skip_inline_url();
                Some(LinkType::Inline)
            }
            Some(b'[') => {
                // Reference link [text][label] or collapsed [text][]
                self.pos += 1;
                let label = self.parse_reference_label().unwrap_or_default();

                // An empty label means the text itself is the label.
                let label = if label.is_empty() { text } else { label };

                Some(LinkType::Reference {
                    label,
                    line,
                    column,
                })
            }
            // Could be a shortcut reference [label]; only accepted when it is
            // followed by whitespace, punctuation or the end of input.
            _ if self.is_likely_reference() => Some(LinkType::Reference {
                label: text,
                line,
                column,
            }),
            _ => None,
        }
    }

    /// Parse an image whose `![` is at the current position.
    ///
    /// Returns `Inline` for `![alt](...)` and `Image` for `![alt][label]`,
    /// `![alt][]` and `![alt]`. On `None` (alt text not closed on this line)
    /// the position is left just after the `!`.
    fn try_parse_image(&mut self) -> Option<LinkType> {
        let start_pos = self.pos;
        let line = self.line;
        let column = self.column_at(start_pos);

        // Skip '!['
        self.pos = start_pos + 2;

        let alt_text = match self.parse_link_text() {
            Some(alt_text) => alt_text,
            None => {
                self.pos = start_pos + 1;
                return None;
            }
        };

        // parse_link_text only succeeds when positioned on the closing ']'.
        self.pos += 1;

        match self.current_byte() {
            Some(b'(') => {
                // Inline image ![alt](url) - skip it
                self.skip_inline_url();
                Some(LinkType::Inline)
            }
            Some(b'[') => {
                // Reference image ![alt][label] or collapsed ![alt][]
                self.pos += 1;
                let label = self.parse_reference_label().unwrap_or_default();

                // An empty label means the alt text itself is the label.
                let label = if label.is_empty() { alt_text } else { label };

                Some(LinkType::Image {
                    label,
                    line,
                    column,
                })
            }
            // Shortcut reference ![label]
            _ => Some(LinkType::Image {
                label: alt_text,
                line,
                column,
            }),
        }
    }

    /// Collect link text up to the matching `]`, which is not consumed.
    ///
    /// Nested brackets are balanced and backslash escapes are copied through
    /// verbatim. Returns `None`, leaving the position where scanning stopped,
    /// when a newline or the end of input comes first.
    fn parse_link_text(&mut self) -> Option<String> {
        let mut text = String::new();
        let mut bracket_depth = 0usize;

        while let Some(byte) = self.current_byte() {
            match byte {
                b']' if bracket_depth == 0 => return Some(text),
                b'\n' => return None, // Newline breaks link
                b'\\' => {
                    text.push('\\');
                    self.pos += 1;
                    match self.current_byte() {
                        // Never hide a newline behind an escape; the next
                        // iteration sees it and ends the link.
                        Some(b'\n') | None => {}
                        Some(escaped) => {
                            text.push(escaped as char);
                            self.pos += 1;
                        }
                    }
                    continue;
                }
                b'[' => bracket_depth += 1,
                b']' => bracket_depth -= 1,
                _ => {}
            }
            // NOTE(review): each byte is widened to a char on its own, so
            // non-ASCII text is not decoded as UTF-8. RefDefParser builds its
            // labels the same way, which keeps lookups consistent; changing
            // this requires changing both.
            text.push(byte as char);
            self.pos += 1;
        }
        None
    }

    /// Collect a reference label up to and including the closing `]`.
    ///
    /// An empty label is returned as `Some("")` (collapsed reference).
    /// Returns `None` when a newline or the end of input comes first.
    fn parse_reference_label(&mut self) -> Option<String> {
        let mut label = String::new();

        while let Some(byte) = self.current_byte() {
            match byte {
                b']' => {
                    self.pos += 1;
                    return Some(label); // Return even if empty for collapsed refs
                }
                b'\n' => return None, // Newline breaks reference
                _ => {
                    label.push(byte as char);
                    self.pos += 1;
                }
            }
        }
        None
    }

    /// Skip the parenthesised part of an inline link or image.
    ///
    /// Parentheses are balanced and escaped bytes are skipped. Line endings
    /// crossed on the way are counted. When the closing `)` is never found,
    /// position and line state are rewound to just after the opening `(`, so
    /// an unbalanced parenthesis cannot hide the rest of the document.
    fn skip_inline_url(&mut self) {
        // Skip '('
        if self.current_byte() == Some(b'(') {
            self.pos += 1;
        }

        let resume_pos = self.pos;
        let saved_line = self.line;
        let saved_line_start = self.line_start;

        let mut paren_depth = 1usize;
        while let Some(byte) = self.current_byte() {
            self.pos += 1;
            match byte {
                b'(' => paren_depth += 1,
                b')' => {
                    paren_depth -= 1;
                    if paren_depth == 0 {
                        return;
                    }
                }
                b'\\' => {
                    // Skip the escaped byte, but leave a newline to be counted.
                    if matches!(self.current_byte(), Some(next) if next != b'\n') {
                        self.pos += 1;
                    }
                }
                b'\n' => {
                    self.line += 1;
                    self.line_start = self.pos;
                }
                _ => {}
            }
        }

        // No closing ')': undo the scan.
        self.pos = resume_pos;
        self.line = saved_line;
        self.line_start = saved_line_start;
    }

    /// Skip a code span whose opening backtick run starts at the current
    /// position.
    ///
    /// The span ends at the next backtick run of exactly the same length.
    /// When there is none, the opening run is treated as literal text: the
    /// position moves just past it, and the line state is restored to what it
    /// was on entry.
    fn skip_code_span(&mut self) {
        let start = self.pos;
        let saved_line = self.line;
        let saved_line_start = self.line_start;

        let opener_len = self.backtick_run_len(start);
        self.pos = start + opener_len;

        while let Some(byte) = self.current_byte() {
            if byte == b'`' {
                let run_len = self.backtick_run_len(self.pos);
                self.pos += run_len;
                if run_len == opener_len {
                    return;
                }
            } else {
                self.pos += 1;
                if byte == b'\n' {
                    self.line += 1;
                    self.line_start = self.pos;
                }
            }
        }

        // Unclosed: rewind everything the look-ahead changed. Keeping the
        // advanced line state would corrupt later line numbers and could make
        // `line_start` exceed `pos`.
        self.pos = start + opener_len;
        self.line = saved_line;
        self.line_start = saved_line_start;
    }

    /// Number of consecutive backticks starting at `from`.
    fn backtick_run_len(&self, from: usize) -> usize {
        self.input
            .get(from..)
            .map_or(0, |rest| rest.iter().take_while(|&&b| b == b'`').count())
    }

    /// True when the current position starts a code fence: only spaces or
    /// tabs precede it on the line and at least three backticks follow.
    fn is_code_fence(&self) -> bool {
        let only_indent_before = self
            .input
            .get(self.line_start..self.pos)
            .map_or(true, |prefix| prefix.iter().all(|&b| b == b' ' || b == b'\t'));

        only_indent_before && self.backtick_run_len(self.pos) >= 3
    }

    /// Flip the code-block state and skip the rest of the fence line.
    fn toggle_code_block(&mut self) {
        self.in_code_block = !self.in_code_block;
        // Skip the entire code fence line
        while let Some(byte) = self.current_byte() {
            self.pos += 1;
            if byte == b'\n' {
                self.line += 1;
                self.line_start = self.pos;
                break;
            }
        }
    }

    /// Heuristic for shortcut references: accept when the bracket pair is
    /// followed by whitespace, common punctuation or the end of input.
    fn is_likely_reference(&self) -> bool {
        match self.current_byte() {
            None => true, // End of file
            Some(next) => matches!(
                next,
                b' ' | b'\t' | b'\n' | b'\r' | b'.' | b',' | b';' | b':' | b'!' | b'?'
            ),
        }
    }

    /// 1-based byte column of `pos` on the current line.
    fn column_at(&self, pos: usize) -> usize {
        pos.saturating_sub(self.line_start) + 1
    }

    /// Byte at the current position, if any.
    fn current_byte(&self) -> Option<u8> {
        self.input.get(self.pos).copied()
    }

    /// Byte `offset` positions after the current one, if any.
    fn peek_byte(&self, offset: usize) -> Option<u8> {
        self.input.get(self.pos + offset).copied()
    }
}
660
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_helpers::{
        assert_no_violations, assert_single_violation, assert_violation_count,
    };

    /// Full and collapsed references with matching definitions are accepted.
    #[test]
    fn test_valid_references() {
        let content = concat!(
            "[Full reference][label]\n",
            "[Collapsed reference][]\n",
            "\n",
            "[label]: https://example.com\n",
            "[collapsed reference]: https://example.com\n",
        );

        assert_no_violations(MD052::new(), content);
    }

    /// A full reference to a missing label is reported on its own line.
    #[test]
    fn test_undefined_reference() {
        let content = concat!(
            "[Link text][undefined-label]\n",
            "\n",
            "[defined]: https://example.com\n",
        );

        let violation = assert_single_violation(MD052::new(), content);
        assert_eq!(violation.line, 1);
        assert!(violation.message.contains("undefined-label"));
    }

    /// The default ignore list contains `x`, so checkbox-like labels pass.
    #[test]
    fn test_ignored_labels() {
        let content = "[Checkbox][x]\n";

        assert_no_violations(MD052::new(), content); // 'x' is ignored by default
    }

    /// Label lookup ignores letter case.
    #[test]
    fn test_case_insensitive_matching() {
        let content = concat!(
            "[Link][LABEL]\n",
            "\n",
            "[label]: https://example.com\n",
        );

        assert_no_violations(MD052::new(), content);
    }

    /// A collapsed reference resolves through its link text.
    #[test]
    fn test_collapsed_reference() {
        let content = concat!(
            "[Label][]\n",
            "\n",
            "[label]: https://example.com\n",
        );

        assert_no_violations(MD052::new(), content);
    }

    /// Each undefined reference yields its own violation, in document order.
    #[test]
    fn test_multiple_undefined_references() {
        let content = concat!(
            "[Link 1][undefined1]\n",
            "[Link 2][undefined2]\n",
            "\n",
            "[defined]: https://example.com\n",
        );

        let violations = assert_violation_count(MD052::new(), content, 2);
        assert!(violations[0].message.contains("undefined1"));
        assert!(violations[1].message.contains("undefined2"));
    }

    /// Reference images are checked like reference links.
    #[test]
    fn test_reference_images() {
        let content = concat!(
            "![Alt text][undefined-image]\n",
            "\n",
            "[defined]: https://example.com\n",
        );

        let violation = assert_single_violation(MD052::new(), content);
        assert_eq!(violation.line, 1);
        assert!(violation.message.contains("undefined-image"));
    }

    /// Inline links and images never need a definition.
    #[test]
    fn test_inline_links_ignored() {
        let content = concat!(
            "[Inline link](https://example.com)\n",
            "![Inline image](image.png)\n",
        );

        assert_no_violations(MD052::new(), content);
    }

    /// Link-like text inside a code span is not reported.
    #[test]
    fn test_code_spans_ignored() {
        let content = concat!(
            "`[not a link][label]`\n",
            "\n",
            "[label]: https://example.com\n",
        );

        assert_no_violations(MD052::new(), content);
    }

    /// Link-like text inside a fenced code block is not reported.
    #[test]
    fn test_code_blocks_ignored() {
        let content = concat!(
            "```\n",
            "[not a link][undefined]\n",
            "```\n",
            "\n",
            "[defined]: https://example.com\n",
        );

        assert_no_violations(MD052::new(), content);
    }

    /// Brackets nested inside the link text do not end the text early.
    #[test]
    fn test_nested_brackets() {
        let content = concat!(
            "[Link with [nested] text][label]\n",
            "\n",
            "[label]: https://example.com\n",
        );

        assert_no_violations(MD052::new(), content);
    }

    /// Backslash-escaped brackets followed by a defined label pass.
    #[test]
    fn test_escaped_brackets() {
        let content = concat!(
            "\\[Not a link\\][label]\n",
            "\n",
            "[label]: https://example.com\n",
        );

        assert_no_violations(MD052::new(), content);
    }

    /// A shortcut reference with a matching definition is accepted.
    #[test]
    fn test_shortcut_references() {
        let content = concat!(
            "[label] is a shortcut reference.\n",
            "\n",
            "[label]: https://example.com\n",
        );

        assert_no_violations(MD052::new(), content);
    }
}
803        assert_no_violations(MD052::new(), content);
804    }
805}