rumdl_lib/rules/
md040_fenced_code_language.rs

1use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
2use crate::utils::range_utils::{LineIndex, calculate_line_range};
3
/// Rule MD040: Fenced code blocks should have a language
///
/// The rule holds no state or configuration, so it is a unit struct.
///
/// See [docs/md040.md](../../docs/md040.md) for full documentation, configuration, and examples.
#[derive(Debug, Default, Clone)]
pub struct MD040FencedCodeLanguage;
10
11impl Rule for MD040FencedCodeLanguage {
12    fn name(&self) -> &'static str {
13        "MD040"
14    }
15
16    fn description(&self) -> &'static str {
17        "Code blocks should have a language specified"
18    }
19
20    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
21        let content = ctx.content;
22        let _line_index = LineIndex::new(content.to_string());
23
24        let mut warnings = Vec::new();
25
26        let mut in_code_block = false;
27        let mut current_fence_marker: Option<String> = None;
28        let mut opening_fence_indent: usize = 0;
29
30        // Pre-compute disabled state to avoid O(n²) complexity
31        let mut is_disabled = false;
32
33        for (i, line) in content.lines().enumerate() {
34            let trimmed = line.trim();
35
36            // Update disabled state incrementally
37            if let Some(rules) = crate::rule::parse_disable_comment(trimmed)
38                && (rules.is_empty() || rules.contains(&self.name()))
39            {
40                is_disabled = true;
41            }
42            if let Some(rules) = crate::rule::parse_enable_comment(trimmed)
43                && (rules.is_empty() || rules.contains(&self.name()))
44            {
45                is_disabled = false;
46            }
47
48            // Skip processing if rule is disabled
49            if is_disabled {
50                continue;
51            }
52
53            // Determine fence marker if this is a fence line
54            let fence_marker = if trimmed.starts_with("```") {
55                let backtick_count = trimmed.chars().take_while(|&c| c == '`').count();
56                if backtick_count >= 3 {
57                    Some("`".repeat(backtick_count))
58                } else {
59                    None
60                }
61            } else if trimmed.starts_with("~~~") {
62                let tilde_count = trimmed.chars().take_while(|&c| c == '~').count();
63                if tilde_count >= 3 {
64                    Some("~".repeat(tilde_count))
65                } else {
66                    None
67                }
68            } else {
69                None
70            };
71
72            if let Some(fence_marker) = fence_marker {
73                if in_code_block {
74                    // We're inside a code block, check if this closes it
75                    if let Some(ref current_marker) = current_fence_marker {
76                        let current_indent = line.len() - line.trim_start().len();
77                        // Only close if the fence marker exactly matches the opening marker AND has no content after
78                        // AND the indentation is not greater than the opening fence
79                        if fence_marker == *current_marker
80                            && trimmed[current_marker.len()..].trim().is_empty()
81                            && current_indent <= opening_fence_indent
82                        {
83                            // This closes the current code block
84                            in_code_block = false;
85                            current_fence_marker = None;
86                            opening_fence_indent = 0;
87                        }
88                        // else: This is content inside a code block, ignore completely
89                    }
90                } else {
91                    // We're outside a code block, this opens one
92                    // Check if language is specified
93                    let after_fence = trimmed[fence_marker.len()..].trim();
94
95                    // Check if it has MkDocs title attribute but no language
96                    // Pattern: ``` title="Title" (missing language)
97                    // Valid: ```python title="Title" or ```py title="Title"
98                    let has_title_only =
99                        ctx.flavor == crate::config::MarkdownFlavor::MkDocs && after_fence.starts_with("title=");
100
101                    if after_fence.is_empty() || has_title_only {
102                        // Calculate precise character range for the entire fence line that needs a language
103                        let (start_line, start_col, end_line, end_col) = calculate_line_range(i + 1, line);
104
105                        warnings.push(LintWarning {
106                            rule_name: Some(self.name()),
107                            line: start_line,
108                            column: start_col,
109                            end_line,
110                            end_column: end_col,
111                            message: "Code block (```) missing language".to_string(),
112                            severity: Severity::Warning,
113                            fix: Some(Fix {
114                                range: {
115                                    // Replace just the fence marker with fence+language
116                                    let trimmed_start = line.len() - line.trim_start().len();
117                                    let fence_len = fence_marker.len();
118                                    let line_start_byte = ctx.line_offsets.get(i).copied().unwrap_or(0);
119                                    let fence_start_byte = line_start_byte + trimmed_start;
120                                    let fence_end_byte = fence_start_byte + fence_len;
121                                    fence_start_byte..fence_end_byte
122                                },
123                                replacement: format!("{fence_marker}text"),
124                            }),
125                        });
126                    }
127
128                    in_code_block = true;
129                    current_fence_marker = Some(fence_marker);
130                    opening_fence_indent = line.len() - line.trim_start().len();
131                }
132            }
133            // If we're inside a code block and this line is not a fence, ignore it
134        }
135
136        Ok(warnings)
137    }
138
139    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
140        let content = ctx.content;
141        let _line_index = LineIndex::new(content.to_string());
142
143        let mut result = String::new();
144        let mut in_code_block = false;
145        let mut current_fence_marker: Option<String> = None;
146        let mut fence_needs_language = false;
147        let mut original_indent = String::new();
148        let mut opening_fence_indent: usize = 0;
149
150        let lines: Vec<&str> = content.lines().collect();
151
152        // Helper function to check if we're in a nested context
153        let is_in_nested_context = |line_idx: usize| -> bool {
154            // Look for blockquote or list context above this line
155            for i in (0..line_idx).rev() {
156                let line = lines.get(i).unwrap_or(&"");
157                let trimmed = line.trim();
158
159                // If we hit a blank line, check if context continues
160                if trimmed.is_empty() {
161                    continue;
162                }
163
164                // Check for blockquote markers
165                if line.trim_start().starts_with('>') {
166                    return true;
167                }
168
169                // Check for list markers with sufficient indentation
170                if line.len() - line.trim_start().len() >= 2 {
171                    let after_indent = line.trim_start();
172                    if after_indent.starts_with("- ")
173                        || after_indent.starts_with("* ")
174                        || after_indent.starts_with("+ ")
175                        || (after_indent.len() > 2
176                            && after_indent.as_bytes().first().is_some_and(|&b| b.is_ascii_digit())
177                            && after_indent.as_bytes().get(1) == Some(&b'.')
178                            && after_indent.as_bytes().get(2) == Some(&b' '))
179                    {
180                        return true;
181                    }
182                }
183
184                // If we find content that's not indented, we're not in nested context
185                if line.starts_with(|c: char| !c.is_whitespace()) {
186                    break;
187                }
188            }
189            false
190        };
191
192        // Pre-compute disabled state to avoid O(n²) complexity
193        let mut is_disabled = false;
194
195        for (i, line) in lines.iter().enumerate() {
196            let trimmed = line.trim();
197
198            // Update disabled state incrementally
199            if let Some(rules) = crate::rule::parse_disable_comment(trimmed)
200                && (rules.is_empty() || rules.contains(&self.name()))
201            {
202                is_disabled = true;
203            }
204            if let Some(rules) = crate::rule::parse_enable_comment(trimmed)
205                && (rules.is_empty() || rules.contains(&self.name()))
206            {
207                is_disabled = false;
208            }
209
210            // Skip processing if rule is disabled, preserve the line as-is
211            if is_disabled {
212                result.push_str(line);
213                result.push('\n');
214                continue;
215            }
216
217            // Determine fence marker if this is a fence line
218            let fence_marker = if trimmed.starts_with("```") {
219                let backtick_count = trimmed.chars().take_while(|&c| c == '`').count();
220                if backtick_count >= 3 {
221                    Some("`".repeat(backtick_count))
222                } else {
223                    None
224                }
225            } else if trimmed.starts_with("~~~") {
226                let tilde_count = trimmed.chars().take_while(|&c| c == '~').count();
227                if tilde_count >= 3 {
228                    Some("~".repeat(tilde_count))
229                } else {
230                    None
231                }
232            } else {
233                None
234            };
235
236            if let Some(fence_marker) = fence_marker {
237                if in_code_block {
238                    // We're inside a code block, check if this closes it
239                    if let Some(ref current_marker) = current_fence_marker {
240                        let current_indent = line.len() - line.trim_start().len();
241                        if fence_marker == *current_marker
242                            && trimmed[current_marker.len()..].trim().is_empty()
243                            && current_indent <= opening_fence_indent
244                        {
245                            // This closes the current code block
246                            if fence_needs_language {
247                                // Use the same indentation as the opening fence
248                                result.push_str(&format!("{original_indent}{trimmed}\n"));
249                            } else {
250                                // Preserve original line as-is
251                                result.push_str(line);
252                                result.push('\n');
253                            }
254                            in_code_block = false;
255                            current_fence_marker = None;
256                            fence_needs_language = false;
257                            original_indent.clear();
258                            opening_fence_indent = 0;
259                        } else {
260                            // This is content inside a code block (different fence marker) - preserve exactly as-is
261                            result.push_str(line);
262                            result.push('\n');
263                        }
264                    } else {
265                        // This shouldn't happen, but preserve as content
266                        result.push_str(line);
267                        result.push('\n');
268                    }
269                } else {
270                    // We're outside a code block, this opens one
271                    // Capture the original indentation
272                    let line_indent = line[..line.len() - line.trim_start().len()].to_string();
273
274                    // Add 'text' as default language for opening fence if no language specified
275                    let after_fence = trimmed[fence_marker.len()..].trim();
276
277                    // Check if it has MkDocs title attribute but no language
278                    let has_title_only =
279                        ctx.flavor == crate::config::MarkdownFlavor::MkDocs && after_fence.starts_with("title=");
280
281                    if after_fence.is_empty() || has_title_only {
282                        // Decide whether to preserve indentation based on context
283                        let should_preserve_indent = is_in_nested_context(i);
284
285                        if should_preserve_indent {
286                            // Preserve indentation for nested contexts
287                            original_indent = line_indent;
288                            if has_title_only {
289                                // Insert language before title attribute
290                                result.push_str(&format!("{original_indent}{fence_marker}text {after_fence}\n"));
291                            } else {
292                                result.push_str(&format!("{original_indent}{fence_marker}text\n"));
293                            }
294                        } else {
295                            // Remove indentation for standalone code blocks
296                            original_indent = String::new();
297                            if has_title_only {
298                                // Insert language before title attribute
299                                result.push_str(&format!("{fence_marker}text {after_fence}\n"));
300                            } else {
301                                result.push_str(&format!("{fence_marker}text\n"));
302                            }
303                        }
304                        fence_needs_language = true;
305                    } else {
306                        // Keep original line as-is since it already has a language
307                        result.push_str(line);
308                        result.push('\n');
309                        fence_needs_language = false;
310                    }
311
312                    in_code_block = true;
313                    current_fence_marker = Some(fence_marker);
314                    opening_fence_indent = line.len() - line.trim_start().len();
315                }
316            } else if in_code_block {
317                // We're inside a code block and this is not a fence line - preserve exactly as-is
318                result.push_str(line);
319                result.push('\n');
320            } else {
321                // We're outside code blocks and this is not a fence line - preserve as-is
322                result.push_str(line);
323                result.push('\n');
324            }
325        }
326
327        // Remove trailing newline if the original content didn't have one
328        if !content.ends_with('\n') {
329            result.pop();
330        }
331
332        Ok(result)
333    }
334
335    /// Get the category of this rule for selective processing
336    fn category(&self) -> RuleCategory {
337        RuleCategory::CodeBlock
338    }
339
340    /// Check if this rule should be skipped
341    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
342        let content = ctx.content;
343        content.is_empty() || (!content.contains("```") && !content.contains("~~~"))
344    }
345
346    fn as_any(&self) -> &dyn std::any::Any {
347        self
348    }
349
350    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
351    where
352        Self: Sized,
353    {
354        Box::new(MD040FencedCodeLanguage)
355    }
356}
357
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lint_context::LintContext;

    /// Runs `check` on `content` using the standard Markdown flavor.
    fn run_check(content: &str) -> LintResult {
        let rule = MD040FencedCodeLanguage;
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        rule.check(&ctx)
    }

    /// Runs `fix` on `content` using the standard Markdown flavor.
    fn run_fix(content: &str) -> Result<String, LintError> {
        let rule = MD040FencedCodeLanguage;
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        rule.fix(&ctx)
    }

    #[test]
    fn test_code_blocks_with_language_specified() {
        // Basic test with language
        let content = r#"# Test

```python
print("Hello, world!")
```

```javascript
console.log("Hello!");
```
"#;
        let result = run_check(content).unwrap();
        assert!(result.is_empty(), "No warnings expected for code blocks with language");
    }

    #[test]
    fn test_code_blocks_without_language() {
        let content = r#"# Test

```
print("Hello, world!")
```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].message, "Code block (```) missing language");
        assert_eq!(result[0].line, 3);
    }

    #[test]
    fn test_code_blocks_with_empty_language() {
        // Whitespace after the fence is not a language. The spaces are written with
        // escapes so they stay visible and survive whitespace-stripping tools.
        let content = "# Test\n\n```   \nprint(\"Hello, world!\")\n```\n";
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].message, "Code block (```) missing language");
        assert_eq!(result[0].line, 3);
    }

    #[test]
    fn test_indented_code_blocks_should_be_ignored() {
        // Indented code blocks (4 spaces) should not trigger the rule
        let content = r#"# Test

    This is an indented code block
    It should not trigger MD040
"#;
        let result = run_check(content).unwrap();
        assert!(result.is_empty(), "Indented code blocks should be ignored");
    }

    #[test]
    fn test_inline_code_spans_should_be_ignored() {
        let content = r#"# Test

This is `inline code` and should not trigger warnings.

Use the `print()` function.
"#;
        let result = run_check(content).unwrap();
        assert!(result.is_empty(), "Inline code spans should be ignored");
    }

    #[test]
    fn test_tildes_vs_backticks_for_fences() {
        // Tilde fences without language (the message text is the same for both fence kinds)
        let content_tildes_no_lang = r#"# Test

~~~
code here
~~~
"#;
        let result = run_check(content_tildes_no_lang).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].message, "Code block (```) missing language");

        // Tilde fences with language
        let content_tildes_with_lang = r#"# Test

~~~python
code here
~~~
"#;
        let result = run_check(content_tildes_with_lang).unwrap();
        assert!(result.is_empty());

        // Mixed fences
        let content_mixed = r#"# Test

```python
code here
```

~~~javascript
more code
~~~

```
no language
```

~~~
also no language
~~~
"#;
        let result = run_check(content_mixed).unwrap();
        assert_eq!(result.len(), 2);
    }

    #[test]
    fn test_language_with_additional_parameters() {
        let content = r#"# Test

```python {highlight=[1,2]}
print("Line 1")
print("Line 2")
```

```javascript {.line-numbers startFrom="10"}
console.log("Hello");
```

```ruby {data-line="1,3-4"}
puts "Hello"
puts "World"
puts "!"
```
"#;
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "Code blocks with language and parameters should pass"
        );
    }

    #[test]
    fn test_multiple_code_blocks_in_document() {
        let content = r#"# Test Document

First block without language:
```
code here
```

Second block with language:
```python
print("hello")
```

Third block without language:
```
more code
```

Fourth block with language:
```javascript
console.log("test");
```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].line, 4);
        assert_eq!(result[1].line, 14);
    }

    #[test]
    fn test_nested_code_blocks_in_lists() {
        let content = r#"# Test

- Item 1
  ```python
  print("nested with language")
  ```

- Item 2
  ```
  nested without language
  ```

- Item 3
  - Nested item
    ```javascript
    console.log("deeply nested");
    ```

  - Another nested
    ```
    no language
    ```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 2);
        // Check that it detects the blocks without language
        assert_eq!(result[0].line, 9);
        assert_eq!(result[1].line, 20);
    }

    #[test]
    fn test_code_blocks_in_blockquotes() {
        let content = r#"# Test

> This is a blockquote
> ```python
> print("with language")
> ```

> Another blockquote
> ```
> without language
> ```
"#;
        let result = run_check(content).unwrap();
        // The implementation doesn't detect code blocks inside blockquotes:
        // a line starting with `>` never starts with a fence.
        assert_eq!(result.len(), 0);
    }

    #[test]
    fn test_fix_method_adds_text_language() {
        let content = r#"# Test

```
code without language
```

```python
already has language
```

```
another block without
```
"#;
        let fixed = run_fix(content).unwrap();
        assert!(fixed.contains("```text"));
        assert!(fixed.contains("```python"));
        assert_eq!(fixed.matches("```text").count(), 2);
    }

    #[test]
    fn test_fix_preserves_indentation() {
        let content = r#"# Test

- List item
  ```
  indented code block
  ```
"#;
        let fixed = run_fix(content).unwrap();
        // Only the inserted language and the untouched block content are asserted.
        // The indentation of the fence lines themselves is deliberately not pinned
        // here: whether it is kept depends on the nested-context detection in `fix`.
        assert!(fixed.contains("```text"));
        assert!(fixed.contains("  indented code block"));
    }

    #[test]
    fn test_fix_with_tilde_fences() {
        let content = r#"# Test

~~~
code with tildes
~~~
"#;
        let fixed = run_fix(content).unwrap();
        assert!(fixed.contains("~~~text"));
    }

    #[test]
    fn test_longer_fence_markers() {
        let content = r#"# Test

````
code with four backticks
````

`````python
code with five backticks and language
`````

~~~~~~
code with six tildes
~~~~~~
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 2);

        let fixed = run_fix(content).unwrap();
        assert!(fixed.contains("````text"));
        assert!(fixed.contains("~~~~~~text"));
        assert!(fixed.contains("`````python"));
    }

    #[test]
    fn test_nested_code_blocks_different_markers() {
        let content = r#"# Test

````markdown
This is a markdown block

```python
# This is nested code
print("hello")
```

More markdown
````
"#;
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "Nested code blocks with different markers should not trigger warnings"
        );
    }

    #[test]
    fn test_disable_enable_comments() {
        let content = r#"# Test

<!-- rumdl-disable MD040 -->
```
this should not trigger warning
```
<!-- rumdl-enable MD040 -->

```
this should trigger warning
```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].line, 9);
    }

    #[test]
    fn test_fence_with_language_only_on_closing() {
        // Edge case: a fence followed by text cannot close a block, so only the
        // opening fence is reported.
        let content = r#"# Test

```
code
```python
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);
    }

    #[test]
    fn test_incomplete_code_blocks() {
        // Unclosed code block with language
        let content = r#"# Test

```python
this code block is not closed"#;
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "Unclosed code blocks with language should not trigger warnings"
        );

        // Unclosed code block without language
        let content_no_lang = r#"# Test

```
this code block is not closed"#;
        let result = run_check(content_no_lang).unwrap();
        assert_eq!(result.len(), 1);
    }

    #[test]
    fn test_fix_preserves_original_formatting() {
        let content = r#"# Test

```
code
```

No newline at end"#;
        let fixed = run_fix(content).unwrap();
        assert!(!fixed.ends_with('\n'), "Fix should preserve lack of trailing newline");

        let content_with_newline = "# Test\n\n```\ncode\n```\n";
        let fixed = run_fix(content_with_newline).unwrap();
        assert!(fixed.ends_with('\n'), "Fix should preserve trailing newline");
    }

    #[test]
    fn test_edge_case_backticks_in_content() {
        let content = r#"# Test

```javascript
console.log(`template string with backticks`);
// This line has ``` in a comment
```
"#;
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "Backticks inside code blocks should not affect parsing"
        );
    }

    #[test]
    fn test_mkdocs_title_without_language() {
        let rule = MD040FencedCodeLanguage;
        let content = "``` title=\"Example\"\ncode\n```\n";

        // Under MkDocs a fence followed only by a title attribute lacks a language.
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::MkDocs);
        assert_eq!(rule.check(&ctx).unwrap().len(), 1);
        assert_eq!(rule.fix(&ctx).unwrap(), "```text title=\"Example\"\ncode\n```\n");

        // Under the standard flavor any text after the fence counts as a language.
        assert!(run_check(content).unwrap().is_empty());
    }

    #[test]
    fn test_empty_document() {
        let content = "";
        let result = run_check(content).unwrap();
        assert!(result.is_empty());
    }

    #[test]
    fn test_should_skip_optimization() {
        let rule = MD040FencedCodeLanguage;

        // Document without code fences should skip
        let ctx = LintContext::new("# Just a header\n\nSome text", crate::config::MarkdownFlavor::Standard);
        assert!(rule.should_skip(&ctx));

        // Document with backtick fences should not skip
        let ctx = LintContext::new("```\ncode\n```", crate::config::MarkdownFlavor::Standard);
        assert!(!rule.should_skip(&ctx));

        // Document with tilde fences should not skip
        let ctx = LintContext::new("~~~\ncode\n~~~", crate::config::MarkdownFlavor::Standard);
        assert!(!rule.should_skip(&ctx));

        // Empty document should skip
        let ctx = LintContext::new("", crate::config::MarkdownFlavor::Standard);
        assert!(rule.should_skip(&ctx));
    }
}