rumdl_lib/rules/
md040_fenced_code_language.rs

1use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
2use crate::utils::range_utils::{LineIndex, calculate_line_range};
3
/// Rule MD040: Fenced code blocks should have a language
///
/// Flags opening code fences (three or more backticks or tildes) that are not followed by a
/// language. With the MkDocs flavor, a fence whose info string starts with `title=` is also
/// treated as missing a language. The automatic fix inserts `text` as the language.
///
/// The rule is a unit struct: it has no fields and `from_config` ignores the configuration.
///
/// See [docs/md040.md](../../docs/md040.md) for full documentation, configuration, and examples.
#[derive(Debug, Default, Clone)]
pub struct MD040FencedCodeLanguage;
10
11impl Rule for MD040FencedCodeLanguage {
12    fn name(&self) -> &'static str {
13        "MD040"
14    }
15
16    fn description(&self) -> &'static str {
17        "Code blocks should have a language specified"
18    }
19
20    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
21        let content = ctx.content;
22        let _line_index = LineIndex::new(content.to_string());
23
24        let mut warnings = Vec::new();
25
26        let mut in_code_block = false;
27        let mut current_fence_marker: Option<String> = None;
28        let mut opening_fence_indent: usize = 0;
29
30        // Pre-compute disabled state to avoid O(n²) complexity
31        let mut is_disabled = false;
32
33        for (i, line) in content.lines().enumerate() {
34            let trimmed = line.trim();
35
36            // Update disabled state incrementally
37            if let Some(rules) = crate::rule::parse_disable_comment(trimmed)
38                && (rules.is_empty() || rules.contains(&self.name()))
39            {
40                is_disabled = true;
41            }
42            if let Some(rules) = crate::rule::parse_enable_comment(trimmed)
43                && (rules.is_empty() || rules.contains(&self.name()))
44            {
45                is_disabled = false;
46            }
47
48            // Skip processing if rule is disabled
49            if is_disabled {
50                continue;
51            }
52
53            // Determine fence marker if this is a fence line
54            let fence_marker = if trimmed.starts_with("```") {
55                let backtick_count = trimmed.chars().take_while(|&c| c == '`').count();
56                if backtick_count >= 3 {
57                    Some("`".repeat(backtick_count))
58                } else {
59                    None
60                }
61            } else if trimmed.starts_with("~~~") {
62                let tilde_count = trimmed.chars().take_while(|&c| c == '~').count();
63                if tilde_count >= 3 {
64                    Some("~".repeat(tilde_count))
65                } else {
66                    None
67                }
68            } else {
69                None
70            };
71
72            if let Some(fence_marker) = fence_marker {
73                if in_code_block {
74                    // We're inside a code block, check if this closes it
75                    if let Some(ref current_marker) = current_fence_marker {
76                        let current_indent = line.len() - line.trim_start().len();
77                        // Only close if the fence marker exactly matches the opening marker AND has no content after
78                        // AND the indentation is not greater than the opening fence
79                        if fence_marker == *current_marker
80                            && trimmed[current_marker.len()..].trim().is_empty()
81                            && current_indent <= opening_fence_indent
82                        {
83                            // This closes the current code block
84                            in_code_block = false;
85                            current_fence_marker = None;
86                            opening_fence_indent = 0;
87                        }
88                        // else: This is content inside a code block, ignore completely
89                    }
90                } else {
91                    // We're outside a code block, this opens one
92                    // Check if language is specified
93                    let after_fence = trimmed[fence_marker.len()..].trim();
94
95                    // Check if it has MkDocs title attribute but no language
96                    // Pattern: ``` title="Title" (missing language)
97                    // Valid: ```python title="Title" or ```py title="Title"
98                    let has_title_only =
99                        ctx.flavor == crate::config::MarkdownFlavor::MkDocs && after_fence.starts_with("title=");
100
101                    if after_fence.is_empty() || has_title_only {
102                        // Calculate precise character range for the entire fence line that needs a language
103                        let (start_line, start_col, end_line, end_col) = calculate_line_range(i + 1, line);
104
105                        warnings.push(LintWarning {
106                            rule_name: Some(self.name()),
107                            line: start_line,
108                            column: start_col,
109                            end_line,
110                            end_column: end_col,
111                            message: "Code block (```) missing language".to_string(),
112                            severity: Severity::Warning,
113                            fix: Some(Fix {
114                                range: {
115                                    // Replace just the fence marker with fence+language
116                                    let trimmed_start = line.len() - line.trim_start().len();
117                                    let fence_len = fence_marker.len();
118                                    let line_start_byte = ctx.line_offsets.get(i).copied().unwrap_or(0);
119                                    let fence_start_byte = line_start_byte + trimmed_start;
120                                    let fence_end_byte = fence_start_byte + fence_len;
121                                    fence_start_byte..fence_end_byte
122                                },
123                                replacement: format!("{fence_marker}text"),
124                            }),
125                        });
126                    }
127
128                    in_code_block = true;
129                    current_fence_marker = Some(fence_marker);
130                    opening_fence_indent = line.len() - line.trim_start().len();
131                }
132            }
133            // If we're inside a code block and this line is not a fence, ignore it
134        }
135
136        Ok(warnings)
137    }
138
139    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
140        let content = ctx.content;
141        let _line_index = LineIndex::new(content.to_string());
142
143        let mut result = String::new();
144        let mut in_code_block = false;
145        let mut current_fence_marker: Option<String> = None;
146        let mut fence_needs_language = false;
147        let mut original_indent = String::new();
148        let mut opening_fence_indent: usize = 0;
149
150        let lines: Vec<&str> = content.lines().collect();
151
152        // Helper function to check if we're in a nested context
153        let is_in_nested_context = |line_idx: usize| -> bool {
154            // Look for blockquote or list context above this line
155            for i in (0..line_idx).rev() {
156                let line = lines.get(i).unwrap_or(&"");
157                let trimmed = line.trim();
158
159                // If we hit a blank line, check if context continues
160                if trimmed.is_empty() {
161                    continue;
162                }
163
164                // Check for blockquote markers
165                if line.trim_start().starts_with('>') {
166                    return true;
167                }
168
169                // Check for list markers with sufficient indentation
170                if line.len() - line.trim_start().len() >= 2 {
171                    let after_indent = line.trim_start();
172                    if after_indent.starts_with("- ")
173                        || after_indent.starts_with("* ")
174                        || after_indent.starts_with("+ ")
175                        || (after_indent.len() > 2
176                            && after_indent.as_bytes().first().is_some_and(|&b| b.is_ascii_digit())
177                            && after_indent.as_bytes().get(1) == Some(&b'.')
178                            && after_indent.as_bytes().get(2) == Some(&b' '))
179                    {
180                        return true;
181                    }
182                }
183
184                // If we find content that's not indented, we're not in nested context
185                if line.starts_with(|c: char| !c.is_whitespace()) {
186                    break;
187                }
188            }
189            false
190        };
191
192        // Pre-compute disabled state to avoid O(n²) complexity
193        let mut is_disabled = false;
194
195        for (i, line) in lines.iter().enumerate() {
196            let trimmed = line.trim();
197
198            // Update disabled state incrementally
199            if let Some(rules) = crate::rule::parse_disable_comment(trimmed)
200                && (rules.is_empty() || rules.contains(&self.name()))
201            {
202                is_disabled = true;
203            }
204            if let Some(rules) = crate::rule::parse_enable_comment(trimmed)
205                && (rules.is_empty() || rules.contains(&self.name()))
206            {
207                is_disabled = false;
208            }
209
210            // Skip processing if rule is disabled, preserve the line as-is
211            if is_disabled {
212                result.push_str(line);
213                result.push('\n');
214                continue;
215            }
216
217            // Determine fence marker if this is a fence line
218            let fence_marker = if trimmed.starts_with("```") {
219                let backtick_count = trimmed.chars().take_while(|&c| c == '`').count();
220                if backtick_count >= 3 {
221                    Some("`".repeat(backtick_count))
222                } else {
223                    None
224                }
225            } else if trimmed.starts_with("~~~") {
226                let tilde_count = trimmed.chars().take_while(|&c| c == '~').count();
227                if tilde_count >= 3 {
228                    Some("~".repeat(tilde_count))
229                } else {
230                    None
231                }
232            } else {
233                None
234            };
235
236            if let Some(fence_marker) = fence_marker {
237                if in_code_block {
238                    // We're inside a code block, check if this closes it
239                    if let Some(ref current_marker) = current_fence_marker {
240                        let current_indent = line.len() - line.trim_start().len();
241                        if fence_marker == *current_marker
242                            && trimmed[current_marker.len()..].trim().is_empty()
243                            && current_indent <= opening_fence_indent
244                        {
245                            // This closes the current code block
246                            if fence_needs_language {
247                                // Use the same indentation as the opening fence
248                                result.push_str(&format!("{original_indent}{trimmed}\n"));
249                            } else {
250                                // Preserve original line as-is
251                                result.push_str(line);
252                                result.push('\n');
253                            }
254                            in_code_block = false;
255                            current_fence_marker = None;
256                            fence_needs_language = false;
257                            original_indent.clear();
258                            opening_fence_indent = 0;
259                        } else {
260                            // This is content inside a code block (different fence marker) - preserve exactly as-is
261                            result.push_str(line);
262                            result.push('\n');
263                        }
264                    } else {
265                        // This shouldn't happen, but preserve as content
266                        result.push_str(line);
267                        result.push('\n');
268                    }
269                } else {
270                    // We're outside a code block, this opens one
271                    // Capture the original indentation
272                    let line_indent = line[..line.len() - line.trim_start().len()].to_string();
273
274                    // Add 'text' as default language for opening fence if no language specified
275                    let after_fence = trimmed[fence_marker.len()..].trim();
276
277                    // Check if it has MkDocs title attribute but no language
278                    let has_title_only =
279                        ctx.flavor == crate::config::MarkdownFlavor::MkDocs && after_fence.starts_with("title=");
280
281                    if after_fence.is_empty() || has_title_only {
282                        // Decide whether to preserve indentation based on context
283                        let should_preserve_indent = is_in_nested_context(i);
284
285                        if should_preserve_indent {
286                            // Preserve indentation for nested contexts
287                            original_indent = line_indent;
288                            if has_title_only {
289                                // Insert language before title attribute
290                                result.push_str(&format!("{original_indent}{fence_marker}text {after_fence}\n"));
291                            } else {
292                                result.push_str(&format!("{original_indent}{fence_marker}text\n"));
293                            }
294                        } else {
295                            // Remove indentation for standalone code blocks
296                            original_indent = String::new();
297                            if has_title_only {
298                                // Insert language before title attribute
299                                result.push_str(&format!("{fence_marker}text {after_fence}\n"));
300                            } else {
301                                result.push_str(&format!("{fence_marker}text\n"));
302                            }
303                        }
304                        fence_needs_language = true;
305                    } else {
306                        // Keep original line as-is since it already has a language
307                        result.push_str(line);
308                        result.push('\n');
309                        fence_needs_language = false;
310                    }
311
312                    in_code_block = true;
313                    current_fence_marker = Some(fence_marker);
314                    opening_fence_indent = line.len() - line.trim_start().len();
315                }
316            } else if in_code_block {
317                // We're inside a code block and this is not a fence line - preserve exactly as-is
318                result.push_str(line);
319                result.push('\n');
320            } else {
321                // We're outside code blocks and this is not a fence line - preserve as-is
322                result.push_str(line);
323                result.push('\n');
324            }
325        }
326
327        // Remove trailing newline if the original content didn't have one
328        if !content.ends_with('\n') {
329            result.pop();
330        }
331
332        Ok(result)
333    }
334
335    /// Get the category of this rule for selective processing
336    fn category(&self) -> RuleCategory {
337        RuleCategory::CodeBlock
338    }
339
340    /// Check if this rule should be skipped
341    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
342        ctx.content.is_empty() || (!ctx.likely_has_code() && !ctx.has_char('~'))
343    }
344
345    fn as_any(&self) -> &dyn std::any::Any {
346        self
347    }
348
349    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
350    where
351        Self: Sized,
352    {
353        Box::new(MD040FencedCodeLanguage)
354    }
355}
356
// Unit tests for MD040. Every test builds a `LintContext` with the Standard flavor, so the
// MkDocs `title=` handling is not exercised here.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lint_context::LintContext;

    /// Runs `check` on `content` using the Standard Markdown flavor.
    fn run_check(content: &str) -> LintResult {
        let rule = MD040FencedCodeLanguage;
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        rule.check(&ctx)
    }

    /// Runs `fix` on `content` using the Standard Markdown flavor.
    fn run_fix(content: &str) -> Result<String, LintError> {
        let rule = MD040FencedCodeLanguage;
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        rule.fix(&ctx)
    }

    // Fences that name a language produce no warnings.
    #[test]
    fn test_code_blocks_with_language_specified() {
        // Basic test with language
        let content = r#"# Test

```python
print("Hello, world!")
```

```javascript
console.log("Hello!");
```
"#;
        let result = run_check(content).unwrap();
        assert!(result.is_empty(), "No warnings expected for code blocks with language");
    }

    // A bare opening fence is reported once, on the line of the opening fence.
    #[test]
    fn test_code_blocks_without_language() {
        let content = r#"# Test

```
print("Hello, world!")
```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].message, "Code block (```) missing language");
        assert_eq!(result[0].line, 3);
    }

    #[test]
    fn test_code_blocks_with_empty_language() {
        // Test with spaces after the fence
        // NOTE(review): no trailing spaces are visible on the opening fence below - confirm the
        // fixture still contains them, otherwise this duplicates the previous test.
        let content = r#"# Test

```
print("Hello, world!")
```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].message, "Code block (```) missing language");
    }

    #[test]
    fn test_indented_code_blocks_should_be_ignored() {
        // Indented code blocks (4 spaces) should not trigger the rule
        let content = r#"# Test

    This is an indented code block
    It should not trigger MD040
"#;
        let result = run_check(content).unwrap();
        assert!(result.is_empty(), "Indented code blocks should be ignored");
    }

    // Single backticks inside a paragraph are not fences.
    #[test]
    fn test_inline_code_spans_should_be_ignored() {
        let content = r#"# Test

This is `inline code` and should not trigger warnings.

Use the `print()` function.
"#;
        let result = run_check(content).unwrap();
        assert!(result.is_empty(), "Inline code spans should be ignored");
    }

    // Tilde fences follow the same rules as backtick fences; the warning message is the same
    // (backtick-worded) text for both.
    #[test]
    fn test_tildes_vs_backticks_for_fences() {
        // Test tilde fences without language
        let content_tildes_no_lang = r#"# Test

~~~
code here
~~~
"#;
        let result = run_check(content_tildes_no_lang).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].message, "Code block (```) missing language");

        // Test tilde fences with language
        let content_tildes_with_lang = r#"# Test

~~~python
code here
~~~
"#;
        let result = run_check(content_tildes_with_lang).unwrap();
        assert!(result.is_empty());

        // Mixed fences
        let content_mixed = r#"# Test

```python
code here
```

~~~javascript
more code
~~~

```
no language
```

~~~
also no language
~~~
"#;
        let result = run_check(content_mixed).unwrap();
        assert_eq!(result.len(), 2);
    }

    // Anything after the language (attribute lists and the like) is accepted as long as the
    // info string is not empty.
    #[test]
    fn test_language_with_additional_parameters() {
        let content = r#"# Test

```python {highlight=[1,2]}
print("Line 1")
print("Line 2")
```

```javascript {.line-numbers startFrom="10"}
console.log("Hello");
```

```ruby {data-line="1,3-4"}
puts "Hello"
puts "World"
puts "!"
```
"#;
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "Code blocks with language and parameters should pass"
        );
    }

    // Only the blocks without a language are reported, each on its own opening-fence line.
    #[test]
    fn test_multiple_code_blocks_in_document() {
        let content = r#"# Test Document

First block without language:
```
code here
```

Second block with language:
```python
print("hello")
```

Third block without language:
```
more code
```

Fourth block with language:
```javascript
console.log("test");
```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].line, 4);
        assert_eq!(result[1].line, 14);
    }

    // Fences indented under list items are detected at any nesting depth.
    #[test]
    fn test_nested_code_blocks_in_lists() {
        let content = r#"# Test

- Item 1
  ```python
  print("nested with language")
  ```

- Item 2
  ```
  nested without language
  ```

- Item 3
  - Nested item
    ```javascript
    console.log("deeply nested");
    ```

  - Another nested
    ```
    no language
    ```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 2);
        // Check that it detects the blocks without language
        assert_eq!(result[0].line, 9);
        assert_eq!(result[1].line, 20);
    }

    #[test]
    fn test_code_blocks_in_blockquotes() {
        let content = r#"# Test

> This is a blockquote
> ```python
> print("with language")
> ```

> Another blockquote
> ```
> without language
> ```
"#;
        let result = run_check(content).unwrap();
        // The implementation doesn't detect code blocks inside blockquotes
        // (the trimmed line starts with `>`, not with a fence character).
        // This is by design to avoid complexity with nested structures
        assert_eq!(result.len(), 0);
    }

    // `fix` adds `text` to every bare fence and leaves fences with a language alone.
    #[test]
    fn test_fix_method_adds_text_language() {
        let content = r#"# Test

```
code without language
```

```python
already has language
```

```
another block without
```
"#;
        let fixed = run_fix(content).unwrap();
        assert!(fixed.contains("```text"));
        assert!(fixed.contains("```python"));
        assert_eq!(fixed.matches("```text").count(), 2);
    }

    #[test]
    fn test_fix_preserves_indentation() {
        let content = r#"# Test

- List item
  ```
  indented code block
  ```
"#;
        let fixed = run_fix(content).unwrap();
        // `fix` removes fence indentation for standalone blocks and keeps it for nested
        // contexts. The fence here sits under an unindented list item, which `fix` treats as
        // standalone, so only the added language and the untouched content line are asserted -
        // not the indentation of the fence itself.
        assert!(fixed.contains("```text"));
        assert!(fixed.contains("  indented code block"));
    }

    // The fix keeps the fence character: tilde fences get `~~~text`.
    #[test]
    fn test_fix_with_tilde_fences() {
        let content = r#"# Test

~~~
code with tildes
~~~
"#;
        let fixed = run_fix(content).unwrap();
        assert!(fixed.contains("~~~text"));
    }

    // Fences longer than three characters are recognised, and the fix keeps their length.
    #[test]
    fn test_longer_fence_markers() {
        let content = r#"# Test

````
code with four backticks
````

`````python
code with five backticks and language
`````

~~~~~~
code with six tildes
~~~~~~
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 2);

        let fixed = run_fix(content).unwrap();
        assert!(fixed.contains("````text"));
        assert!(fixed.contains("~~~~~~text"));
        assert!(fixed.contains("`````python"));
    }

    // A shorter fence inside a longer one is content of the outer block, not a new block.
    #[test]
    fn test_nested_code_blocks_different_markers() {
        let content = r#"# Test

````markdown
This is a markdown block

```python
# This is nested code
print("hello")
```

More markdown
````
"#;
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "Nested code blocks with different markers should not trigger warnings"
        );
    }

    // Warnings are suppressed between the disable and enable comments and resume afterwards.
    #[test]
    fn test_disable_enable_comments() {
        let content = r#"# Test

<!-- rumdl-disable MD040 -->
```
this should not trigger warning
```
<!-- rumdl-enable MD040 -->

```
this should trigger warning
```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].line, 9);
    }

    #[test]
    fn test_fence_with_language_only_on_closing() {
        // Edge case: language on closing fence should not be interpreted
        // (a fence followed by text does not close the block, so only the opener is reported).
        let content = r#"# Test

```
code
```python
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);
    }

    #[test]
    fn test_incomplete_code_blocks() {
        // Test unclosed code block
        let content = r#"# Test

```python
this code block is not closed"#;
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "Unclosed code blocks with language should not trigger warnings"
        );

        // Test unclosed code block without language
        let content_no_lang = r#"# Test

```
this code block is not closed"#;
        let result = run_check(content_no_lang).unwrap();
        assert_eq!(result.len(), 1);
    }

    // The fixed output ends with a newline exactly when the input did.
    #[test]
    fn test_fix_preserves_original_formatting() {
        let content = r#"# Test

```
code
```

No newline at end"#;
        let fixed = run_fix(content).unwrap();
        assert!(!fixed.ends_with('\n'), "Fix should preserve lack of trailing newline");

        let content_with_newline = "# Test\n\n```\ncode\n```\n";
        let fixed = run_fix(content_with_newline).unwrap();
        assert!(fixed.ends_with('\n'), "Fix should preserve trailing newline");
    }

    // Backticks that do not start the (trimmed) line never count as a fence.
    #[test]
    fn test_edge_case_backticks_in_content() {
        let content = r#"# Test

```javascript
console.log(`template string with backticks`);
// This line has ``` in a comment
```
"#;
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "Backticks inside code blocks should not affect parsing"
        );
    }

    // `check` on an empty document yields no warnings.
    #[test]
    fn test_empty_document() {
        let content = "";
        let result = run_check(content).unwrap();
        assert!(result.is_empty());
    }

    // `should_skip` is true for empty and fence-free documents and false when either
    // backtick or tilde fences are present.
    #[test]
    fn test_should_skip_optimization() {
        let rule = MD040FencedCodeLanguage;

        // Document without code fences should skip
        let ctx = LintContext::new("# Just a header\n\nSome text", crate::config::MarkdownFlavor::Standard);
        assert!(rule.should_skip(&ctx));

        // Document with backtick fences should not skip
        let ctx = LintContext::new("```\ncode\n```", crate::config::MarkdownFlavor::Standard);
        assert!(!rule.should_skip(&ctx));

        // Document with tilde fences should not skip
        let ctx = LintContext::new("~~~\ncode\n~~~", crate::config::MarkdownFlavor::Standard);
        assert!(!rule.should_skip(&ctx));

        // Empty document should skip
        let ctx = LintContext::new("", crate::config::MarkdownFlavor::Standard);
        assert!(rule.should_skip(&ctx));
    }
}