rumdl_lib/rules/
md040_fenced_code_language.rs

1use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
2use crate::utils::range_utils::calculate_line_range;
3
/// Rule MD040: Fenced code blocks should have a language
///
/// Flags opening code fences (runs of three or more backticks or tildes) that
/// carry no info string, and can repair them by appending the default
/// language `text`. The rule is a stateless unit struct.
///
/// See [docs/md040.md](../../docs/md040.md) for full documentation, configuration, and examples.
#[derive(Debug, Default, Clone)]
pub struct MD040FencedCodeLanguage;
10
11impl Rule for MD040FencedCodeLanguage {
12    fn name(&self) -> &'static str {
13        "MD040"
14    }
15
16    fn description(&self) -> &'static str {
17        "Code blocks should have a language specified"
18    }
19
20    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
21        let content = ctx.content;
22        let _line_index = &ctx.line_index;
23
24        let mut warnings = Vec::new();
25
26        let mut in_code_block = false;
27        let mut current_fence_marker: Option<String> = None;
28        let mut opening_fence_indent: usize = 0;
29
30        // Pre-compute disabled state to avoid O(n²) complexity
31        let mut is_disabled = false;
32
33        for (i, line) in content.lines().enumerate() {
34            let trimmed = line.trim();
35
36            // Update disabled state incrementally
37            if let Some(rules) = crate::rule::parse_disable_comment(trimmed)
38                && (rules.is_empty() || rules.contains(&self.name()))
39            {
40                is_disabled = true;
41            }
42            if let Some(rules) = crate::rule::parse_enable_comment(trimmed)
43                && (rules.is_empty() || rules.contains(&self.name()))
44            {
45                is_disabled = false;
46            }
47
48            // Skip processing if rule is disabled
49            if is_disabled {
50                continue;
51            }
52
53            // Determine fence marker if this is a fence line
54            let fence_marker = if trimmed.starts_with("```") {
55                let backtick_count = trimmed.chars().take_while(|&c| c == '`').count();
56                if backtick_count >= 3 {
57                    Some("`".repeat(backtick_count))
58                } else {
59                    None
60                }
61            } else if trimmed.starts_with("~~~") {
62                let tilde_count = trimmed.chars().take_while(|&c| c == '~').count();
63                if tilde_count >= 3 {
64                    Some("~".repeat(tilde_count))
65                } else {
66                    None
67                }
68            } else {
69                None
70            };
71
72            if let Some(fence_marker) = fence_marker {
73                if in_code_block {
74                    // We're inside a code block, check if this closes it
75                    if let Some(ref current_marker) = current_fence_marker {
76                        let current_indent = line.len() - line.trim_start().len();
77                        // Only close if the fence marker exactly matches the opening marker AND has no content after
78                        // AND the indentation is not greater than the opening fence
79                        if fence_marker == *current_marker
80                            && trimmed[current_marker.len()..].trim().is_empty()
81                            && current_indent <= opening_fence_indent
82                        {
83                            // This closes the current code block
84                            in_code_block = false;
85                            current_fence_marker = None;
86                            opening_fence_indent = 0;
87                        }
88                        // else: This is content inside a code block, ignore completely
89                    }
90                } else {
91                    // We're outside a code block, this opens one
92                    // Check if language is specified
93                    let after_fence = trimmed[fence_marker.len()..].trim();
94
95                    // Check if it has MkDocs title attribute but no language
96                    // Pattern: ``` title="Title" (missing language)
97                    // Valid: ```python title="Title" or ```py title="Title"
98                    let has_title_only =
99                        ctx.flavor == crate::config::MarkdownFlavor::MkDocs && after_fence.starts_with("title=");
100
101                    // Check for Quarto/RMarkdown code chunk syntax: {language} or {language, options}
102                    // Examples: ```{python}, ```{r}, ```{r, echo=FALSE}
103                    let has_quarto_syntax = ctx.flavor == crate::config::MarkdownFlavor::Quarto
104                        && after_fence.starts_with('{')
105                        && after_fence.contains('}');
106
107                    if (after_fence.is_empty() || has_title_only) && !has_quarto_syntax {
108                        // Calculate precise character range for the entire fence line that needs a language
109                        let (start_line, start_col, end_line, end_col) = calculate_line_range(i + 1, line);
110
111                        warnings.push(LintWarning {
112                            rule_name: Some(self.name().to_string()),
113                            line: start_line,
114                            column: start_col,
115                            end_line,
116                            end_column: end_col,
117                            message: "Code block (```) missing language".to_string(),
118                            severity: Severity::Warning,
119                            fix: Some(Fix {
120                                range: {
121                                    // Replace just the fence marker with fence+language
122                                    let trimmed_start = line.len() - line.trim_start().len();
123                                    let fence_len = fence_marker.len();
124                                    let line_start_byte = ctx.line_offsets.get(i).copied().unwrap_or(0);
125                                    let fence_start_byte = line_start_byte + trimmed_start;
126                                    let fence_end_byte = fence_start_byte + fence_len;
127                                    fence_start_byte..fence_end_byte
128                                },
129                                replacement: format!("{fence_marker}text"),
130                            }),
131                        });
132                    }
133
134                    in_code_block = true;
135                    current_fence_marker = Some(fence_marker);
136                    opening_fence_indent = line.len() - line.trim_start().len();
137                }
138            }
139            // If we're inside a code block and this line is not a fence, ignore it
140        }
141
142        Ok(warnings)
143    }
144
145    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
146        let content = ctx.content;
147        let _line_index = &ctx.line_index;
148
149        let mut result = String::new();
150        let mut in_code_block = false;
151        let mut current_fence_marker: Option<String> = None;
152        let mut fence_needs_language = false;
153        let mut original_indent = String::new();
154        let mut opening_fence_indent: usize = 0;
155
156        let lines: Vec<&str> = content.lines().collect();
157
158        // Pre-compute disabled state to avoid O(n²) complexity
159        let mut is_disabled = false;
160
161        for line in lines.iter() {
162            let trimmed = line.trim();
163
164            // Update disabled state incrementally
165            if let Some(rules) = crate::rule::parse_disable_comment(trimmed)
166                && (rules.is_empty() || rules.contains(&self.name()))
167            {
168                is_disabled = true;
169            }
170            if let Some(rules) = crate::rule::parse_enable_comment(trimmed)
171                && (rules.is_empty() || rules.contains(&self.name()))
172            {
173                is_disabled = false;
174            }
175
176            // Skip processing if rule is disabled, preserve the line as-is
177            if is_disabled {
178                result.push_str(line);
179                result.push('\n');
180                continue;
181            }
182
183            // Determine fence marker if this is a fence line
184            let fence_marker = if trimmed.starts_with("```") {
185                let backtick_count = trimmed.chars().take_while(|&c| c == '`').count();
186                if backtick_count >= 3 {
187                    Some("`".repeat(backtick_count))
188                } else {
189                    None
190                }
191            } else if trimmed.starts_with("~~~") {
192                let tilde_count = trimmed.chars().take_while(|&c| c == '~').count();
193                if tilde_count >= 3 {
194                    Some("~".repeat(tilde_count))
195                } else {
196                    None
197                }
198            } else {
199                None
200            };
201
202            if let Some(fence_marker) = fence_marker {
203                if in_code_block {
204                    // We're inside a code block, check if this closes it
205                    if let Some(ref current_marker) = current_fence_marker {
206                        let current_indent = line.len() - line.trim_start().len();
207                        if fence_marker == *current_marker
208                            && trimmed[current_marker.len()..].trim().is_empty()
209                            && current_indent <= opening_fence_indent
210                        {
211                            // This closes the current code block
212                            if fence_needs_language {
213                                // Use the same indentation as the opening fence
214                                result.push_str(&format!("{original_indent}{trimmed}\n"));
215                            } else {
216                                // Preserve original line as-is
217                                result.push_str(line);
218                                result.push('\n');
219                            }
220                            in_code_block = false;
221                            current_fence_marker = None;
222                            fence_needs_language = false;
223                            original_indent.clear();
224                            opening_fence_indent = 0;
225                        } else {
226                            // This is content inside a code block (different fence marker) - preserve exactly as-is
227                            result.push_str(line);
228                            result.push('\n');
229                        }
230                    } else {
231                        // This shouldn't happen, but preserve as content
232                        result.push_str(line);
233                        result.push('\n');
234                    }
235                } else {
236                    // We're outside a code block, this opens one
237                    // Capture the original indentation
238                    let line_indent = line[..line.len() - line.trim_start().len()].to_string();
239
240                    // Add 'text' as default language for opening fence if no language specified
241                    let after_fence = trimmed[fence_marker.len()..].trim();
242
243                    // Check if it has MkDocs title attribute but no language
244                    let has_title_only =
245                        ctx.flavor == crate::config::MarkdownFlavor::MkDocs && after_fence.starts_with("title=");
246
247                    // Check for Quarto/RMarkdown code chunk syntax: {language} or {language, options}
248                    let has_quarto_syntax = ctx.flavor == crate::config::MarkdownFlavor::Quarto
249                        && after_fence.starts_with('{')
250                        && after_fence.contains('}');
251
252                    if (after_fence.is_empty() || has_title_only) && !has_quarto_syntax {
253                        // Always preserve the original indentation - adding a language tag should not change indentation
254                        original_indent = line_indent;
255                        if has_title_only {
256                            // Insert language before title attribute
257                            result.push_str(&format!("{original_indent}{fence_marker}text {after_fence}\n"));
258                        } else {
259                            result.push_str(&format!("{original_indent}{fence_marker}text\n"));
260                        }
261                        fence_needs_language = true;
262                    } else {
263                        // Keep original line as-is since it already has a language
264                        result.push_str(line);
265                        result.push('\n');
266                        fence_needs_language = false;
267                    }
268
269                    in_code_block = true;
270                    current_fence_marker = Some(fence_marker);
271                    opening_fence_indent = line.len() - line.trim_start().len();
272                }
273            } else if in_code_block {
274                // We're inside a code block and this is not a fence line - preserve exactly as-is
275                result.push_str(line);
276                result.push('\n');
277            } else {
278                // We're outside code blocks and this is not a fence line - preserve as-is
279                result.push_str(line);
280                result.push('\n');
281            }
282        }
283
284        // Remove trailing newline if the original content didn't have one
285        if !content.ends_with('\n') {
286            result.pop();
287        }
288
289        Ok(result)
290    }
291
292    /// Get the category of this rule for selective processing
293    fn category(&self) -> RuleCategory {
294        RuleCategory::CodeBlock
295    }
296
297    /// Check if this rule should be skipped
298    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
299        ctx.content.is_empty() || (!ctx.likely_has_code() && !ctx.has_char('~'))
300    }
301
302    fn as_any(&self) -> &dyn std::any::Any {
303        self
304    }
305
306    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
307    where
308        Self: Sized,
309    {
310        Box::new(MD040FencedCodeLanguage)
311    }
312}
313
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lint_context::LintContext;

    /// Runs `check` on `content` using the standard markdown flavor.
    fn run_check(content: &str) -> LintResult {
        let rule = MD040FencedCodeLanguage;
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        rule.check(&ctx)
    }

    /// Runs `fix` on `content` using the standard markdown flavor.
    fn run_fix(content: &str) -> Result<String, LintError> {
        let rule = MD040FencedCodeLanguage;
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        rule.fix(&ctx)
    }

    #[test]
    fn test_code_blocks_with_language_specified() {
        // Basic test with language
        let content = r#"# Test

```python
print("Hello, world!")
```

```javascript
console.log("Hello!");
```
"#;
        let result = run_check(content).unwrap();
        assert!(result.is_empty(), "No warnings expected for code blocks with language");
    }

    #[test]
    fn test_code_blocks_without_language() {
        // A bare fence must be reported on the line of the opening fence
        let content = r#"# Test

```
print("Hello, world!")
```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].message, "Code block (```) missing language");
        assert_eq!(result[0].line, 3);
    }

    #[test]
    fn test_code_blocks_with_empty_language() {
        // Test with spaces after the fence: the info string is still empty.
        // Written with explicit escapes so the trailing spaces cannot be
        // stripped by an editor or formatter.
        let content = "# Test\n\n```   \nprint(\"Hello, world!\")\n```\n";
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].message, "Code block (```) missing language");
        assert_eq!(result[0].line, 3);
    }

    #[test]
    fn test_indented_code_blocks_should_be_ignored() {
        // Indented code blocks (4 spaces) should not trigger the rule
        let content = r#"# Test

    This is an indented code block
    It should not trigger MD040
"#;
        let result = run_check(content).unwrap();
        assert!(result.is_empty(), "Indented code blocks should be ignored");
    }

    #[test]
    fn test_inline_code_spans_should_be_ignored() {
        // Single-backtick spans are not fences
        let content = r#"# Test

This is `inline code` and should not trigger warnings.

Use the `print()` function.
"#;
        let result = run_check(content).unwrap();
        assert!(result.is_empty(), "Inline code spans should be ignored");
    }

    #[test]
    fn test_tildes_vs_backticks_for_fences() {
        // Test tilde fences without language
        let content_tildes_no_lang = r#"# Test

~~~
code here
~~~
"#;
        let result = run_check(content_tildes_no_lang).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].message, "Code block (```) missing language");

        // Test tilde fences with language
        let content_tildes_with_lang = r#"# Test

~~~python
code here
~~~
"#;
        let result = run_check(content_tildes_with_lang).unwrap();
        assert!(result.is_empty());

        // Mixed fences
        let content_mixed = r#"# Test

```python
code here
```

~~~javascript
more code
~~~

```
no language
```

~~~
also no language
~~~
"#;
        let result = run_check(content_mixed).unwrap();
        assert_eq!(result.len(), 2);
    }

    #[test]
    fn test_language_with_additional_parameters() {
        // Attributes after the language do not hide the language
        let content = r#"# Test

```python {highlight=[1,2]}
print("Line 1")
print("Line 2")
```

```javascript {.line-numbers startFrom="10"}
console.log("Hello");
```

```ruby {data-line="1,3-4"}
puts "Hello"
puts "World"
puts "!"
```
"#;
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "Code blocks with language and parameters should pass"
        );
    }

    #[test]
    fn test_multiple_code_blocks_in_document() {
        // Only the blocks without a language are reported, in document order
        let content = r#"# Test Document

First block without language:
```
code here
```

Second block with language:
```python
print("hello")
```

Third block without language:
```
more code
```

Fourth block with language:
```javascript
console.log("test");
```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].line, 4);
        assert_eq!(result[1].line, 14);
    }

    #[test]
    fn test_nested_code_blocks_in_lists() {
        // Fences indented inside list items are still recognised
        let content = r#"# Test

- Item 1
  ```python
  print("nested with language")
  ```

- Item 2
  ```
  nested without language
  ```

- Item 3
  - Nested item
    ```javascript
    console.log("deeply nested");
    ```

  - Another nested
    ```
    no language
    ```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 2);
        // Check that it detects the blocks without language
        assert_eq!(result[0].line, 9);
        assert_eq!(result[1].line, 20);
    }

    #[test]
    fn test_code_blocks_in_blockquotes() {
        let content = r#"# Test

> This is a blockquote
> ```python
> print("with language")
> ```

> Another blockquote
> ```
> without language
> ```
"#;
        let result = run_check(content).unwrap();
        // The implementation doesn't detect code blocks inside blockquotes
        // This is by design to avoid complexity with nested structures
        assert_eq!(result.len(), 0);
    }

    #[test]
    fn test_fix_method_adds_text_language() {
        // Only the two bare fences receive the default language
        let content = r#"# Test

```
code without language
```

```python
already has language
```

```
another block without
```
"#;
        let fixed = run_fix(content).unwrap();
        assert!(fixed.contains("```text"));
        assert!(fixed.contains("```python"));
        assert_eq!(fixed.matches("```text").count(), 2);
    }

    #[test]
    fn test_fix_preserves_indentation() {
        let content = r#"# Test

- List item
  ```
  indented code block
  ```
"#;
        let fixed = run_fix(content).unwrap();
        // Should preserve indentation for list items
        assert!(fixed.contains("  ```text"));
        assert!(fixed.contains("  indented code block"));
    }

    #[test]
    fn test_fix_preserves_indentation_numbered_list() {
        // Test case from issue #122
        let content = r#"1. Step 1

    ```
    foo
    bar
    ```
"#;
        let fixed = run_fix(content).unwrap();
        // Should preserve 4-space indentation for numbered list content
        assert!(fixed.contains("    ```text"));
        assert!(fixed.contains("    foo"));
        assert!(fixed.contains("    bar"));
        // Should not remove indentation
        assert!(!fixed.contains("\n```text\n"));
    }

    #[test]
    fn test_fix_preserves_all_indentation() {
        let content = r#"# Test

Top-level code block:
```
top level
```

1. List item

    ```
    nested in list
    ```

Indented by 2 spaces:
  ```
  content
  ```
"#;
        let fixed = run_fix(content).unwrap();

        // All indentation should be preserved exactly as-is
        assert!(
            fixed.contains("```text\ntop level"),
            "Top-level code block indentation preserved"
        );
        assert!(
            fixed.contains("    ```text\n    nested in list"),
            "List item code block indentation preserved"
        );
        assert!(
            fixed.contains("  ```text\n  content"),
            "2-space indented code block indentation preserved"
        );
    }

    #[test]
    fn test_fix_with_tilde_fences() {
        // The fix keeps the tilde marker and appends the default language
        let content = r#"# Test

~~~
code with tildes
~~~
"#;
        let fixed = run_fix(content).unwrap();
        assert!(fixed.contains("~~~text"));
    }

    #[test]
    fn test_longer_fence_markers() {
        // Fences longer than three characters keep their full length
        let content = r#"# Test

````
code with four backticks
````

`````python
code with five backticks and language
`````

~~~~~~
code with six tildes
~~~~~~
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 2);

        let fixed = run_fix(content).unwrap();
        assert!(fixed.contains("````text"));
        assert!(fixed.contains("~~~~~~text"));
        assert!(fixed.contains("`````python"));
    }

    #[test]
    fn test_nested_code_blocks_different_markers() {
        // A shorter fence inside a longer one is content, not a new block
        let content = r#"# Test

````markdown
This is a markdown block

```python
# This is nested code
print("hello")
```

More markdown
````
"#;
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "Nested code blocks with different markers should not trigger warnings"
        );
    }

    #[test]
    fn test_disable_enable_comments() {
        // Only the block after the enable comment is reported
        let content = r#"# Test

<!-- rumdl-disable MD040 -->
```
this should not trigger warning
```
<!-- rumdl-enable MD040 -->

```
this should trigger warning
```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].line, 9);
    }

    #[test]
    fn test_fence_with_language_only_on_closing() {
        // Edge case: language on closing fence should not be interpreted
        let content = r#"# Test

```
code
```python
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);
    }

    #[test]
    fn test_incomplete_code_blocks() {
        // Test unclosed code block
        let content = r#"# Test

```python
this code block is not closed"#;
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "Unclosed code blocks with language should not trigger warnings"
        );

        // Test unclosed code block without language
        let content_no_lang = r#"# Test

```
this code block is not closed"#;
        let result = run_check(content_no_lang).unwrap();
        assert_eq!(result.len(), 1);
    }

    #[test]
    fn test_fix_preserves_original_formatting() {
        // Presence or absence of the final newline must survive the fix
        let content = r#"# Test

```
code
```

No newline at end"#;
        let fixed = run_fix(content).unwrap();
        assert!(!fixed.ends_with('\n'), "Fix should preserve lack of trailing newline");

        let content_with_newline = "# Test\n\n```\ncode\n```\n";
        let fixed = run_fix(content_with_newline).unwrap();
        assert!(fixed.ends_with('\n'), "Fix should preserve trailing newline");
    }

    #[test]
    fn test_edge_case_backticks_in_content() {
        // Backticks that do not start the line are never fences
        let content = r#"# Test

```javascript
console.log(`template string with backticks`);
// This line has ``` in a comment
```
"#;
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "Backticks inside code blocks should not affect parsing"
        );
    }

    #[test]
    fn test_empty_document() {
        let content = "";
        let result = run_check(content).unwrap();
        assert!(result.is_empty());
    }

    #[test]
    fn test_should_skip_optimization() {
        let rule = MD040FencedCodeLanguage;

        // Document without code fences should skip
        let ctx = LintContext::new("# Just a header\n\nSome text", crate::config::MarkdownFlavor::Standard);
        assert!(rule.should_skip(&ctx));

        // Document with backtick fences should not skip
        let ctx = LintContext::new("```\ncode\n```", crate::config::MarkdownFlavor::Standard);
        assert!(!rule.should_skip(&ctx));

        // Document with tilde fences should not skip
        let ctx = LintContext::new("~~~\ncode\n~~~", crate::config::MarkdownFlavor::Standard);
        assert!(!rule.should_skip(&ctx));

        // Empty document should skip
        let ctx = LintContext::new("", crate::config::MarkdownFlavor::Standard);
        assert!(rule.should_skip(&ctx));
    }

    #[test]
    fn test_quarto_code_chunk_syntax() {
        let rule = MD040FencedCodeLanguage;

        // Test Quarto {r} syntax - should NOT trigger warning
        let content = r#"# Test

```{r}
x <- 1
```

```{python}
x = 1
```

```{r, echo=FALSE}
plot(x)
```
"#;
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Quarto);
        let result = rule.check(&ctx).unwrap();
        assert!(
            result.is_empty(),
            "Quarto code chunks with {{language}} syntax should not trigger warnings"
        );

        // Test that missing language DOES trigger warning for Quarto
        let content_no_lang = r#"# Test

```
code without language
```
"#;
        let ctx = LintContext::new(content_no_lang, crate::config::MarkdownFlavor::Quarto);
        let result = rule.check(&ctx).unwrap();
        assert_eq!(result.len(), 1, "Quarto files without language should trigger warning");

        // Test that standard flavor still requires standard language syntax
        let content_standard = r#"# Test

```{python}
code
```
"#;
        let ctx = LintContext::new(content_standard, crate::config::MarkdownFlavor::Standard);
        let result = rule.check(&ctx).unwrap();
        // In standard flavor, {python} is considered "after_fence" content, so it's valid
        // The fence marker is "```" and after_fence is "{python}", which is non-empty
        assert!(
            result.is_empty(),
            "Standard flavor should accept any non-empty after_fence content"
        );
    }
}