// rumdl_lib/rules/md040_fenced_code_language.rs

1use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
2use crate::utils::range_utils::calculate_line_range;
3use pulldown_cmark::{CodeBlockKind, Event, Options, Parser, Tag};
4
/// Opening-fence information for one fenced code block found in a document.
#[derive(Debug, Clone, PartialEq, Eq)]
struct FencedCodeBlock {
    /// 0-indexed line number where the code block starts
    line_idx: usize,
    /// The language/info string (empty if no language specified)
    language: String,
    /// The fence marker used (``` or ~~~)
    fence_marker: String,
}

/// Rule MD040: Fenced code blocks should have a language
///
/// See [docs/md040.md](../../docs/md040.md) for full documentation, configuration, and examples.
#[derive(Debug, Default, Clone)]
pub struct MD040FencedCodeLanguage;
19
20impl Rule for MD040FencedCodeLanguage {
21    fn name(&self) -> &'static str {
22        "MD040"
23    }
24
25    fn description(&self) -> &'static str {
26        "Code blocks should have a language specified"
27    }
28
29    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
30        let content = ctx.content;
31        let mut warnings = Vec::new();
32
33        // Use pulldown-cmark to detect fenced code blocks with language info
34        let fenced_blocks = detect_fenced_code_blocks(content, &ctx.line_offsets);
35
36        // Pre-compute disabled ranges for efficient lookup
37        let disabled_ranges = compute_disabled_ranges(content, self.name());
38
39        for block in fenced_blocks {
40            // Skip if this line is in a disabled range
41            if is_line_disabled(&disabled_ranges, block.line_idx) {
42                continue;
43            }
44
45            // Get the actual line content for additional checks
46            let line = content.lines().nth(block.line_idx).unwrap_or("");
47            let trimmed = line.trim();
48            let after_fence = trimmed.strip_prefix(&block.fence_marker).unwrap_or("").trim();
49
50            // Check if it has MkDocs title attribute but no language
51            let has_title_only =
52                ctx.flavor == crate::config::MarkdownFlavor::MkDocs && after_fence.starts_with("title=");
53
54            // Check for Quarto/RMarkdown code chunk syntax: {language} or {language, options}
55            let has_quarto_syntax = ctx.flavor == crate::config::MarkdownFlavor::Quarto
56                && after_fence.starts_with('{')
57                && after_fence.contains('}');
58
59            // Warn if no language and not using special syntax
60            if (block.language.is_empty() || has_title_only) && !has_quarto_syntax {
61                let (start_line, start_col, end_line, end_col) = calculate_line_range(block.line_idx + 1, line);
62
63                warnings.push(LintWarning {
64                    rule_name: Some(self.name().to_string()),
65                    line: start_line,
66                    column: start_col,
67                    end_line,
68                    end_column: end_col,
69                    message: "Code block (```) missing language".to_string(),
70                    severity: Severity::Warning,
71                    fix: Some(Fix {
72                        range: {
73                            let trimmed_start = line.len() - line.trim_start().len();
74                            let fence_len = block.fence_marker.len();
75                            let line_start_byte = ctx.line_offsets.get(block.line_idx).copied().unwrap_or(0);
76                            let fence_start_byte = line_start_byte + trimmed_start;
77                            let fence_end_byte = fence_start_byte + fence_len;
78                            fence_start_byte..fence_end_byte
79                        },
80                        replacement: format!("{}text", block.fence_marker),
81                    }),
82                });
83            }
84        }
85
86        Ok(warnings)
87    }
88
89    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
90        let content = ctx.content;
91
92        // Use pulldown-cmark to detect fenced code blocks
93        let fenced_blocks = detect_fenced_code_blocks(content, &ctx.line_offsets);
94
95        // Pre-compute disabled ranges
96        let disabled_ranges = compute_disabled_ranges(content, self.name());
97
98        // Build a set of line indices that need fixing
99        let mut lines_to_fix: std::collections::HashMap<usize, (&str, bool)> = std::collections::HashMap::new();
100
101        for block in &fenced_blocks {
102            if is_line_disabled(&disabled_ranges, block.line_idx) {
103                continue;
104            }
105
106            let line = content.lines().nth(block.line_idx).unwrap_or("");
107            let trimmed = line.trim();
108            let after_fence = trimmed.strip_prefix(&block.fence_marker).unwrap_or("").trim();
109
110            let has_title_only =
111                ctx.flavor == crate::config::MarkdownFlavor::MkDocs && after_fence.starts_with("title=");
112
113            let has_quarto_syntax = ctx.flavor == crate::config::MarkdownFlavor::Quarto
114                && after_fence.starts_with('{')
115                && after_fence.contains('}');
116
117            if (block.language.is_empty() || has_title_only) && !has_quarto_syntax {
118                lines_to_fix.insert(block.line_idx, (&block.fence_marker, has_title_only));
119            }
120        }
121
122        // Build the result by iterating through lines
123        let mut result = String::new();
124        for (i, line) in content.lines().enumerate() {
125            if let Some(&(fence_marker, has_title_only)) = lines_to_fix.get(&i) {
126                let indent = &line[..line.len() - line.trim_start().len()];
127                let trimmed = line.trim();
128                let after_fence = trimmed.strip_prefix(fence_marker).unwrap_or("").trim();
129
130                if has_title_only {
131                    result.push_str(&format!("{indent}{fence_marker}text {after_fence}\n"));
132                } else {
133                    result.push_str(&format!("{indent}{fence_marker}text\n"));
134                }
135            } else {
136                result.push_str(line);
137                result.push('\n');
138            }
139        }
140
141        // Remove trailing newline if the original content didn't have one
142        if !content.ends_with('\n') {
143            result.pop();
144        }
145
146        Ok(result)
147    }
148
149    /// Get the category of this rule for selective processing
150    fn category(&self) -> RuleCategory {
151        RuleCategory::CodeBlock
152    }
153
154    /// Check if this rule should be skipped
155    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
156        ctx.content.is_empty() || (!ctx.likely_has_code() && !ctx.has_char('~'))
157    }
158
159    fn as_any(&self) -> &dyn std::any::Any {
160        self
161    }
162
163    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
164    where
165        Self: Sized,
166    {
167        Box::new(MD040FencedCodeLanguage)
168    }
169}
170
171/// Detect fenced code blocks using pulldown-cmark, returning info about each block's opening fence
172fn detect_fenced_code_blocks(content: &str, line_offsets: &[usize]) -> Vec<FencedCodeBlock> {
173    let mut blocks = Vec::new();
174    let options = Options::all();
175    let parser = Parser::new_ext(content, options).into_offset_iter();
176
177    for (event, range) in parser {
178        if let Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(info))) = event {
179            // Find the line index for this byte offset
180            let line_idx = line_offsets
181                .iter()
182                .enumerate()
183                .rev()
184                .find(|&(_, offset)| *offset <= range.start)
185                .map(|(idx, _)| idx)
186                .unwrap_or(0);
187
188            // Determine fence marker from the actual line content
189            let line = content.lines().nth(line_idx).unwrap_or("");
190            let trimmed = line.trim();
191            let fence_marker = if trimmed.starts_with('`') {
192                let count = trimmed.chars().take_while(|&c| c == '`').count();
193                "`".repeat(count)
194            } else if trimmed.starts_with('~') {
195                let count = trimmed.chars().take_while(|&c| c == '~').count();
196                "~".repeat(count)
197            } else {
198                "```".to_string() // Fallback
199            };
200
201            // Extract just the language (first word of info string)
202            let language = info.split_whitespace().next().unwrap_or("").to_string();
203
204            blocks.push(FencedCodeBlock {
205                line_idx,
206                language,
207                fence_marker,
208            });
209        }
210    }
211
212    blocks
213}
214
215/// Compute disabled line ranges from disable/enable comments
216fn compute_disabled_ranges(content: &str, rule_name: &str) -> Vec<(usize, usize)> {
217    let mut ranges = Vec::new();
218    let mut disabled_start: Option<usize> = None;
219
220    for (i, line) in content.lines().enumerate() {
221        let trimmed = line.trim();
222
223        if let Some(rules) = crate::rule::parse_disable_comment(trimmed)
224            && (rules.is_empty() || rules.contains(&rule_name))
225            && disabled_start.is_none()
226        {
227            disabled_start = Some(i);
228        }
229
230        if let Some(rules) = crate::rule::parse_enable_comment(trimmed)
231            && (rules.is_empty() || rules.contains(&rule_name))
232            && let Some(start) = disabled_start.take()
233        {
234            ranges.push((start, i));
235        }
236    }
237
238    // Handle unclosed disable
239    if let Some(start) = disabled_start {
240        ranges.push((start, usize::MAX));
241    }
242
243    ranges
244}
245
/// Report whether `line_idx` lies inside any of the half-open spans `[start, end)`.
fn is_line_disabled(ranges: &[(usize, usize)], line_idx: usize) -> bool {
    for &(first, past_last) in ranges {
        if (first..past_last).contains(&line_idx) {
            return true;
        }
    }
    false
}
250
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::MarkdownFlavor;
    use crate::lint_context::LintContext;

    /// Run `check` on `content` under the given flavor.
    fn run_check_with(content: &str, flavor: MarkdownFlavor) -> LintResult {
        let rule = MD040FencedCodeLanguage;
        let ctx = LintContext::new(content, flavor, None);
        rule.check(&ctx)
    }

    /// Run `fix` on `content` under the given flavor.
    fn run_fix_with(content: &str, flavor: MarkdownFlavor) -> Result<String, LintError> {
        let rule = MD040FencedCodeLanguage;
        let ctx = LintContext::new(content, flavor, None);
        rule.fix(&ctx)
    }

    /// Run `check` on `content` with the standard flavor.
    fn run_check(content: &str) -> LintResult {
        run_check_with(content, MarkdownFlavor::Standard)
    }

    /// Run `fix` on `content` with the standard flavor.
    fn run_fix(content: &str) -> Result<String, LintError> {
        run_fix_with(content, MarkdownFlavor::Standard)
    }

    /// Evaluate `should_skip` for `content` with the standard flavor.
    fn skips(content: &str) -> bool {
        let rule = MD040FencedCodeLanguage;
        let ctx = LintContext::new(content, MarkdownFlavor::Standard, None);
        rule.should_skip(&ctx)
    }

    #[test]
    fn test_code_blocks_with_language_specified() {
        // Basic test with language
        let content = r#"# Test

```python
print("Hello, world!")
```

```javascript
console.log("Hello!");
```
"#;
        let result = run_check(content).unwrap();
        assert!(result.is_empty(), "No warnings expected for code blocks with language");
    }

    #[test]
    fn test_code_blocks_without_language() {
        let content = r#"# Test

```
print("Hello, world!")
```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].message, "Code block (```) missing language");
        assert_eq!(result[0].line, 3);
    }

    #[test]
    fn test_code_blocks_with_empty_language() {
        // Test with spaces after the fence. The spaces are written with an escaped
        // string so they cannot be lost to editors that trim trailing whitespace.
        let content = "# Test\n\n```   \nprint(\"Hello, world!\")\n```\n";
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].message, "Code block (```) missing language");
    }

    #[test]
    fn test_indented_code_blocks_should_be_ignored() {
        // Indented code blocks (4 spaces) should not trigger the rule
        let content = r#"# Test

    This is an indented code block
    It should not trigger MD040
"#;
        let result = run_check(content).unwrap();
        assert!(result.is_empty(), "Indented code blocks should be ignored");
    }

    #[test]
    fn test_inline_code_spans_should_be_ignored() {
        let content = r#"# Test

This is `inline code` and should not trigger warnings.

Use the `print()` function.
"#;
        let result = run_check(content).unwrap();
        assert!(result.is_empty(), "Inline code spans should be ignored");
    }

    #[test]
    fn test_tildes_vs_backticks_for_fences() {
        // Test tilde fences without language
        let content_tildes_no_lang = r#"# Test

~~~
code here
~~~
"#;
        let result = run_check(content_tildes_no_lang).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].message, "Code block (```) missing language");

        // Test tilde fences with language
        let content_tildes_with_lang = r#"# Test

~~~python
code here
~~~
"#;
        let result = run_check(content_tildes_with_lang).unwrap();
        assert!(result.is_empty());

        // Mixed fences
        let content_mixed = r#"# Test

```python
code here
```

~~~javascript
more code
~~~

```
no language
```

~~~
also no language
~~~
"#;
        let result = run_check(content_mixed).unwrap();
        assert_eq!(result.len(), 2);
    }

    #[test]
    fn test_language_with_additional_parameters() {
        let content = r#"# Test

```python {highlight=[1,2]}
print("Line 1")
print("Line 2")
```

```javascript {.line-numbers startFrom="10"}
console.log("Hello");
```

```ruby {data-line="1,3-4"}
puts "Hello"
puts "World"
puts "!"
```
"#;
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "Code blocks with language and parameters should pass"
        );
    }

    #[test]
    fn test_multiple_code_blocks_in_document() {
        let content = r#"# Test Document

First block without language:
```
code here
```

Second block with language:
```python
print("hello")
```

Third block without language:
```
more code
```

Fourth block with language:
```javascript
console.log("test");
```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].line, 4);
        assert_eq!(result[1].line, 14);
    }

    #[test]
    fn test_nested_code_blocks_in_lists() {
        let content = r#"# Test

- Item 1
  ```python
  print("nested with language")
  ```

- Item 2
  ```
  nested without language
  ```

- Item 3
  - Nested item
    ```javascript
    console.log("deeply nested");
    ```

  - Another nested
    ```
    no language
    ```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 2);
        // Check that it detects the blocks without language
        assert_eq!(result[0].line, 9);
        assert_eq!(result[1].line, 20);
    }

    #[test]
    fn test_issue_257_list_indented_code_block_with_language() {
        // Issue #257: MD040 incorrectly flagged closing fence as needing language
        // when code block was inside a list item
        let content = r#"- Sample code:
- ```java
      List<Map<String,String>> inputs = new List<Map<String,String>>();
  ```
"#;
        // Should produce NO warnings - the code block has a language
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "List-indented code block with language should not trigger MD040. Got: {result:?}",
        );

        // Fix should NOT modify the content at all
        let fixed = run_fix(content).unwrap();
        assert_eq!(
            fixed, content,
            "Fix should not modify code blocks that already have a language"
        );
        // Specifically verify no `text` was added to closing fence
        assert!(
            !fixed.contains("```text"),
            "Fix should not add 'text' to closing fence of code block with language"
        );
    }

    #[test]
    fn test_issue_257_multiple_list_indented_blocks() {
        // Extended test for issue #257 with multiple scenarios
        let content = r#"# Document

1. Step one
   ```python
   print("hello")
   ```
2. Step two

- Item with nested code:
  ```bash
  echo "test"
  ```

- Another item:
  ```javascript
  console.log("test");
  ```
"#;
        // All blocks have languages, so no warnings
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "All list-indented code blocks have languages. Got: {result:?}",
        );

        // Fix should not modify anything
        let fixed = run_fix(content).unwrap();
        assert_eq!(
            fixed, content,
            "Fix should not modify content when all blocks have languages"
        );
    }

    #[test]
    fn test_code_blocks_in_blockquotes() {
        let content = r#"# Test

> This is a blockquote
> ```python
> print("with language")
> ```

> Another blockquote
> ```
> without language
> ```
"#;
        let result = run_check(content).unwrap();
        // Code blocks inside blockquotes ARE detected (pulldown-cmark handles nested structures)
        // The second code block has no language, so 1 warning expected
        assert_eq!(result.len(), 1);
    }

    #[test]
    fn test_fix_method_adds_text_language() {
        let content = r#"# Test

```
code without language
```

```python
already has language
```

```
another block without
```
"#;
        let fixed = run_fix(content).unwrap();
        assert!(fixed.contains("```text"));
        assert!(fixed.contains("```python"));
        assert_eq!(fixed.matches("```text").count(), 2);

        // Pin the complete output: only the two bare opening fences change.
        let expected = r#"# Test

```text
code without language
```

```python
already has language
```

```text
another block without
```
"#;
        assert_eq!(fixed, expected);

        // Fixing already-fixed content is a no-op.
        assert_eq!(run_fix(&fixed).unwrap(), fixed);
    }

    #[test]
    fn test_warning_fix_targets_opening_fence() {
        // "# Test\n" is 7 bytes and the blank line 1 byte, so the fence occupies bytes 8..11.
        let content = "# Test\n\n```\ncode\n```\n";
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);

        let fix = result[0].fix.as_ref().expect("warning should carry a fix");
        assert_eq!(fix.range, 8..11);
        assert_eq!(fix.replacement, "```text");
    }

    #[test]
    fn test_mkdocs_title_without_language() {
        // In MkDocs flavor a fence that only carries a title attribute still lacks a language.
        let content = "``` title=\"example.py\"\nprint(1)\n```\n";
        let result = run_check_with(content, MarkdownFlavor::MkDocs).unwrap();
        assert_eq!(result.len(), 1);

        let fixed = run_fix_with(content, MarkdownFlavor::MkDocs).unwrap();
        assert_eq!(fixed, "```text title=\"example.py\"\nprint(1)\n```\n");
    }

    #[test]
    fn test_fix_preserves_indentation() {
        let content = r#"# Test

- List item
  ```
  indented code block
  ```
"#;
        let fixed = run_fix(content).unwrap();
        // Should preserve indentation for list items
        assert!(fixed.contains("  ```text"));
        assert!(fixed.contains("  indented code block"));
    }

    #[test]
    fn test_fix_preserves_indentation_numbered_list() {
        // Test case from issue #122
        let content = r#"1. Step 1

    ```
    foo
    bar
    ```
"#;
        let fixed = run_fix(content).unwrap();
        // Should preserve 4-space indentation for numbered list content
        assert!(fixed.contains("    ```text"));
        assert!(fixed.contains("    foo"));
        assert!(fixed.contains("    bar"));
        // Should not remove indentation
        assert!(!fixed.contains("\n```text\n"));
    }

    #[test]
    fn test_fix_preserves_all_indentation() {
        let content = r#"# Test

Top-level code block:
```
top level
```

1. List item

    ```
    nested in list
    ```

Indented by 2 spaces:
  ```
  content
  ```
"#;
        let fixed = run_fix(content).unwrap();

        // All indentation should be preserved exactly as-is
        assert!(
            fixed.contains("```text\ntop level"),
            "Top-level code block indentation preserved"
        );
        assert!(
            fixed.contains("    ```text\n    nested in list"),
            "List item code block indentation preserved"
        );
        assert!(
            fixed.contains("  ```text\n  content"),
            "2-space indented code block indentation preserved"
        );
    }

    #[test]
    fn test_fix_with_tilde_fences() {
        let content = r#"# Test

~~~
code with tildes
~~~
"#;
        let fixed = run_fix(content).unwrap();
        assert!(fixed.contains("~~~text"));
    }

    #[test]
    fn test_longer_fence_markers() {
        let content = r#"# Test

````
code with four backticks
````

`````python
code with five backticks and language
`````

~~~~~~
code with six tildes
~~~~~~
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 2);

        let fixed = run_fix(content).unwrap();
        assert!(fixed.contains("````text"));
        assert!(fixed.contains("~~~~~~text"));
        assert!(fixed.contains("`````python"));
    }

    #[test]
    fn test_nested_code_blocks_different_markers() {
        let content = r#"# Test

````markdown
This is a markdown block

```python
# This is nested code
print("hello")
```

More markdown
````
"#;
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "Nested code blocks with different markers should not trigger warnings"
        );
    }

    #[test]
    fn test_disable_enable_comments() {
        let content = r#"# Test

<!-- rumdl-disable MD040 -->
```
this should not trigger warning
```
<!-- rumdl-enable MD040 -->

```
this should trigger warning
```
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].line, 9);
    }

    #[test]
    fn test_fence_with_language_only_on_closing() {
        // Edge case: language on closing fence should not be interpreted
        let content = r#"# Test

```
code
```python
"#;
        let result = run_check(content).unwrap();
        assert_eq!(result.len(), 1);
    }

    #[test]
    fn test_incomplete_code_blocks() {
        // Test unclosed code block
        let content = r#"# Test

```python
this code block is not closed"#;
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "Unclosed code blocks with language should not trigger warnings"
        );

        // Test unclosed code block without language
        let content_no_lang = r#"# Test

```
this code block is not closed"#;
        let result = run_check(content_no_lang).unwrap();
        assert_eq!(result.len(), 1);
    }

    #[test]
    fn test_fix_preserves_original_formatting() {
        let content = r#"# Test

```
code
```

No newline at end"#;
        let fixed = run_fix(content).unwrap();
        assert!(!fixed.ends_with('\n'), "Fix should preserve lack of trailing newline");

        let content_with_newline = "# Test\n\n```\ncode\n```\n";
        let fixed = run_fix(content_with_newline).unwrap();
        assert!(fixed.ends_with('\n'), "Fix should preserve trailing newline");
    }

    #[test]
    fn test_edge_case_backticks_in_content() {
        let content = r#"# Test

```javascript
console.log(`template string with backticks`);
// This line has ``` in a comment
```
"#;
        let result = run_check(content).unwrap();
        assert!(
            result.is_empty(),
            "Backticks inside code blocks should not affect parsing"
        );
    }

    #[test]
    fn test_empty_document() {
        let content = "";
        let result = run_check(content).unwrap();
        assert!(result.is_empty());
    }

    #[test]
    fn test_should_skip_optimization() {
        // Document without code fences should skip
        assert!(skips("# Just a header\n\nSome text"));

        // Document with backtick fences should not skip
        assert!(!skips("```\ncode\n```"));

        // Document with tilde fences should not skip
        assert!(!skips("~~~\ncode\n~~~"));

        // Empty document should skip
        assert!(skips(""));
    }

    #[test]
    fn test_quarto_code_chunk_syntax() {
        // Test Quarto {r} syntax - should NOT trigger warning
        let content = r#"# Test

```{r}
x <- 1
```

```{python}
x = 1
```

```{r, echo=FALSE}
plot(x)
```
"#;
        let result = run_check_with(content, MarkdownFlavor::Quarto).unwrap();
        assert!(
            result.is_empty(),
            "Quarto code chunks with {{language}} syntax should not trigger warnings"
        );

        // Test that missing language DOES trigger warning for Quarto
        let content_no_lang = r#"# Test

```
code without language
```
"#;
        let result = run_check_with(content_no_lang, MarkdownFlavor::Quarto).unwrap();
        assert_eq!(result.len(), 1, "Quarto files without language should trigger warning");

        // Test that standard flavor still requires standard language syntax
        let content_standard = r#"# Test

```{python}
code
```
"#;
        let result = run_check_with(content_standard, MarkdownFlavor::Standard).unwrap();
        // In standard flavor, {python} is considered "after_fence" content, so it's valid
        // The fence marker is "```" and after_fence is "{python}", which is non-empty
        assert!(
            result.is_empty(),
            "Standard flavor should accept any non-empty after_fence content"
        );
    }
}