Skip to main content

rumdl_lib/rules/
md040_fenced_code_language.rs

1use crate::linguist_data::{default_alias, get_aliases, is_valid_alias, resolve_canonical};
2use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
3use crate::rule_config_serde::{RuleConfig, load_rule_config};
4use crate::utils::range_utils::calculate_line_range;
5use pulldown_cmark::{CodeBlockKind, Event, Options, Parser, Tag};
6use std::collections::HashMap;
7
8/// Rule MD040: Fenced code blocks should have a language
9///
10/// See [docs/md040.md](../../docs/md040.md) for full documentation, configuration, and examples.
11pub mod md040_config;
12use md040_config::{LanguageStyle, MD040Config, UnknownLanguageAction};
13
/// Opening-fence details for one fenced code block, as produced by `detect_fenced_code_blocks`.
struct FencedCodeBlock {
    /// 0-indexed line number where the code block starts
    line_idx: usize,
    /// First whitespace-separated word of the fence's info string
    /// (empty if no language specified)
    language: String,
    /// The fence marker used: the leading run of backticks or tildes on the opening line.
    /// Falls back to "```" when the trimmed line starts with neither character.
    fence_marker: String,
}
22
/// Rule MD040: fenced code blocks should have a language.
///
/// Besides flagging fences without a language, the rule can (depending on `config`)
/// restrict labels to an allow/deny list, report labels that are not known languages,
/// and require one consistent label per language within a document.
#[derive(Debug, Clone, Default)]
pub struct MD040FencedCodeLanguage {
    /// Rule options; `Default` yields the default configuration.
    config: MD040Config,
}
27
28impl MD040FencedCodeLanguage {
29    pub fn new() -> Self {
30        Self::default()
31    }
32
33    pub fn with_config(config: MD040Config) -> Self {
34        Self { config }
35    }
36
37    /// Validate the configuration and return any errors
38    fn validate_config(&self) -> Vec<String> {
39        let mut errors = Vec::new();
40
41        // Validate preferred-aliases: check that each alias is valid for its language
42        for (canonical, alias) in &self.config.preferred_aliases {
43            // Find the actual canonical name (case-insensitive)
44            if let Some(actual_canonical) = resolve_canonical(canonical) {
45                if !is_valid_alias(actual_canonical, alias)
46                    && let Some(valid_aliases) = get_aliases(actual_canonical)
47                {
48                    let valid_list: Vec<_> = valid_aliases.iter().take(5).collect();
49                    let valid_str = valid_list
50                        .iter()
51                        .map(|s| format!("'{s}'"))
52                        .collect::<Vec<_>>()
53                        .join(", ");
54                    let suffix = if valid_aliases.len() > 5 { ", ..." } else { "" };
55                    errors.push(format!(
56                        "Invalid alias '{alias}' for language '{actual_canonical}'. Valid aliases include: {valid_str}{suffix}"
57                    ));
58                }
59            } else {
60                errors.push(format!(
61                    "Unknown language '{canonical}' in preferred-aliases. Use GitHub Linguist canonical names."
62                ));
63            }
64        }
65
66        errors
67    }
68
69    /// Determine the preferred label for each canonical language in the document
70    fn compute_preferred_labels(
71        &self,
72        blocks: &[FencedCodeBlock],
73        disabled_ranges: &[(usize, usize)],
74    ) -> HashMap<String, String> {
75        // Group labels by canonical language
76        let mut by_canonical: HashMap<String, Vec<&str>> = HashMap::new();
77
78        for block in blocks {
79            if is_line_disabled(disabled_ranges, block.line_idx) {
80                continue;
81            }
82            if block.language.is_empty() {
83                continue;
84            }
85            if let Some(canonical) = resolve_canonical(&block.language) {
86                by_canonical
87                    .entry(canonical.to_string())
88                    .or_default()
89                    .push(&block.language);
90            }
91        }
92
93        // Determine winning label for each canonical language
94        let mut result = HashMap::new();
95
96        for (canonical, labels) in by_canonical {
97            // Check for user override first (case-insensitive lookup)
98            let winner = if let Some(preferred) = self
99                .config
100                .preferred_aliases
101                .iter()
102                .find(|(k, _)| k.eq_ignore_ascii_case(&canonical))
103                .map(|(_, v)| v.clone())
104            {
105                preferred
106            } else {
107                // Find most prevalent label
108                let mut counts: HashMap<&str, usize> = HashMap::new();
109                for label in &labels {
110                    *counts.entry(*label).or_default() += 1;
111                }
112
113                let max_count = counts.values().max().copied().unwrap_or(0);
114                let winners: Vec<_> = counts
115                    .iter()
116                    .filter(|(_, c)| **c == max_count)
117                    .map(|(l, _)| *l)
118                    .collect();
119
120                if winners.len() == 1 {
121                    winners[0].to_string()
122                } else {
123                    // Tie-break: use curated default if available, otherwise alphabetically first
124                    default_alias(&canonical)
125                        .filter(|default| winners.contains(default))
126                        .map(|s| s.to_string())
127                        .unwrap_or_else(|| winners.into_iter().min().unwrap().to_string())
128                }
129            };
130
131            result.insert(canonical, winner);
132        }
133
134        result
135    }
136
137    /// Check if a language is allowed based on config
138    fn check_language_allowed(&self, canonical: Option<&str>, original_label: &str) -> Option<String> {
139        // Allowlist takes precedence
140        if !self.config.allowed_languages.is_empty() {
141            let allowed = self.config.allowed_languages.join(", ");
142            let Some(canonical) = canonical else {
143                return Some(format!(
144                    "Language '{original_label}' is not in the allowed list: {allowed}"
145                ));
146            };
147            if !self
148                .config
149                .allowed_languages
150                .iter()
151                .any(|a| a.eq_ignore_ascii_case(canonical))
152            {
153                return Some(format!(
154                    "Language '{original_label}' ({canonical}) is not in the allowed list: {allowed}"
155                ));
156            }
157        } else if !self.config.disallowed_languages.is_empty()
158            && canonical.is_some_and(|canonical| {
159                self.config
160                    .disallowed_languages
161                    .iter()
162                    .any(|d| d.eq_ignore_ascii_case(canonical))
163            })
164        {
165            let canonical = canonical.unwrap_or("unknown");
166            return Some(format!("Language '{original_label}' ({canonical}) is disallowed"));
167        }
168        None
169    }
170
171    /// Check for unknown language based on config
172    fn check_unknown_language(&self, label: &str) -> Option<(String, Severity)> {
173        if resolve_canonical(label).is_some() {
174            return None;
175        }
176
177        match self.config.unknown_language_action {
178            UnknownLanguageAction::Ignore => None,
179            UnknownLanguageAction::Warn => Some((
180                format!("Unknown language '{label}' (not in GitHub Linguist). Syntax highlighting may not work."),
181                Severity::Warning,
182            )),
183            UnknownLanguageAction::Error => Some((
184                format!("Unknown language '{label}' (not in GitHub Linguist)"),
185                Severity::Error,
186            )),
187        }
188    }
189}
190
191impl Rule for MD040FencedCodeLanguage {
    /// Rule identifier; also the name matched against inline disable/enable comments
    /// when this rule computes its disabled ranges.
    fn name(&self) -> &'static str {
        "MD040"
    }
195
    /// One-line, human-readable summary of what the rule enforces.
    fn description(&self) -> &'static str {
        "Code blocks should have a language specified"
    }
199
200    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
201        let content = ctx.content;
202        let mut warnings = Vec::new();
203
204        // Validate config and emit warnings for invalid configuration
205        for error in self.validate_config() {
206            warnings.push(LintWarning {
207                rule_name: Some(self.name().to_string()),
208                line: 0,
209                column: 0,
210                end_line: 0,
211                end_column: 0,
212                message: format!("[config error] {error}"),
213                severity: Severity::Error,
214                fix: None,
215            });
216        }
217
218        // Use pulldown-cmark to detect fenced code blocks with language info
219        let fenced_blocks = detect_fenced_code_blocks(content, &ctx.line_offsets);
220
221        // Pre-compute disabled ranges for efficient lookup
222        let disabled_ranges = compute_disabled_ranges(content, self.name());
223
224        // Compute preferred labels for consistent mode
225        let preferred_labels = if self.config.style == LanguageStyle::Consistent {
226            self.compute_preferred_labels(&fenced_blocks, &disabled_ranges)
227        } else {
228            HashMap::new()
229        };
230
231        for block in &fenced_blocks {
232            // Skip if this line is in a disabled range
233            if is_line_disabled(&disabled_ranges, block.line_idx) {
234                continue;
235            }
236
237            // Get the actual line content for additional checks
238            let line = content.lines().nth(block.line_idx).unwrap_or("");
239            let trimmed = line.trim();
240            let after_fence = trimmed.strip_prefix(&block.fence_marker).unwrap_or("").trim();
241
242            // Check if it has MkDocs title attribute but no language
243            let has_title_only =
244                ctx.flavor == crate::config::MarkdownFlavor::MkDocs && after_fence.starts_with("title=");
245
246            // Check for Quarto/RMarkdown code chunk syntax: {language} or {language, options}
247            let has_quarto_syntax = ctx.flavor == crate::config::MarkdownFlavor::Quarto
248                && after_fence.starts_with('{')
249                && after_fence.contains('}');
250
251            // Warn if no language and not using special syntax
252            if (block.language.is_empty() || has_title_only) && !has_quarto_syntax {
253                let (start_line, start_col, end_line, end_col) = calculate_line_range(block.line_idx + 1, line);
254
255                warnings.push(LintWarning {
256                    rule_name: Some(self.name().to_string()),
257                    line: start_line,
258                    column: start_col,
259                    end_line,
260                    end_column: end_col,
261                    message: "Code block (```) missing language".to_string(),
262                    severity: Severity::Warning,
263                    fix: Some(Fix {
264                        range: {
265                            let trimmed_start = line.len() - line.trim_start().len();
266                            let fence_len = block.fence_marker.len();
267                            let line_start_byte = ctx.line_offsets.get(block.line_idx).copied().unwrap_or(0);
268                            let fence_start_byte = line_start_byte + trimmed_start;
269                            let fence_end_byte = fence_start_byte + fence_len;
270                            fence_start_byte..fence_end_byte
271                        },
272                        replacement: format!("{}text", block.fence_marker),
273                    }),
274                });
275                continue;
276            }
277
278            // Skip further checks for special syntax
279            if has_quarto_syntax {
280                continue;
281            }
282
283            let canonical = resolve_canonical(&block.language);
284
285            // Check language restrictions (allowlist/denylist)
286            if let Some(msg) = self.check_language_allowed(canonical, &block.language) {
287                let (start_line, start_col, end_line, end_col) = calculate_line_range(block.line_idx + 1, line);
288
289                warnings.push(LintWarning {
290                    rule_name: Some(self.name().to_string()),
291                    line: start_line,
292                    column: start_col,
293                    end_line,
294                    end_column: end_col,
295                    message: msg,
296                    severity: Severity::Warning,
297                    fix: None,
298                });
299                continue;
300            }
301
302            // Check for unknown language (only if not handled by allowlist)
303            if canonical.is_none() {
304                if let Some((msg, severity)) = self.check_unknown_language(&block.language) {
305                    let (start_line, start_col, end_line, end_col) = calculate_line_range(block.line_idx + 1, line);
306
307                    warnings.push(LintWarning {
308                        rule_name: Some(self.name().to_string()),
309                        line: start_line,
310                        column: start_col,
311                        end_line,
312                        end_column: end_col,
313                        message: msg,
314                        severity,
315                        fix: None,
316                    });
317                }
318                continue;
319            }
320
321            // Check consistency
322            if self.config.style == LanguageStyle::Consistent
323                && let Some(preferred) = preferred_labels.get(canonical.unwrap())
324                && &block.language != preferred
325            {
326                let (start_line, start_col, end_line, end_col) = calculate_line_range(block.line_idx + 1, line);
327
328                let fix = find_label_span(line, &block.fence_marker).map(|(label_start, label_end)| {
329                    let line_start_byte = ctx.line_offsets.get(block.line_idx).copied().unwrap_or(0);
330                    Fix {
331                        range: (line_start_byte + label_start)..(line_start_byte + label_end),
332                        replacement: preferred.clone(),
333                    }
334                });
335                let lang = &block.language;
336                let canonical = canonical.unwrap();
337
338                warnings.push(LintWarning {
339                    rule_name: Some(self.name().to_string()),
340                    line: start_line,
341                    column: start_col,
342                    end_line,
343                    end_column: end_col,
344                    message: format!("Inconsistent language label '{lang}' for {canonical} (use '{preferred}')"),
345                    severity: Severity::Warning,
346                    fix,
347                });
348            }
349        }
350
351        Ok(warnings)
352    }
353
354    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
355        let content = ctx.content;
356
357        // Use pulldown-cmark to detect fenced code blocks
358        let fenced_blocks = detect_fenced_code_blocks(content, &ctx.line_offsets);
359
360        // Pre-compute disabled ranges
361        let disabled_ranges = compute_disabled_ranges(content, self.name());
362
363        // Compute preferred labels for consistent mode
364        let preferred_labels = if self.config.style == LanguageStyle::Consistent {
365            self.compute_preferred_labels(&fenced_blocks, &disabled_ranges)
366        } else {
367            HashMap::new()
368        };
369
370        // Build a set of line indices that need fixing
371        let mut lines_to_fix: std::collections::HashMap<usize, FixAction> = std::collections::HashMap::new();
372
373        for block in &fenced_blocks {
374            if is_line_disabled(&disabled_ranges, block.line_idx) {
375                continue;
376            }
377
378            let line = content.lines().nth(block.line_idx).unwrap_or("");
379            let trimmed = line.trim();
380            let after_fence = trimmed.strip_prefix(&block.fence_marker).unwrap_or("").trim();
381
382            let has_title_only =
383                ctx.flavor == crate::config::MarkdownFlavor::MkDocs && after_fence.starts_with("title=");
384
385            let has_quarto_syntax = ctx.flavor == crate::config::MarkdownFlavor::Quarto
386                && after_fence.starts_with('{')
387                && after_fence.contains('}');
388
389            if (block.language.is_empty() || has_title_only) && !has_quarto_syntax {
390                lines_to_fix.insert(
391                    block.line_idx,
392                    FixAction::AddLanguage {
393                        fence_marker: block.fence_marker.clone(),
394                        has_title_only,
395                    },
396                );
397            } else if !has_quarto_syntax
398                && self.config.style == LanguageStyle::Consistent
399                && let Some(canonical) = resolve_canonical(&block.language)
400                && let Some(preferred) = preferred_labels.get(canonical)
401                && &block.language != preferred
402            {
403                lines_to_fix.insert(
404                    block.line_idx,
405                    FixAction::NormalizeLabel {
406                        fence_marker: block.fence_marker.clone(),
407                        new_label: preferred.clone(),
408                    },
409                );
410            }
411        }
412
413        // Build the result by iterating through lines
414        let mut result = String::new();
415        for (i, line) in content.lines().enumerate() {
416            if let Some(action) = lines_to_fix.get(&i) {
417                match action {
418                    FixAction::AddLanguage {
419                        fence_marker,
420                        has_title_only,
421                    } => {
422                        let indent = &line[..line.len() - line.trim_start().len()];
423                        let trimmed = line.trim();
424                        let after_fence = trimmed.strip_prefix(fence_marker).unwrap_or("").trim();
425
426                        if *has_title_only {
427                            result.push_str(&format!("{indent}{fence_marker}text {after_fence}\n"));
428                        } else {
429                            result.push_str(&format!("{indent}{fence_marker}text\n"));
430                        }
431                    }
432                    FixAction::NormalizeLabel {
433                        fence_marker,
434                        new_label,
435                    } => {
436                        if let Some((label_start, label_end)) = find_label_span(line, fence_marker) {
437                            result.push_str(&line[..label_start]);
438                            result.push_str(new_label);
439                            result.push_str(&line[label_end..]);
440                            result.push('\n');
441                        } else {
442                            result.push_str(line);
443                            result.push('\n');
444                        }
445                    }
446                }
447            } else {
448                result.push_str(line);
449                result.push('\n');
450            }
451        }
452
453        // Remove trailing newline if the original content didn't have one
454        if !content.ends_with('\n') {
455            result.pop();
456        }
457
458        Ok(result)
459    }
460
    /// Get the category of this rule for selective processing.
    ///
    /// MD040 is reported under the code-block category.
    fn category(&self) -> RuleCategory {
        RuleCategory::CodeBlock
    }
465
466    /// Check if this rule should be skipped
467    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
468        ctx.content.is_empty() || (!ctx.likely_has_code() && !ctx.has_char('~'))
469    }
470
    /// Expose the rule as `&dyn Any` so callers holding a `dyn Rule` can downcast it.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
474
475    fn default_config_section(&self) -> Option<(String, toml::Value)> {
476        let default_config = MD040Config::default();
477        let json_value = serde_json::to_value(&default_config).ok()?;
478        let toml_value = crate::rule_config_serde::json_to_toml_value(&json_value)?;
479
480        if let toml::Value::Table(table) = toml_value {
481            if !table.is_empty() {
482                Some((MD040Config::RULE_NAME.to_string(), toml::Value::Table(table)))
483            } else {
484                None
485            }
486        } else {
487            None
488        }
489    }
490
491    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
492    where
493        Self: Sized,
494    {
495        let rule_config: MD040Config = load_rule_config(config);
496        Box::new(MD040FencedCodeLanguage::with_config(rule_config))
497    }
498}
499
/// A pending rewrite of one opening fence line, keyed by line index in `fix`.
#[derive(Debug, Clone)]
enum FixAction {
    /// Append the `text` language to a fence that has none. When `has_title_only` is set,
    /// the text following the fence is kept after the new label.
    AddLanguage { fence_marker: String, has_title_only: bool },
    /// Replace the existing language label with `new_label`, keeping the rest of the line.
    NormalizeLabel { fence_marker: String, new_label: String },
}
505
506/// Detect fenced code blocks using pulldown-cmark, returning info about each block's opening fence
507fn detect_fenced_code_blocks(content: &str, line_offsets: &[usize]) -> Vec<FencedCodeBlock> {
508    let mut blocks = Vec::new();
509    let options = Options::all();
510    let parser = Parser::new_ext(content, options).into_offset_iter();
511
512    for (event, range) in parser {
513        if let Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(info))) = event {
514            // Find the line index for this byte offset
515            let line_idx = line_idx_from_offset(line_offsets, range.start);
516
517            // Determine fence marker from the actual line content
518            let line_start = line_offsets.get(line_idx).copied().unwrap_or(0);
519            let line_end = line_offsets.get(line_idx + 1).copied().unwrap_or(content.len());
520            let line = content.get(line_start..line_end).unwrap_or("");
521            let trimmed = line.trim();
522            let fence_marker = if trimmed.starts_with('`') {
523                let count = trimmed.chars().take_while(|&c| c == '`').count();
524                "`".repeat(count)
525            } else if trimmed.starts_with('~') {
526                let count = trimmed.chars().take_while(|&c| c == '~').count();
527                "~".repeat(count)
528            } else {
529                "```".to_string() // Fallback
530            };
531
532            // Extract just the language (first word of info string)
533            let language = info.split_whitespace().next().unwrap_or("").to_string();
534
535            blocks.push(FencedCodeBlock {
536                line_idx,
537                language,
538                fence_marker,
539            });
540        }
541    }
542
543    blocks
544}
545
/// Map a byte offset to the index of the line containing it.
///
/// `line_offsets` must be sorted ascending (it is binary-searched). An offset that is
/// exactly a line start maps to that line; any other offset maps to the closest line start
/// before it, with index 0 as the floor.
#[inline]
fn line_idx_from_offset(line_offsets: &[usize], offset: usize) -> usize {
    line_offsets
        .binary_search(&offset)
        .unwrap_or_else(|insert_at| insert_at.saturating_sub(1))
}
553
554/// Compute disabled line ranges from disable/enable comments
555fn compute_disabled_ranges(content: &str, rule_name: &str) -> Vec<(usize, usize)> {
556    let mut ranges = Vec::new();
557    let mut disabled_start: Option<usize> = None;
558
559    for (i, line) in content.lines().enumerate() {
560        let trimmed = line.trim();
561
562        if let Some(rules) = crate::rule::parse_disable_comment(trimmed)
563            && (rules.is_empty() || rules.contains(&rule_name))
564            && disabled_start.is_none()
565        {
566            disabled_start = Some(i);
567        }
568
569        if let Some(rules) = crate::rule::parse_enable_comment(trimmed)
570            && (rules.is_empty() || rules.contains(&rule_name))
571            && let Some(start) = disabled_start.take()
572        {
573            ranges.push((start, i));
574        }
575    }
576
577    // Handle unclosed disable
578    if let Some(start) = disabled_start {
579        ranges.push((start, usize::MAX));
580    }
581
582    ranges
583}
584
/// Report whether `line_idx` falls inside any disabled range.
///
/// Ranges are half-open: the start line is included, the end line is not.
fn is_line_disabled(ranges: &[(usize, usize)], line_idx: usize) -> bool {
    ranges
        .iter()
        .any(|(start, end)| (*start..*end).contains(&line_idx))
}
589
/// Find the byte span of the language label in a fence line.
///
/// The line must consist of optional leading whitespace followed directly by
/// `fence_marker`. The label is the first run of non-whitespace characters after the
/// marker. Returns `(start, end)` as byte offsets into `line` (end exclusive), or `None`
/// when the marker is not where expected or nothing but whitespace follows it.
fn find_label_span(line: &str, fence_marker: &str) -> Option<(usize, usize)> {
    let unindented = line.trim_start();
    let indent_len = line.len() - unindented.len();
    let rest = unindented.strip_prefix(fence_marker)?;

    // Offsets below are relative to `rest`; shift them back into `line` at the end.
    let base = indent_len + fence_marker.len();

    let begin = rest.find(|ch: char| !ch.is_whitespace())?;
    let end = match rest[begin..].find(char::is_whitespace) {
        Some(len) => begin + len,
        None => rest.len(),
    };

    Some((base + begin, base + end))
}
615
616#[cfg(test)]
617mod tests {
618    use super::*;
619    use crate::lint_context::LintContext;
620
621    fn run_check(content: &str) -> LintResult {
622        let rule = MD040FencedCodeLanguage::default();
623        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
624        rule.check(&ctx)
625    }
626
627    fn run_check_with_config(content: &str, config: MD040Config) -> LintResult {
628        let rule = MD040FencedCodeLanguage::with_config(config);
629        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
630        rule.check(&ctx)
631    }
632
633    fn run_fix(content: &str) -> Result<String, LintError> {
634        let rule = MD040FencedCodeLanguage::default();
635        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
636        rule.fix(&ctx)
637    }
638
639    fn run_fix_with_config(content: &str, config: MD040Config) -> Result<String, LintError> {
640        let rule = MD040FencedCodeLanguage::with_config(config);
641        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
642        rule.fix(&ctx)
643    }
644
645    // =========================================================================
646    // Basic functionality tests
647    // =========================================================================
648
649    #[test]
650    fn test_code_blocks_with_language_specified() {
651        let content = r#"# Test
652
653```python
654print("Hello, world!")
655```
656
657```javascript
658console.log("Hello!");
659```
660"#;
661        let result = run_check(content).unwrap();
662        assert!(result.is_empty(), "No warnings expected for code blocks with language");
663    }
664
665    #[test]
666    fn test_code_blocks_without_language() {
667        let content = r#"# Test
668
669```
670print("Hello, world!")
671```
672"#;
673        let result = run_check(content).unwrap();
674        assert_eq!(result.len(), 1);
675        assert_eq!(result[0].message, "Code block (```) missing language");
676        assert_eq!(result[0].line, 3);
677    }
678
679    #[test]
680    fn test_fix_method_adds_text_language() {
681        let content = r#"# Test
682
683```
684code without language
685```
686
687```python
688already has language
689```
690
691```
692another block without
693```
694"#;
695        let fixed = run_fix(content).unwrap();
696        assert!(fixed.contains("```text"));
697        assert!(fixed.contains("```python"));
698        assert_eq!(fixed.matches("```text").count(), 2);
699    }
700
701    #[test]
702    fn test_fix_preserves_indentation() {
703        let content = r#"# Test
704
705- List item
706  ```
707  indented code block
708  ```
709"#;
710        let fixed = run_fix(content).unwrap();
711        assert!(fixed.contains("  ```text"));
712    }
713
714    // =========================================================================
715    // Consistent mode tests
716    // =========================================================================
717
718    #[test]
719    fn test_consistent_mode_detects_inconsistency() {
720        let content = r#"```bash
721echo hi
722```
723
724```sh
725echo there
726```
727
728```bash
729echo again
730```
731"#;
732        let config = MD040Config {
733            style: LanguageStyle::Consistent,
734            ..Default::default()
735        };
736        let result = run_check_with_config(content, config).unwrap();
737        assert_eq!(result.len(), 1);
738        assert!(result[0].message.contains("Inconsistent"));
739        assert!(result[0].message.contains("sh"));
740        assert!(result[0].message.contains("bash"));
741    }
742
743    #[test]
744    fn test_consistent_mode_fix_normalizes() {
745        let content = r#"```bash
746echo hi
747```
748
749```sh
750echo there
751```
752
753```bash
754echo again
755```
756"#;
757        let config = MD040Config {
758            style: LanguageStyle::Consistent,
759            ..Default::default()
760        };
761        let fixed = run_fix_with_config(content, config).unwrap();
762        assert_eq!(fixed.matches("```bash").count(), 3);
763        assert_eq!(fixed.matches("```sh").count(), 0);
764    }
765
766    #[test]
767    fn test_consistent_mode_tie_break_uses_curated_default() {
768        // When there's a tie (1 bash, 1 sh), should use curated default (bash)
769        let content = r#"```bash
770echo hi
771```
772
773```sh
774echo there
775```
776"#;
777        let config = MD040Config {
778            style: LanguageStyle::Consistent,
779            ..Default::default()
780        };
781        let fixed = run_fix_with_config(content, config).unwrap();
782        // bash is the curated default for Shell
783        assert_eq!(fixed.matches("```bash").count(), 2);
784    }
785
786    #[test]
787    fn test_consistent_mode_with_preferred_alias() {
788        let content = r#"```bash
789echo hi
790```
791
792```sh
793echo there
794```
795"#;
796        let mut preferred = HashMap::new();
797        preferred.insert("Shell".to_string(), "sh".to_string());
798
799        let config = MD040Config {
800            style: LanguageStyle::Consistent,
801            preferred_aliases: preferred,
802            ..Default::default()
803        };
804        let fixed = run_fix_with_config(content, config).unwrap();
805        assert_eq!(fixed.matches("```sh").count(), 2);
806        assert_eq!(fixed.matches("```bash").count(), 0);
807    }
808
809    #[test]
810    fn test_consistent_mode_ignores_disabled_blocks() {
811        let content = r#"```bash
812echo hi
813```
814<!-- rumdl-disable MD040 -->
815```sh
816echo there
817```
818```sh
819echo again
820```
821<!-- rumdl-enable MD040 -->
822"#;
823        let config = MD040Config {
824            style: LanguageStyle::Consistent,
825            ..Default::default()
826        };
827        let result = run_check_with_config(content, config).unwrap();
828        assert!(result.is_empty(), "Disabled blocks should not affect consistency");
829    }
830
831    #[test]
832    fn test_fix_preserves_attributes() {
833        let content = "```sh {.highlight}\ncode\n```\n\n```bash\nmore\n```";
834        let config = MD040Config {
835            style: LanguageStyle::Consistent,
836            ..Default::default()
837        };
838        let fixed = run_fix_with_config(content, config).unwrap();
839        assert!(fixed.contains("```bash {.highlight}"));
840    }
841
842    #[test]
843    fn test_fix_preserves_spacing_before_label() {
844        let content = "```bash\ncode\n```\n\n```  sh {.highlight}\ncode\n```";
845        let config = MD040Config {
846            style: LanguageStyle::Consistent,
847            ..Default::default()
848        };
849        let fixed = run_fix_with_config(content, config).unwrap();
850        assert!(fixed.contains("```  bash {.highlight}"));
851        assert!(!fixed.contains("```  sh {.highlight}"));
852    }
853
854    // =========================================================================
855    // Allowlist/denylist tests
856    // =========================================================================
857
858    #[test]
859    fn test_allowlist_blocks_unlisted() {
860        let content = "```java\ncode\n```";
861        let config = MD040Config {
862            allowed_languages: vec!["Python".to_string(), "Shell".to_string()],
863            ..Default::default()
864        };
865        let result = run_check_with_config(content, config).unwrap();
866        assert_eq!(result.len(), 1);
867        assert!(result[0].message.contains("not in the allowed list"));
868    }
869
870    #[test]
871    fn test_allowlist_allows_listed() {
872        let content = "```python\ncode\n```";
873        let config = MD040Config {
874            allowed_languages: vec!["Python".to_string()],
875            ..Default::default()
876        };
877        let result = run_check_with_config(content, config).unwrap();
878        assert!(result.is_empty());
879    }
880
881    #[test]
882    fn test_allowlist_blocks_unknown_language() {
883        let content = "```mysterylang\ncode\n```";
884        let config = MD040Config {
885            allowed_languages: vec!["Python".to_string()],
886            ..Default::default()
887        };
888        let result = run_check_with_config(content, config).unwrap();
889        assert_eq!(result.len(), 1);
890        assert!(result[0].message.contains("allowed list"));
891    }
892
893    #[test]
894    fn test_allowlist_case_insensitive() {
895        let content = "```python\ncode\n```";
896        let config = MD040Config {
897            allowed_languages: vec!["PYTHON".to_string()],
898            ..Default::default()
899        };
900        let result = run_check_with_config(content, config).unwrap();
901        assert!(result.is_empty());
902    }
903
904    #[test]
905    fn test_denylist_blocks_listed() {
906        let content = "```java\ncode\n```";
907        let config = MD040Config {
908            disallowed_languages: vec!["Java".to_string()],
909            ..Default::default()
910        };
911        let result = run_check_with_config(content, config).unwrap();
912        assert_eq!(result.len(), 1);
913        assert!(result[0].message.contains("disallowed"));
914    }
915
916    #[test]
917    fn test_denylist_allows_unlisted() {
918        let content = "```python\ncode\n```";
919        let config = MD040Config {
920            disallowed_languages: vec!["Java".to_string()],
921            ..Default::default()
922        };
923        let result = run_check_with_config(content, config).unwrap();
924        assert!(result.is_empty());
925    }
926
927    #[test]
928    fn test_allowlist_takes_precedence_over_denylist() {
929        let content = "```python\ncode\n```";
930        let config = MD040Config {
931            allowed_languages: vec!["Python".to_string()],
932            disallowed_languages: vec!["Python".to_string()], // Should be ignored
933            ..Default::default()
934        };
935        let result = run_check_with_config(content, config).unwrap();
936        assert!(result.is_empty());
937    }
938
939    // =========================================================================
940    // Unknown language tests
941    // =========================================================================
942
943    #[test]
944    fn test_unknown_language_ignore_default() {
945        let content = "```mycustomlang\ncode\n```";
946        let result = run_check(content).unwrap();
947        assert!(result.is_empty(), "Unknown languages ignored by default");
948    }
949
950    #[test]
951    fn test_unknown_language_warn() {
952        let content = "```mycustomlang\ncode\n```";
953        let config = MD040Config {
954            unknown_language_action: UnknownLanguageAction::Warn,
955            ..Default::default()
956        };
957        let result = run_check_with_config(content, config).unwrap();
958        assert_eq!(result.len(), 1);
959        assert!(result[0].message.contains("Unknown language"));
960        assert!(result[0].message.contains("mycustomlang"));
961        assert_eq!(result[0].severity, Severity::Warning);
962    }
963
964    #[test]
965    fn test_unknown_language_error() {
966        let content = "```mycustomlang\ncode\n```";
967        let config = MD040Config {
968            unknown_language_action: UnknownLanguageAction::Error,
969            ..Default::default()
970        };
971        let result = run_check_with_config(content, config).unwrap();
972        assert_eq!(result.len(), 1);
973        assert!(result[0].message.contains("Unknown language"));
974        assert_eq!(result[0].severity, Severity::Error);
975    }
976
977    // =========================================================================
978    // Config validation tests
979    // =========================================================================
980
981    #[test]
982    fn test_invalid_preferred_alias_detected() {
983        let mut preferred = HashMap::new();
984        preferred.insert("Shell".to_string(), "invalid_alias".to_string());
985
986        let config = MD040Config {
987            style: LanguageStyle::Consistent,
988            preferred_aliases: preferred,
989            ..Default::default()
990        };
991        let rule = MD040FencedCodeLanguage::with_config(config);
992        let errors = rule.validate_config();
993        assert_eq!(errors.len(), 1);
994        assert!(errors[0].contains("Invalid alias"));
995        assert!(errors[0].contains("invalid_alias"));
996    }
997
998    #[test]
999    fn test_unknown_language_in_preferred_aliases_detected() {
1000        let mut preferred = HashMap::new();
1001        preferred.insert("NotARealLanguage".to_string(), "nope".to_string());
1002
1003        let config = MD040Config {
1004            style: LanguageStyle::Consistent,
1005            preferred_aliases: preferred,
1006            ..Default::default()
1007        };
1008        let rule = MD040FencedCodeLanguage::with_config(config);
1009        let errors = rule.validate_config();
1010        assert_eq!(errors.len(), 1);
1011        assert!(errors[0].contains("Unknown language"));
1012    }
1013
1014    #[test]
1015    fn test_valid_preferred_alias_accepted() {
1016        let mut preferred = HashMap::new();
1017        preferred.insert("Shell".to_string(), "bash".to_string());
1018        preferred.insert("JavaScript".to_string(), "js".to_string());
1019
1020        let config = MD040Config {
1021            style: LanguageStyle::Consistent,
1022            preferred_aliases: preferred,
1023            ..Default::default()
1024        };
1025        let rule = MD040FencedCodeLanguage::with_config(config);
1026        let errors = rule.validate_config();
1027        assert!(errors.is_empty());
1028    }
1029
1030    // =========================================================================
1031    // Linguist resolution tests
1032    // =========================================================================
1033
1034    #[test]
1035    fn test_linguist_resolution() {
1036        assert_eq!(resolve_canonical("bash"), Some("Shell"));
1037        assert_eq!(resolve_canonical("sh"), Some("Shell"));
1038        assert_eq!(resolve_canonical("zsh"), Some("Shell"));
1039        assert_eq!(resolve_canonical("js"), Some("JavaScript"));
1040        assert_eq!(resolve_canonical("python"), Some("Python"));
1041        assert_eq!(resolve_canonical("unknown_lang"), None);
1042    }
1043
1044    #[test]
1045    fn test_linguist_resolution_case_insensitive() {
1046        assert_eq!(resolve_canonical("BASH"), Some("Shell"));
1047        assert_eq!(resolve_canonical("Bash"), Some("Shell"));
1048        assert_eq!(resolve_canonical("Python"), Some("Python"));
1049        assert_eq!(resolve_canonical("PYTHON"), Some("Python"));
1050    }
1051
1052    #[test]
1053    fn test_alias_validation() {
1054        assert!(is_valid_alias("Shell", "bash"));
1055        assert!(is_valid_alias("Shell", "sh"));
1056        assert!(is_valid_alias("Shell", "zsh"));
1057        assert!(!is_valid_alias("Shell", "python"));
1058        assert!(!is_valid_alias("Shell", "invalid"));
1059    }
1060
1061    #[test]
1062    fn test_default_alias() {
1063        assert_eq!(default_alias("Shell"), Some("bash"));
1064        assert_eq!(default_alias("JavaScript"), Some("js"));
1065        assert_eq!(default_alias("Python"), Some("python"));
1066    }
1067
1068    // =========================================================================
1069    // Edge case tests
1070    // =========================================================================
1071
1072    #[test]
1073    fn test_mixed_case_labels_normalized() {
1074        let content = r#"```BASH
1075echo hi
1076```
1077
1078```Bash
1079echo there
1080```
1081
1082```bash
1083echo again
1084```
1085"#;
1086        let config = MD040Config {
1087            style: LanguageStyle::Consistent,
1088            ..Default::default()
1089        };
1090        // All should resolve to Shell, most prevalent should win
1091        let result = run_check_with_config(content, config).unwrap();
1092        // "bash" appears 1x, "Bash" appears 1x, "BASH" appears 1x
1093        // All are different strings, so there's a 3-way tie
1094        // Should pick curated default "bash" or alphabetically first
1095        assert!(result.len() >= 2, "Should flag at least 2 inconsistent labels");
1096    }
1097
1098    #[test]
1099    fn test_multiple_languages_independent() {
1100        let content = r#"```bash
1101shell code
1102```
1103
1104```python
1105python code
1106```
1107
1108```sh
1109more shell
1110```
1111
1112```python3
1113more python
1114```
1115"#;
1116        let config = MD040Config {
1117            style: LanguageStyle::Consistent,
1118            ..Default::default()
1119        };
1120        let result = run_check_with_config(content, config).unwrap();
1121        // Should have 2 warnings: one for sh (inconsistent with bash) and one for python3 (inconsistent with python)
1122        assert_eq!(result.len(), 2);
1123    }
1124
1125    #[test]
1126    fn test_tilde_fences() {
1127        let content = r#"~~~bash
1128echo hi
1129~~~
1130
1131~~~sh
1132echo there
1133~~~
1134"#;
1135        let config = MD040Config {
1136            style: LanguageStyle::Consistent,
1137            ..Default::default()
1138        };
1139        let result = run_check_with_config(content, config.clone()).unwrap();
1140        assert_eq!(result.len(), 1);
1141
1142        let fixed = run_fix_with_config(content, config).unwrap();
1143        assert!(fixed.contains("~~~bash"));
1144        assert!(!fixed.contains("~~~sh"));
1145    }
1146
1147    #[test]
1148    fn test_longer_fence_markers_preserved() {
1149        let content = "````sh\ncode\n````\n\n```bash\ncode\n```";
1150        let config = MD040Config {
1151            style: LanguageStyle::Consistent,
1152            ..Default::default()
1153        };
1154        let fixed = run_fix_with_config(content, config).unwrap();
1155        assert!(fixed.contains("````bash"));
1156        assert!(fixed.contains("```bash"));
1157    }
1158
1159    #[test]
1160    fn test_empty_document() {
1161        let result = run_check("").unwrap();
1162        assert!(result.is_empty());
1163    }
1164
1165    #[test]
1166    fn test_no_code_blocks() {
1167        let content = "# Just a heading\n\nSome text.";
1168        let result = run_check(content).unwrap();
1169        assert!(result.is_empty());
1170    }
1171
1172    #[test]
1173    fn test_single_code_block_no_inconsistency() {
1174        let content = "```bash\necho hi\n```";
1175        let config = MD040Config {
1176            style: LanguageStyle::Consistent,
1177            ..Default::default()
1178        };
1179        let result = run_check_with_config(content, config).unwrap();
1180        assert!(result.is_empty(), "Single block has no inconsistency");
1181    }
1182
1183    #[test]
1184    fn test_idempotent_fix() {
1185        let content = r#"```bash
1186echo hi
1187```
1188
1189```sh
1190echo there
1191```
1192"#;
1193        let config = MD040Config {
1194            style: LanguageStyle::Consistent,
1195            ..Default::default()
1196        };
1197        let fixed1 = run_fix_with_config(content, config.clone()).unwrap();
1198        let fixed2 = run_fix_with_config(&fixed1, config).unwrap();
1199        assert_eq!(fixed1, fixed2, "Fix should be idempotent");
1200    }
1201}