rumdl_lib/rules/
md044_proper_names.rs

1use crate::utils::fast_hash;
2use crate::utils::regex_cache::{escape_regex, get_cached_fancy_regex};
3
4use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, Severity};
5use fancy_regex::Regex;
6use std::collections::{HashMap, HashSet};
7use std::sync::LazyLock;
8use std::sync::{Arc, Mutex};
9
10mod md044_config;
11use md044_config::MD044Config;
12
/// Matches one HTML comment, `<!--` through the first following `-->`.
/// `[\s\S]*?` is used so the (lazily matched) body may span several lines.
static HTML_COMMENT_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<!--([\s\S]*?)-->").unwrap());
// Reference definition pattern - matches [ref]: url "title"
// Multi-line mode: each definition must occupy a whole line, indented by at most
// three spaces. Group 1 is the label, group 2 the destination (no whitespace),
// and groups 3/4 the optional title in double or single quotes.
static REF_DEF_REGEX: LazyLock<regex::Regex> = LazyLock::new(|| {
    regex::Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap()
});

/// One detected violation. As produced in this file, `line` is 1-based and
/// `column` is the 1-based byte offset of the match within its line.
type WarningPosition = (usize, usize, String); // (line, column, found_name)
20
21/// Rule MD044: Proper names should be capitalized
22///
23/// See [docs/md044.md](../../docs/md044.md) for full documentation, configuration, and examples.
24///
25/// This rule is triggered when proper names are not capitalized correctly in the document.
26/// For example, if you have defined "JavaScript" as a proper name, the rule will flag any
27/// occurrences of "javascript" or "Javascript" as violations.
28///
29/// ## Purpose
30///
31/// Ensuring consistent capitalization of proper names improves document quality and
32/// professionalism. This is especially important for technical documentation where
33/// product names, programming languages, and technologies often have specific
34/// capitalization conventions.
35///
36/// ## Configuration Options
37///
38/// The rule supports the following configuration options:
39///
40/// ```yaml
41/// MD044:
42///   names: []                # List of proper names to check for correct capitalization
43///   code-blocks: false       # Whether to check code blocks (default: false)
44/// ```
45///
46/// Example configuration:
47///
48/// ```yaml
49/// MD044:
50///   names: ["JavaScript", "Node.js", "TypeScript"]
51///   code-blocks: true
52/// ```
53///
54/// ## Performance Optimizations
55///
56/// This rule implements several performance optimizations:
57///
58/// 1. **Regex Caching**: Pre-compiles and caches regex patterns for each proper name
59/// 2. **Content Caching**: Caches results based on content hashing for repeated checks
60/// 3. **Efficient Text Processing**: Uses optimized algorithms to avoid redundant text processing
61/// 4. **Smart Code Block Detection**: Efficiently identifies and optionally excludes code blocks
62///
63/// ## Edge Cases Handled
64///
65/// - **Word Boundaries**: Only matches complete words, not substrings within other words
66/// - **Case Sensitivity**: Properly handles case-specific matching
67/// - **Code Blocks**: Optionally checks code blocks (controlled by code-blocks setting)
68/// - **Markdown Formatting**: Handles proper names within Markdown formatting elements
69///
70/// ## Fix Behavior
71///
72/// When fixing issues, this rule replaces incorrect capitalization with the correct form
73/// as defined in the configuration.
74///
#[derive(Clone)]
pub struct MD044ProperNames {
    /// Rule settings: the proper names plus the code-block / HTML toggles.
    config: MD044Config,
    /// Source text of the combined case-insensitive alternation over every
    /// name variant; `None` when no names are configured.
    combined_pattern: Option<String>,
    /// Precomputed lowercase name variants (including dot-less and
    /// accent-folded spellings) for fast substring pre-checks.
    name_variants: Vec<String>,
    /// Violations memoized by content hash. The map sits behind an `Arc`, so
    /// clones of the rule share the same cache.
    content_cache: Arc<Mutex<HashMap<u64, Vec<WarningPosition>>>>,
}
85
86impl MD044ProperNames {
87    pub fn new(names: Vec<String>, code_blocks: bool) -> Self {
88        let config = MD044Config {
89            names,
90            code_blocks,
91            html_elements: true, // Default to checking HTML elements
92            html_comments: true, // Default to checking HTML comments
93        };
94        let combined_pattern = Self::create_combined_pattern(&config);
95        let name_variants = Self::build_name_variants(&config);
96        Self {
97            config,
98            combined_pattern,
99            name_variants,
100            content_cache: Arc::new(Mutex::new(HashMap::new())),
101        }
102    }
103
104    // Helper function for consistent ASCII normalization
105    fn ascii_normalize(s: &str) -> String {
106        s.replace(['é', 'è', 'ê', 'ë'], "e")
107            .replace(['à', 'á', 'â', 'ä', 'ã', 'å'], "a")
108            .replace(['ï', 'î', 'í', 'ì'], "i")
109            .replace(['ü', 'ú', 'ù', 'û'], "u")
110            .replace(['ö', 'ó', 'ò', 'ô', 'õ'], "o")
111            .replace('ñ', "n")
112            .replace('ç', "c")
113    }
114
115    pub fn from_config_struct(config: MD044Config) -> Self {
116        let combined_pattern = Self::create_combined_pattern(&config);
117        let name_variants = Self::build_name_variants(&config);
118        Self {
119            config,
120            combined_pattern,
121            name_variants,
122            content_cache: Arc::new(Mutex::new(HashMap::new())),
123        }
124    }
125
126    // Create a combined regex pattern for all proper names
127    fn create_combined_pattern(config: &MD044Config) -> Option<String> {
128        if config.names.is_empty() {
129            return None;
130        }
131
132        // Create patterns for all names and their variations
133        let mut patterns: Vec<String> = config
134            .names
135            .iter()
136            .flat_map(|name| {
137                let mut variations = vec![];
138                let lower_name = name.to_lowercase();
139
140                // Add the lowercase version
141                variations.push(escape_regex(&lower_name));
142
143                // Add version without dots
144                let lower_name_no_dots = lower_name.replace('.', "");
145                if lower_name != lower_name_no_dots {
146                    variations.push(escape_regex(&lower_name_no_dots));
147                }
148
149                // Add ASCII-normalized versions for common accented characters
150                let ascii_normalized = Self::ascii_normalize(&lower_name);
151
152                if ascii_normalized != lower_name {
153                    variations.push(escape_regex(&ascii_normalized));
154
155                    // Also add version without dots
156                    let ascii_no_dots = ascii_normalized.replace('.', "");
157                    if ascii_normalized != ascii_no_dots {
158                        variations.push(escape_regex(&ascii_no_dots));
159                    }
160                }
161
162                variations
163            })
164            .collect();
165
166        // Sort patterns by length (longest first) to avoid shorter patterns matching within longer ones
167        patterns.sort_by_key(|b| std::cmp::Reverse(b.len()));
168
169        // Combine all patterns into a single regex with capture groups
170        // Don't use \b as it doesn't work with Unicode - we'll check boundaries manually
171        Some(format!(r"(?i)({})", patterns.join("|")))
172    }
173
174    fn build_name_variants(config: &MD044Config) -> Vec<String> {
175        let mut variants = HashSet::new();
176        for name in &config.names {
177            let lower_name = name.to_lowercase();
178            variants.insert(lower_name.clone());
179
180            let lower_no_dots = lower_name.replace('.', "");
181            if lower_name != lower_no_dots {
182                variants.insert(lower_no_dots);
183            }
184
185            let ascii_normalized = Self::ascii_normalize(&lower_name);
186            if ascii_normalized != lower_name {
187                variants.insert(ascii_normalized.clone());
188
189                let ascii_no_dots = ascii_normalized.replace('.', "");
190                if ascii_normalized != ascii_no_dots {
191                    variants.insert(ascii_no_dots);
192                }
193            }
194        }
195
196        variants.into_iter().collect()
197    }
198
199    // Find all name violations in the content and return positions
200    fn find_name_violations(&self, content: &str, ctx: &crate::lint_context::LintContext) -> Vec<WarningPosition> {
201        // Early return: if no names configured or content is empty
202        if self.config.names.is_empty() || content.is_empty() || self.combined_pattern.is_none() {
203            return Vec::new();
204        }
205
206        // Early return: quick check if any of the configured names might be in content
207        let content_lower = if content.is_ascii() {
208            content.to_ascii_lowercase()
209        } else {
210            content.to_lowercase()
211        };
212        let has_potential_matches = self.name_variants.iter().any(|name| content_lower.contains(name));
213
214        if !has_potential_matches {
215            return Vec::new();
216        }
217
218        // Check if we have cached results
219        let hash = fast_hash(content);
220        {
221            // Use a separate scope for borrowing to minimize lock time
222            if let Ok(cache) = self.content_cache.lock()
223                && let Some(cached) = cache.get(&hash)
224            {
225                return cached.clone();
226            }
227        }
228
229        let mut violations = Vec::new();
230
231        // Get the regex from global cache
232        let combined_regex = match &self.combined_pattern {
233            Some(pattern) => match get_cached_fancy_regex(pattern) {
234                Ok(regex) => regex,
235                Err(_) => return Vec::new(),
236            },
237            None => return Vec::new(),
238        };
239
240        // Use ctx.lines for better performance
241        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
242            let line_num = line_idx + 1;
243            let line = line_info.content(ctx.content);
244
245            // Skip code fence lines (```language or ~~~language)
246            let trimmed = line.trim_start();
247            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
248                continue;
249            }
250
251            // Skip if in code block (when code_blocks = false)
252            if !self.config.code_blocks && line_info.in_code_block {
253                continue;
254            }
255
256            // Skip if in HTML block (when html_elements = false)
257            if !self.config.html_elements && line_info.in_html_block {
258                continue;
259            }
260
261            // Check if we should skip HTML comments
262            let in_html_comment = if !self.config.html_comments {
263                // Check if this position is within an HTML comment
264                self.is_in_html_comment(content, line_info.byte_offset)
265            } else {
266                false
267            };
268
269            if in_html_comment {
270                continue;
271            }
272
273            // Skip JSX expressions and MDX comments (MDX flavor)
274            if line_info.in_jsx_expression || line_info.in_mdx_comment {
275                continue;
276            }
277
278            // Early return: skip lines that don't contain any potential matches
279            let line_lower = line.to_lowercase();
280            let has_line_matches = self.name_variants.iter().any(|name| line_lower.contains(name));
281
282            if !has_line_matches {
283                continue;
284            }
285
286            // Use the combined regex to find all matches in one pass
287            for cap_result in combined_regex.find_iter(line) {
288                match cap_result {
289                    Ok(cap) => {
290                        let found_name = &line[cap.start()..cap.end()];
291
292                        // Check word boundaries manually for Unicode support
293                        let start_pos = cap.start();
294                        let end_pos = cap.end();
295
296                        if !self.is_at_word_boundary(line, start_pos, true)
297                            || !self.is_at_word_boundary(line, end_pos, false)
298                        {
299                            continue; // Not at word boundary
300                        }
301
302                        // Skip if in inline code when code_blocks is false
303                        if !self.config.code_blocks {
304                            let byte_pos = line_info.byte_offset + cap.start();
305                            if ctx.is_in_code_block_or_span(byte_pos) {
306                                continue;
307                            }
308                        }
309
310                        // Skip if in link (inline links, reference links, or reference definitions)
311                        let byte_pos = line_info.byte_offset + cap.start();
312                        if self.is_in_link(ctx, byte_pos) {
313                            continue;
314                        }
315
316                        // Find which proper name this matches
317                        if let Some(proper_name) = self.get_proper_name_for(found_name) {
318                            // Only flag if it's not already correct
319                            if found_name != proper_name {
320                                violations.push((line_num, cap.start() + 1, found_name.to_string()));
321                            }
322                        }
323                    }
324                    Err(e) => {
325                        eprintln!("Regex execution error on line {line_num}: {e}");
326                    }
327                }
328            }
329        }
330
331        // Store in cache (ignore if mutex is poisoned)
332        if let Ok(mut cache) = self.content_cache.lock() {
333            cache.insert(hash, violations.clone());
334        }
335        violations
336    }
337
338    // Check if a byte position is within an HTML comment
339    fn is_in_html_comment(&self, content: &str, byte_pos: usize) -> bool {
340        for m in HTML_COMMENT_REGEX.find_iter(content).flatten() {
341            if m.start() <= byte_pos && byte_pos < m.end() {
342                return true;
343            }
344        }
345        false
346    }
347
348    /// Check if a byte position is within a link (inline links, reference links, or reference definitions)
349    fn is_in_link(&self, ctx: &crate::lint_context::LintContext, byte_pos: usize) -> bool {
350        // Check inline and reference links
351        for link in &ctx.links {
352            if link.byte_offset <= byte_pos && byte_pos < link.byte_end {
353                return true;
354            }
355        }
356
357        // Check images (which use similar syntax)
358        for image in &ctx.images {
359            if image.byte_offset <= byte_pos && byte_pos < image.byte_end {
360                return true;
361            }
362        }
363
364        // Check reference definitions [ref]: url "title" using regex pattern
365        for m in REF_DEF_REGEX.find_iter(ctx.content) {
366            if m.start() <= byte_pos && byte_pos < m.end() {
367                return true;
368            }
369        }
370
371        false
372    }
373
374    // Check if a character is a word boundary (handles Unicode)
375    fn is_word_boundary_char(c: char) -> bool {
376        !c.is_alphanumeric()
377    }
378
379    // Check if position is at a word boundary
380    fn is_at_word_boundary(&self, content: &str, pos: usize, is_start: bool) -> bool {
381        let chars: Vec<char> = content.chars().collect();
382        let char_indices: Vec<(usize, char)> = content.char_indices().collect();
383
384        // Find the character position
385        let char_pos = char_indices.iter().position(|(idx, _)| *idx == pos);
386        if char_pos.is_none() {
387            return true; // If we can't find position, assume boundary
388        }
389        let char_pos = char_pos.unwrap();
390
391        if is_start {
392            // Check character before position
393            if char_pos == 0 {
394                return true; // Start of string
395            }
396            Self::is_word_boundary_char(chars[char_pos - 1])
397        } else {
398            // Check character after position
399            if char_pos >= chars.len() {
400                return true; // End of string
401            }
402            Self::is_word_boundary_char(chars[char_pos])
403        }
404    }
405
406    // Get the proper name that should be used for a found name
407    fn get_proper_name_for(&self, found_name: &str) -> Option<String> {
408        let found_lower = found_name.to_lowercase();
409
410        // Iterate through the configured proper names
411        for name in &self.config.names {
412            let lower_name = name.to_lowercase();
413            let lower_name_no_dots = lower_name.replace('.', "");
414
415            // Direct match
416            if found_lower == lower_name || found_lower == lower_name_no_dots {
417                return Some(name.clone());
418            }
419
420            // Check ASCII-normalized version
421            let ascii_normalized = Self::ascii_normalize(&lower_name);
422
423            let ascii_no_dots = ascii_normalized.replace('.', "");
424
425            if found_lower == ascii_normalized || found_lower == ascii_no_dots {
426                return Some(name.clone());
427            }
428        }
429        None
430    }
431}
432
433impl Rule for MD044ProperNames {
    /// Identifier of this rule; also used as the rule name on emitted
    /// warnings and as the key of its default config section.
    fn name(&self) -> &'static str {
        "MD044"
    }
437
    /// One-line human-readable summary of what the rule enforces.
    fn description(&self) -> &'static str {
        "Proper names should have the correct capitalization"
    }
441
442    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
443        if self.config.names.is_empty() {
444            return true;
445        }
446        // Quick check if any configured names exist (case-insensitive)
447        let content_lower = ctx.content.to_lowercase();
448        !self
449            .config
450            .names
451            .iter()
452            .any(|name| content_lower.contains(&name.to_lowercase()))
453    }
454
455    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
456        let content = ctx.content;
457        if content.is_empty() || self.config.names.is_empty() || self.combined_pattern.is_none() {
458            return Ok(Vec::new());
459        }
460
461        // Early return: quick check if any of the configured names might be in content
462        let content_lower = content.to_lowercase();
463        let has_potential_matches = self.config.names.iter().any(|name| {
464            let name_lower = name.to_lowercase();
465            let name_no_dots = name_lower.replace('.', "");
466
467            // Check direct match
468            if content_lower.contains(&name_lower) || content_lower.contains(&name_no_dots) {
469                return true;
470            }
471
472            // Also check ASCII-normalized version
473            let ascii_normalized = Self::ascii_normalize(&name_lower);
474
475            if ascii_normalized != name_lower {
476                if content_lower.contains(&ascii_normalized) {
477                    return true;
478                }
479                let ascii_no_dots = ascii_normalized.replace('.', "");
480                if ascii_normalized != ascii_no_dots && content_lower.contains(&ascii_no_dots) {
481                    return true;
482                }
483            }
484
485            false
486        });
487
488        if !has_potential_matches {
489            return Ok(Vec::new());
490        }
491
492        let line_index = &ctx.line_index;
493        let violations = self.find_name_violations(content, ctx);
494
495        let warnings = violations
496            .into_iter()
497            .filter_map(|(line, column, found_name)| {
498                self.get_proper_name_for(&found_name).map(|proper_name| LintWarning {
499                    rule_name: Some(self.name().to_string()),
500                    line,
501                    column,
502                    end_line: line,
503                    end_column: column + found_name.len(),
504                    message: format!("Proper name '{found_name}' should be '{proper_name}'"),
505                    severity: Severity::Warning,
506                    fix: Some(Fix {
507                        range: line_index.line_col_to_byte_range(line, column),
508                        replacement: proper_name,
509                    }),
510                })
511            })
512            .collect();
513
514        Ok(warnings)
515    }
516
517    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
518        let content = ctx.content;
519        if content.is_empty() || self.config.names.is_empty() {
520            return Ok(content.to_string());
521        }
522
523        let violations = self.find_name_violations(content, ctx);
524        if violations.is_empty() {
525            return Ok(content.to_string());
526        }
527
528        // Process lines and build the fixed content
529        let mut fixed_lines = Vec::new();
530
531        // Group violations by line
532        let mut violations_by_line: HashMap<usize, Vec<(usize, String)>> = HashMap::new();
533        for (line_num, col_num, found_name) in violations {
534            violations_by_line
535                .entry(line_num)
536                .or_default()
537                .push((col_num, found_name));
538        }
539
540        // Sort violations within each line in reverse order
541        for violations in violations_by_line.values_mut() {
542            violations.sort_by_key(|b| std::cmp::Reverse(b.0));
543        }
544
545        // Process each line
546        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
547            let line_num = line_idx + 1;
548
549            if let Some(line_violations) = violations_by_line.get(&line_num) {
550                // This line has violations, fix them
551                let mut fixed_line = line_info.content(ctx.content).to_string();
552
553                for (col_num, found_name) in line_violations {
554                    if let Some(proper_name) = self.get_proper_name_for(found_name) {
555                        let start_col = col_num - 1; // Convert to 0-based
556                        let end_col = start_col + found_name.len();
557
558                        if end_col <= fixed_line.len()
559                            && fixed_line.is_char_boundary(start_col)
560                            && fixed_line.is_char_boundary(end_col)
561                        {
562                            fixed_line.replace_range(start_col..end_col, &proper_name);
563                        }
564                    }
565                }
566
567                fixed_lines.push(fixed_line);
568            } else {
569                // No violations on this line, keep it as is
570                fixed_lines.push(line_info.content(ctx.content).to_string());
571            }
572        }
573
574        // Join lines with newlines, preserving the original ending
575        let mut result = fixed_lines.join("\n");
576        if content.ends_with('\n') && !result.ends_with('\n') {
577            result.push('\n');
578        }
579        Ok(result)
580    }
581
    /// Exposes the rule as `&dyn Any`, which allows callers to downcast it
    /// back to the concrete type.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
585
586    fn default_config_section(&self) -> Option<(String, toml::Value)> {
587        let json_value = serde_json::to_value(&self.config).ok()?;
588        Some((
589            self.name().to_string(),
590            crate::rule_config_serde::json_to_toml_value(&json_value)?,
591        ))
592    }
593
594    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
595    where
596        Self: Sized,
597    {
598        let rule_config = crate::rule_config_serde::load_rule_config::<MD044Config>(config);
599        Box::new(Self::from_config_struct(rule_config))
600    }
601}
602
603#[cfg(test)]
604mod tests {
605    use super::*;
606    use crate::lint_context::LintContext;
607
608    fn create_context(content: &str) -> LintContext<'_> {
609        LintContext::new(content, crate::config::MarkdownFlavor::Standard, None)
610    }
611
612    #[test]
613    fn test_correctly_capitalized_names() {
614        let rule = MD044ProperNames::new(
615            vec![
616                "JavaScript".to_string(),
617                "TypeScript".to_string(),
618                "Node.js".to_string(),
619            ],
620            true,
621        );
622
623        let content = "This document uses JavaScript, TypeScript, and Node.js correctly.";
624        let ctx = create_context(content);
625        let result = rule.check(&ctx).unwrap();
626        assert!(result.is_empty(), "Should not flag correctly capitalized names");
627    }
628
629    #[test]
630    fn test_incorrectly_capitalized_names() {
631        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "TypeScript".to_string()], true);
632
633        let content = "This document uses javascript and typescript incorrectly.";
634        let ctx = create_context(content);
635        let result = rule.check(&ctx).unwrap();
636
637        assert_eq!(result.len(), 2, "Should flag two incorrect capitalizations");
638        assert_eq!(result[0].message, "Proper name 'javascript' should be 'JavaScript'");
639        assert_eq!(result[0].line, 1);
640        assert_eq!(result[0].column, 20);
641        assert_eq!(result[1].message, "Proper name 'typescript' should be 'TypeScript'");
642        assert_eq!(result[1].line, 1);
643        assert_eq!(result[1].column, 35);
644    }
645
646    #[test]
647    fn test_names_at_beginning_of_sentences() {
648        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "Python".to_string()], true);
649
650        let content = "javascript is a great language. python is also popular.";
651        let ctx = create_context(content);
652        let result = rule.check(&ctx).unwrap();
653
654        assert_eq!(result.len(), 2, "Should flag names at beginning of sentences");
655        assert_eq!(result[0].line, 1);
656        assert_eq!(result[0].column, 1);
657        assert_eq!(result[1].line, 1);
658        assert_eq!(result[1].column, 33);
659    }
660
661    #[test]
662    fn test_names_in_code_blocks_checked_by_default() {
663        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
664
665        let content = r#"Here is some text with JavaScript.
666
667```javascript
668// This javascript should be checked
669const lang = "javascript";
670```
671
672But this javascript should be flagged."#;
673
674        let ctx = create_context(content);
675        let result = rule.check(&ctx).unwrap();
676
677        assert_eq!(result.len(), 3, "Should flag javascript inside and outside code blocks");
678        assert_eq!(result[0].line, 4);
679        assert_eq!(result[1].line, 5);
680        assert_eq!(result[2].line, 8);
681    }
682
683    #[test]
684    fn test_names_in_code_blocks_ignored_when_disabled() {
685        let rule = MD044ProperNames::new(
686            vec!["JavaScript".to_string()],
687            false, // code_blocks = false means skip code blocks
688        );
689
690        let content = r#"```
691javascript in code block
692```"#;
693
694        let ctx = create_context(content);
695        let result = rule.check(&ctx).unwrap();
696
697        assert_eq!(
698            result.len(),
699            0,
700            "Should not flag javascript in code blocks when code_blocks is false"
701        );
702    }
703
704    #[test]
705    fn test_names_in_inline_code_checked_by_default() {
706        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
707
708        let content = "This is `javascript` in inline code and javascript outside.";
709        let ctx = create_context(content);
710        let result = rule.check(&ctx).unwrap();
711
712        // When code_blocks=true, inline code should be checked
713        assert_eq!(result.len(), 2, "Should flag javascript inside and outside inline code");
714        assert_eq!(result[0].column, 10); // javascript in inline code
715        assert_eq!(result[1].column, 41); // javascript outside
716    }
717
718    #[test]
719    fn test_multiple_names_in_same_line() {
720        let rule = MD044ProperNames::new(
721            vec!["JavaScript".to_string(), "TypeScript".to_string(), "React".to_string()],
722            true,
723        );
724
725        let content = "I use javascript, typescript, and react in my projects.";
726        let ctx = create_context(content);
727        let result = rule.check(&ctx).unwrap();
728
729        assert_eq!(result.len(), 3, "Should flag all three incorrect names");
730        assert_eq!(result[0].message, "Proper name 'javascript' should be 'JavaScript'");
731        assert_eq!(result[1].message, "Proper name 'typescript' should be 'TypeScript'");
732        assert_eq!(result[2].message, "Proper name 'react' should be 'React'");
733    }
734
735    #[test]
736    fn test_case_sensitivity() {
737        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
738
739        let content = "JAVASCRIPT, Javascript, javascript, and JavaScript variations.";
740        let ctx = create_context(content);
741        let result = rule.check(&ctx).unwrap();
742
743        assert_eq!(result.len(), 3, "Should flag all incorrect case variations");
744        // JavaScript (correct) should not be flagged
745        assert!(result.iter().all(|w| w.message.contains("should be 'JavaScript'")));
746    }
747
748    #[test]
749    fn test_configuration_with_custom_name_list() {
750        let config = MD044Config {
751            names: vec!["GitHub".to_string(), "GitLab".to_string(), "DevOps".to_string()],
752            code_blocks: true,
753            html_elements: true,
754            html_comments: true,
755        };
756        let rule = MD044ProperNames::from_config_struct(config);
757
758        let content = "We use github, gitlab, and devops for our workflow.";
759        let ctx = create_context(content);
760        let result = rule.check(&ctx).unwrap();
761
762        assert_eq!(result.len(), 3, "Should flag all custom names");
763        assert_eq!(result[0].message, "Proper name 'github' should be 'GitHub'");
764        assert_eq!(result[1].message, "Proper name 'gitlab' should be 'GitLab'");
765        assert_eq!(result[2].message, "Proper name 'devops' should be 'DevOps'");
766    }
767
768    #[test]
769    fn test_empty_configuration() {
770        let rule = MD044ProperNames::new(vec![], true);
771
772        let content = "This has javascript and typescript but no configured names.";
773        let ctx = create_context(content);
774        let result = rule.check(&ctx).unwrap();
775
776        assert!(result.is_empty(), "Should not flag anything with empty configuration");
777    }
778
779    #[test]
780    fn test_names_with_special_characters() {
781        let rule = MD044ProperNames::new(
782            vec!["Node.js".to_string(), "ASP.NET".to_string(), "C++".to_string()],
783            true,
784        );
785
786        let content = "We use nodejs, asp.net, ASP.NET, and c++ in our stack.";
787        let ctx = create_context(content);
788        let result = rule.check(&ctx).unwrap();
789
790        // nodejs should match Node.js (dotless variation)
791        // asp.net should be flagged (wrong case)
792        // ASP.NET should not be flagged (correct)
793        // c++ should be flagged
794        assert_eq!(result.len(), 3, "Should handle special characters correctly");
795
796        let messages: Vec<&str> = result.iter().map(|w| w.message.as_str()).collect();
797        assert!(messages.contains(&"Proper name 'nodejs' should be 'Node.js'"));
798        assert!(messages.contains(&"Proper name 'asp.net' should be 'ASP.NET'"));
799        assert!(messages.contains(&"Proper name 'c++' should be 'C++'"));
800    }
801
802    #[test]
803    fn test_word_boundaries() {
804        let rule = MD044ProperNames::new(vec!["Java".to_string(), "Script".to_string()], true);
805
806        let content = "JavaScript is not java or script, but Java and Script are separate.";
807        let ctx = create_context(content);
808        let result = rule.check(&ctx).unwrap();
809
810        // Should only flag lowercase "java" and "script" as separate words
811        assert_eq!(result.len(), 2, "Should respect word boundaries");
812        assert!(result.iter().any(|w| w.column == 19)); // "java" position
813        assert!(result.iter().any(|w| w.column == 27)); // "script" position
814    }
815
816    #[test]
817    fn test_fix_method() {
818        let rule = MD044ProperNames::new(
819            vec![
820                "JavaScript".to_string(),
821                "TypeScript".to_string(),
822                "Node.js".to_string(),
823            ],
824            true,
825        );
826
827        let content = "I love javascript, typescript, and nodejs!";
828        let ctx = create_context(content);
829        let fixed = rule.fix(&ctx).unwrap();
830
831        assert_eq!(fixed, "I love JavaScript, TypeScript, and Node.js!");
832    }
833
834    #[test]
835    fn test_fix_multiple_occurrences() {
836        let rule = MD044ProperNames::new(vec!["Python".to_string()], true);
837
838        let content = "python is great. I use python daily. PYTHON is powerful.";
839        let ctx = create_context(content);
840        let fixed = rule.fix(&ctx).unwrap();
841
842        assert_eq!(fixed, "Python is great. I use Python daily. Python is powerful.");
843    }
844
845    #[test]
846    fn test_fix_checks_code_blocks_by_default() {
847        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
848
849        let content = r#"I love javascript.
850
851```
852const lang = "javascript";
853```
854
855More javascript here."#;
856
857        let ctx = create_context(content);
858        let fixed = rule.fix(&ctx).unwrap();
859
860        let expected = r#"I love JavaScript.
861
862```
863const lang = "JavaScript";
864```
865
866More JavaScript here."#;
867
868        assert_eq!(fixed, expected);
869    }
870
871    #[test]
872    fn test_multiline_content() {
873        let rule = MD044ProperNames::new(vec!["Rust".to_string(), "Python".to_string()], true);
874
875        let content = r#"First line with rust.
876Second line with python.
877Third line with RUST and PYTHON."#;
878
879        let ctx = create_context(content);
880        let result = rule.check(&ctx).unwrap();
881
882        assert_eq!(result.len(), 4, "Should flag all incorrect occurrences");
883        assert_eq!(result[0].line, 1);
884        assert_eq!(result[1].line, 2);
885        assert_eq!(result[2].line, 3);
886        assert_eq!(result[3].line, 3);
887    }
888
889    #[test]
890    fn test_default_config() {
891        let config = MD044Config::default();
892        assert!(config.names.is_empty());
893        assert!(!config.code_blocks); // Default is false (skip code blocks)
894    }
895
896    #[test]
897    fn test_performance_with_many_names() {
898        let mut names = vec![];
899        for i in 0..50 {
900            names.push(format!("ProperName{i}"));
901        }
902
903        let rule = MD044ProperNames::new(names, true);
904
905        let content = "This has propername0, propername25, and propername49 incorrectly.";
906        let ctx = create_context(content);
907        let result = rule.check(&ctx).unwrap();
908
909        assert_eq!(result.len(), 3, "Should handle many configured names efficiently");
910    }
911
912    #[test]
913    fn test_large_name_count_performance() {
914        // Verify MD044 can handle large numbers of names without regex limitations
915        // This test confirms that fancy-regex handles large patterns well
916        let names = (0..1000).map(|i| format!("ProperName{i}")).collect::<Vec<_>>();
917
918        let rule = MD044ProperNames::new(names, true);
919
920        // The combined pattern should be created successfully
921        assert!(rule.combined_pattern.is_some());
922
923        // Should be able to check content without errors
924        let content = "This has propername0 and propername999 in it.";
925        let ctx = create_context(content);
926        let result = rule.check(&ctx).unwrap();
927
928        // Should detect both incorrect names
929        assert_eq!(result.len(), 2, "Should handle 1000 names without issues");
930    }
931
932    #[test]
933    fn test_cache_behavior() {
934        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
935
936        let content = "Using javascript here.";
937        let ctx = create_context(content);
938
939        // First check
940        let result1 = rule.check(&ctx).unwrap();
941        assert_eq!(result1.len(), 1);
942
943        // Second check should use cache
944        let result2 = rule.check(&ctx).unwrap();
945        assert_eq!(result2.len(), 1);
946
947        // Results should be identical
948        assert_eq!(result1[0].line, result2[0].line);
949        assert_eq!(result1[0].column, result2[0].column);
950    }
951
952    #[test]
953    fn test_html_comments_not_checked_when_disabled() {
954        let config = MD044Config {
955            names: vec!["JavaScript".to_string()],
956            code_blocks: true,    // Check code blocks
957            html_elements: true,  // Check HTML elements
958            html_comments: false, // Don't check HTML comments
959        };
960        let rule = MD044ProperNames::from_config_struct(config);
961
962        let content = r#"Regular javascript here.
963<!-- This javascript in HTML comment should be ignored -->
964More javascript outside."#;
965
966        let ctx = create_context(content);
967        let result = rule.check(&ctx).unwrap();
968
969        assert_eq!(result.len(), 2, "Should only flag javascript outside HTML comments");
970        assert_eq!(result[0].line, 1);
971        assert_eq!(result[1].line, 3);
972    }
973
974    #[test]
975    fn test_html_comments_checked_when_enabled() {
976        let config = MD044Config {
977            names: vec!["JavaScript".to_string()],
978            code_blocks: true,   // Check code blocks
979            html_elements: true, // Check HTML elements
980            html_comments: true, // Check HTML comments
981        };
982        let rule = MD044ProperNames::from_config_struct(config);
983
984        let content = r#"Regular javascript here.
985<!-- This javascript in HTML comment should be checked -->
986More javascript outside."#;
987
988        let ctx = create_context(content);
989        let result = rule.check(&ctx).unwrap();
990
991        assert_eq!(
992            result.len(),
993            3,
994            "Should flag all javascript occurrences including in HTML comments"
995        );
996    }
997
998    #[test]
999    fn test_multiline_html_comments() {
1000        let config = MD044Config {
1001            names: vec!["Python".to_string(), "JavaScript".to_string()],
1002            code_blocks: true,    // Check code blocks
1003            html_elements: true,  // Check HTML elements
1004            html_comments: false, // Don't check HTML comments
1005        };
1006        let rule = MD044ProperNames::from_config_struct(config);
1007
1008        let content = r#"Regular python here.
1009<!--
1010This is a multiline comment
1011with javascript and python
1012that should be ignored
1013-->
1014More javascript outside."#;
1015
1016        let ctx = create_context(content);
1017        let result = rule.check(&ctx).unwrap();
1018
1019        assert_eq!(result.len(), 2, "Should only flag names outside HTML comments");
1020        assert_eq!(result[0].line, 1); // python
1021        assert_eq!(result[1].line, 7); // javascript
1022    }
1023
1024    #[test]
1025    fn test_fix_preserves_html_comments_when_disabled() {
1026        let config = MD044Config {
1027            names: vec!["JavaScript".to_string()],
1028            code_blocks: true,    // Check code blocks
1029            html_elements: true,  // Check HTML elements
1030            html_comments: false, // Don't check HTML comments
1031        };
1032        let rule = MD044ProperNames::from_config_struct(config);
1033
1034        let content = r#"javascript here.
1035<!-- javascript in comment -->
1036More javascript."#;
1037
1038        let ctx = create_context(content);
1039        let fixed = rule.fix(&ctx).unwrap();
1040
1041        let expected = r#"JavaScript here.
1042<!-- javascript in comment -->
1043More JavaScript."#;
1044
1045        assert_eq!(
1046            fixed, expected,
1047            "Should not fix names inside HTML comments when disabled"
1048        );
1049    }
1050
1051    #[test]
1052    fn test_proper_names_in_links_not_flagged() {
1053        let rule = MD044ProperNames::new(
1054            vec!["JavaScript".to_string(), "Node.js".to_string(), "Python".to_string()],
1055            true,
1056        );
1057
1058        let content = r#"Check this [javascript documentation](https://javascript.info) for info.
1059
1060Visit [node.js homepage](https://nodejs.org) and [python tutorial](https://python.org).
1061
1062Real javascript should be flagged.
1063
1064Also see the [typescript guide][ts-ref] for more.
1065
1066Real python should be flagged too.
1067
1068[ts-ref]: https://typescript.org/handbook"#;
1069
1070        let ctx = create_context(content);
1071        let result = rule.check(&ctx).unwrap();
1072
1073        // Only the real standalone proper names should be flagged
1074        assert_eq!(
1075            result.len(),
1076            2,
1077            "Expected exactly 2 warnings for standalone proper names"
1078        );
1079        assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
1080        assert!(result[1].message.contains("'python' should be 'Python'"));
1081        // Should be on lines with standalone instances
1082        assert!(result[0].line == 5); // "Real javascript should be flagged."
1083        assert!(result[1].line == 9); // "Real python should be flagged too."
1084    }
1085
1086    #[test]
1087    fn test_proper_names_in_images_not_flagged() {
1088        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1089
1090        let content = r#"Here is a ![javascript logo](javascript.png "javascript icon") image.
1091
1092Real javascript should be flagged."#;
1093
1094        let ctx = create_context(content);
1095        let result = rule.check(&ctx).unwrap();
1096
1097        // Only the standalone proper name should be flagged
1098        assert_eq!(result.len(), 1, "Expected exactly 1 warning for standalone proper name");
1099        assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
1100        assert!(result[0].line == 3); // "Real javascript should be flagged."
1101    }
1102
1103    #[test]
1104    fn test_proper_names_in_reference_definitions_not_flagged() {
1105        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "TypeScript".to_string()], true);
1106
1107        let content = r#"Check the [javascript guide][js-ref] for details.
1108
1109Real javascript should be flagged.
1110
1111[js-ref]: https://javascript.info/typescript/guide"#;
1112
1113        let ctx = create_context(content);
1114        let result = rule.check(&ctx).unwrap();
1115
1116        // Only the standalone proper name should be flagged
1117        assert_eq!(result.len(), 1, "Expected exactly 1 warning for standalone proper name");
1118        assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
1119        assert!(result[0].line == 3); // "Real javascript should be flagged."
1120    }
1121}
rumdl_lib/rules/md044_proper_names.rs

rumdl_lib/rules/
md044_proper_names.rs