rumdl_lib/rules/
md044_proper_names.rs

1use crate::utils::fast_hash;
2use crate::utils::regex_cache::{escape_regex, get_cached_fancy_regex};
3
4use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, Severity};
5use fancy_regex::Regex;
6use std::collections::HashMap;
7use std::sync::LazyLock;
8use std::sync::{Arc, Mutex};
9
10mod md044_config;
11use md044_config::MD044Config;
12
/// Matches an HTML comment: `<!--`, then the shortest run of any characters
/// (including newlines, via `[\s\S]`), then the first following `-->`.
static HTML_COMMENT_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<!--([\s\S]*?)-->").unwrap());
/// Matches a reference definition of the form `[ref]: url "title"`.
///
/// In multi-line mode: up to three leading spaces, a bracketed label followed by `:`,
/// a destination made of non-whitespace characters, and an optional title wrapped in
/// double or single quotes.
static REF_DEF_REGEX: LazyLock<regex::Regex> = LazyLock::new(|| {
    regex::Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap()
});

/// Location of one violation: `(line, column, found_name)`.
///
/// As produced in this file, `line` is 1-based and `column` is the 1-based byte offset
/// of the match within its line; `found_name` is the text exactly as written in the document.
type WarningPosition = (usize, usize, String); // (line, column, found_name)
20
/// Rule MD044: Proper names should be capitalized
///
/// See [docs/md044.md](../../docs/md044.md) for full documentation, configuration, and examples.
///
/// This rule is triggered when proper names are not capitalized correctly in the document.
/// For example, if you have defined "JavaScript" as a proper name, the rule will flag any
/// occurrences of "javascript" or "Javascript" as violations.
///
/// ## Purpose
///
/// Ensuring consistent capitalization of proper names improves document quality and
/// professionalism. This is especially important for technical documentation where
/// product names, programming languages, and technologies often have specific
/// capitalization conventions.
///
/// ## Configuration Options
///
/// The rule supports the following configuration options:
///
/// ```yaml
/// MD044:
///   names: []                # List of proper names to check for correct capitalization
///   code-blocks: false       # Whether to check code blocks (default: false)
/// ```
///
/// Example configuration:
///
/// ```yaml
/// MD044:
///   names: ["JavaScript", "Node.js", "TypeScript"]
///   code-blocks: true
/// ```
///
/// The configuration struct additionally carries `html_elements` and `html_comments`
/// flags (both `true` when the rule is built through `new`). When a flag is `false`,
/// lines inside HTML blocks / lines starting inside an HTML comment are not checked.
/// NOTE(review): the user-facing key spelling for these two flags is defined in
/// `md044_config` and is not visible from this file.
///
/// ## Performance Optimizations
///
/// This rule implements several performance optimizations:
///
/// 1. **Regex Caching**: Builds a single case-insensitive alternation covering every
///    variant of every configured name and obtains the compiled regex from a shared cache
/// 2. **Content Caching**: Caches results based on content hashing for repeated checks
/// 3. **Efficient Text Processing**: Uses cheap substring pre-checks to skip documents and
///    lines that cannot contain a configured name
/// 4. **Smart Code Block Detection**: Efficiently identifies and optionally excludes code blocks
///
/// ## Edge Cases Handled
///
/// - **Word Boundaries**: Only matches complete words, not substrings within other words
///   (a boundary is any non-alphanumeric character or the start/end of the line)
/// - **Case Sensitivity**: Properly handles case-specific matching
/// - **Code Blocks**: Optionally checks code blocks (controlled by code-blocks setting)
/// - **Markdown Formatting**: Handles proper names within Markdown formatting elements
/// - **Links**: Matches located inside links, images, or reference definitions are ignored
///
/// ## Fix Behavior
///
/// When fixing issues, this rule replaces incorrect capitalization with the correct form
/// as defined in the configuration.
///
#[derive(Clone)]
pub struct MD044ProperNames {
    /// Rule settings: the proper names plus the flags selecting which regions are checked.
    config: MD044Config,
    /// Source text of the combined regex pattern; `None` when no names are configured.
    combined_pattern: Option<String>,
    /// Cache of name violations keyed by content hash. Because it sits behind an `Arc`,
    /// clones of this rule share the same cache.
    content_cache: Arc<Mutex<HashMap<u64, Vec<WarningPosition>>>>,
}
83
84impl MD044ProperNames {
85    pub fn new(names: Vec<String>, code_blocks: bool) -> Self {
86        let config = MD044Config {
87            names,
88            code_blocks,
89            html_elements: true, // Default to checking HTML elements
90            html_comments: true, // Default to checking HTML comments
91        };
92        let combined_pattern = Self::create_combined_pattern(&config);
93        Self {
94            config,
95            combined_pattern,
96            content_cache: Arc::new(Mutex::new(HashMap::new())),
97        }
98    }
99
100    // Helper function for consistent ASCII normalization
101    fn ascii_normalize(s: &str) -> String {
102        s.replace(['é', 'è', 'ê', 'ë'], "e")
103            .replace(['à', 'á', 'â', 'ä', 'ã', 'å'], "a")
104            .replace(['ï', 'î', 'í', 'ì'], "i")
105            .replace(['ü', 'ú', 'ù', 'û'], "u")
106            .replace(['ö', 'ó', 'ò', 'ô', 'õ'], "o")
107            .replace('ñ', "n")
108            .replace('ç', "c")
109    }
110
111    pub fn from_config_struct(config: MD044Config) -> Self {
112        let combined_pattern = Self::create_combined_pattern(&config);
113        Self {
114            config,
115            combined_pattern,
116            content_cache: Arc::new(Mutex::new(HashMap::new())),
117        }
118    }
119
120    // Create a combined regex pattern for all proper names
121    fn create_combined_pattern(config: &MD044Config) -> Option<String> {
122        if config.names.is_empty() {
123            return None;
124        }
125
126        // Create patterns for all names and their variations
127        let mut patterns: Vec<String> = config
128            .names
129            .iter()
130            .flat_map(|name| {
131                let mut variations = vec![];
132                let lower_name = name.to_lowercase();
133
134                // Add the lowercase version
135                variations.push(escape_regex(&lower_name));
136
137                // Add version without dots
138                let lower_name_no_dots = lower_name.replace('.', "");
139                if lower_name != lower_name_no_dots {
140                    variations.push(escape_regex(&lower_name_no_dots));
141                }
142
143                // Add ASCII-normalized versions for common accented characters
144                let ascii_normalized = Self::ascii_normalize(&lower_name);
145
146                if ascii_normalized != lower_name {
147                    variations.push(escape_regex(&ascii_normalized));
148
149                    // Also add version without dots
150                    let ascii_no_dots = ascii_normalized.replace('.', "");
151                    if ascii_normalized != ascii_no_dots {
152                        variations.push(escape_regex(&ascii_no_dots));
153                    }
154                }
155
156                variations
157            })
158            .collect();
159
160        // Sort patterns by length (longest first) to avoid shorter patterns matching within longer ones
161        patterns.sort_by_key(|b| std::cmp::Reverse(b.len()));
162
163        // Combine all patterns into a single regex with capture groups
164        // Don't use \b as it doesn't work with Unicode - we'll check boundaries manually
165        Some(format!(r"(?i)({})", patterns.join("|")))
166    }
167
168    // Find all name violations in the content and return positions
169    fn find_name_violations(&self, content: &str, ctx: &crate::lint_context::LintContext) -> Vec<WarningPosition> {
170        // Early return: if no names configured or content is empty
171        if self.config.names.is_empty() || content.is_empty() || self.combined_pattern.is_none() {
172            return Vec::new();
173        }
174
175        // Early return: quick check if any of the configured names might be in content
176        let content_lower = content.to_lowercase();
177        let has_potential_matches = self.config.names.iter().any(|name| {
178            let name_lower = name.to_lowercase();
179            let name_no_dots = name_lower.replace('.', "");
180
181            // Check direct match
182            if content_lower.contains(&name_lower) || content_lower.contains(&name_no_dots) {
183                return true;
184            }
185
186            // Also check ASCII-normalized version
187            let ascii_normalized = Self::ascii_normalize(&name_lower);
188
189            if ascii_normalized != name_lower {
190                if content_lower.contains(&ascii_normalized) {
191                    return true;
192                }
193                let ascii_no_dots = ascii_normalized.replace('.', "");
194                if ascii_normalized != ascii_no_dots && content_lower.contains(&ascii_no_dots) {
195                    return true;
196                }
197            }
198
199            false
200        });
201
202        if !has_potential_matches {
203            return Vec::new();
204        }
205
206        // Check if we have cached results
207        let hash = fast_hash(content);
208        {
209            // Use a separate scope for borrowing to minimize lock time
210            if let Ok(cache) = self.content_cache.lock()
211                && let Some(cached) = cache.get(&hash)
212            {
213                return cached.clone();
214            }
215        }
216
217        let mut violations = Vec::new();
218
219        // Get the regex from global cache
220        let combined_regex = match &self.combined_pattern {
221            Some(pattern) => match get_cached_fancy_regex(pattern) {
222                Ok(regex) => regex,
223                Err(_) => return Vec::new(),
224            },
225            None => return Vec::new(),
226        };
227
228        // Use ctx.lines for better performance
229        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
230            let line_num = line_idx + 1;
231            let line = line_info.content(ctx.content);
232
233            // Skip code fence lines (```language or ~~~language)
234            let trimmed = line.trim_start();
235            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
236                continue;
237            }
238
239            // Skip if in code block (when code_blocks = false)
240            if !self.config.code_blocks && line_info.in_code_block {
241                continue;
242            }
243
244            // Skip if in HTML block (when html_elements = false)
245            if !self.config.html_elements && line_info.in_html_block {
246                continue;
247            }
248
249            // Check if we should skip HTML comments
250            let in_html_comment = if !self.config.html_comments {
251                // Check if this position is within an HTML comment
252                self.is_in_html_comment(content, line_info.byte_offset)
253            } else {
254                false
255            };
256
257            if in_html_comment {
258                continue;
259            }
260
261            // Early return: skip lines that don't contain any potential matches
262            let line_lower = line.to_lowercase();
263            let has_line_matches = self.config.names.iter().any(|name| {
264                let name_lower = name.to_lowercase();
265                let name_no_dots = name_lower.replace('.', "");
266
267                // Check direct match
268                if line_lower.contains(&name_lower) || line_lower.contains(&name_no_dots) {
269                    return true;
270                }
271
272                // Also check ASCII-normalized version
273                let ascii_normalized = Self::ascii_normalize(&name_lower);
274                if ascii_normalized != name_lower {
275                    if line_lower.contains(&ascii_normalized) {
276                        return true;
277                    }
278                    let ascii_no_dots = ascii_normalized.replace('.', "");
279                    if ascii_normalized != ascii_no_dots && line_lower.contains(&ascii_no_dots) {
280                        return true;
281                    }
282                }
283
284                false
285            });
286
287            if !has_line_matches {
288                continue;
289            }
290
291            // Use the combined regex to find all matches in one pass
292            for cap_result in combined_regex.find_iter(line) {
293                match cap_result {
294                    Ok(cap) => {
295                        let found_name = &line[cap.start()..cap.end()];
296
297                        // Check word boundaries manually for Unicode support
298                        let start_pos = cap.start();
299                        let end_pos = cap.end();
300
301                        if !self.is_at_word_boundary(line, start_pos, true)
302                            || !self.is_at_word_boundary(line, end_pos, false)
303                        {
304                            continue; // Not at word boundary
305                        }
306
307                        // Skip if in inline code when code_blocks is false
308                        if !self.config.code_blocks {
309                            let byte_pos = line_info.byte_offset + cap.start();
310                            if ctx.is_in_code_block_or_span(byte_pos) {
311                                continue;
312                            }
313                        }
314
315                        // Skip if in link (inline links, reference links, or reference definitions)
316                        let byte_pos = line_info.byte_offset + cap.start();
317                        if self.is_in_link(ctx, byte_pos) {
318                            continue;
319                        }
320
321                        // Find which proper name this matches
322                        if let Some(proper_name) = self.get_proper_name_for(found_name) {
323                            // Only flag if it's not already correct
324                            if found_name != proper_name {
325                                violations.push((line_num, cap.start() + 1, found_name.to_string()));
326                            }
327                        }
328                    }
329                    Err(e) => {
330                        eprintln!("Regex execution error on line {line_num}: {e}");
331                    }
332                }
333            }
334        }
335
336        // Store in cache (ignore if mutex is poisoned)
337        if let Ok(mut cache) = self.content_cache.lock() {
338            cache.insert(hash, violations.clone());
339        }
340        violations
341    }
342
343    // Check if a byte position is within an HTML comment
344    fn is_in_html_comment(&self, content: &str, byte_pos: usize) -> bool {
345        for m in HTML_COMMENT_REGEX.find_iter(content).flatten() {
346            if m.start() <= byte_pos && byte_pos < m.end() {
347                return true;
348            }
349        }
350        false
351    }
352
353    /// Check if a byte position is within a link (inline links, reference links, or reference definitions)
354    fn is_in_link(&self, ctx: &crate::lint_context::LintContext, byte_pos: usize) -> bool {
355        // Check inline and reference links
356        for link in &ctx.links {
357            if link.byte_offset <= byte_pos && byte_pos < link.byte_end {
358                return true;
359            }
360        }
361
362        // Check images (which use similar syntax)
363        for image in &ctx.images {
364            if image.byte_offset <= byte_pos && byte_pos < image.byte_end {
365                return true;
366            }
367        }
368
369        // Check reference definitions [ref]: url "title" using regex pattern
370        for m in REF_DEF_REGEX.find_iter(ctx.content) {
371            if m.start() <= byte_pos && byte_pos < m.end() {
372                return true;
373            }
374        }
375
376        false
377    }
378
    /// Returns `true` when `c` can delimit a word, i.e. when it is not alphanumeric.
    ///
    /// `char::is_alphanumeric` is Unicode-aware, so accented letters and non-Latin
    /// letters and digits count as word characters; punctuation, whitespace and `_` do not.
    fn is_word_boundary_char(c: char) -> bool {
        !c.is_alphanumeric()
    }
383
384    // Check if position is at a word boundary
385    fn is_at_word_boundary(&self, content: &str, pos: usize, is_start: bool) -> bool {
386        let chars: Vec<char> = content.chars().collect();
387        let char_indices: Vec<(usize, char)> = content.char_indices().collect();
388
389        // Find the character position
390        let char_pos = char_indices.iter().position(|(idx, _)| *idx == pos);
391        if char_pos.is_none() {
392            return true; // If we can't find position, assume boundary
393        }
394        let char_pos = char_pos.unwrap();
395
396        if is_start {
397            // Check character before position
398            if char_pos == 0 {
399                return true; // Start of string
400            }
401            Self::is_word_boundary_char(chars[char_pos - 1])
402        } else {
403            // Check character after position
404            if char_pos >= chars.len() {
405                return true; // End of string
406            }
407            Self::is_word_boundary_char(chars[char_pos])
408        }
409    }
410
411    // Get the proper name that should be used for a found name
412    fn get_proper_name_for(&self, found_name: &str) -> Option<String> {
413        let found_lower = found_name.to_lowercase();
414
415        // Iterate through the configured proper names
416        for name in &self.config.names {
417            let lower_name = name.to_lowercase();
418            let lower_name_no_dots = lower_name.replace('.', "");
419
420            // Direct match
421            if found_lower == lower_name || found_lower == lower_name_no_dots {
422                return Some(name.clone());
423            }
424
425            // Check ASCII-normalized version
426            let ascii_normalized = Self::ascii_normalize(&lower_name);
427
428            let ascii_no_dots = ascii_normalized.replace('.', "");
429
430            if found_lower == ascii_normalized || found_lower == ascii_no_dots {
431                return Some(name.clone());
432            }
433        }
434        None
435    }
436}
437
438impl Rule for MD044ProperNames {
    /// Returns the rule identifier, `"MD044"`.
    fn name(&self) -> &'static str {
        "MD044"
    }

    /// Returns the one-line, human-readable summary of what this rule enforces.
    fn description(&self) -> &'static str {
        "Proper names should have the correct capitalization"
    }
446
447    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
448        if self.config.names.is_empty() {
449            return true;
450        }
451        // Quick check if any configured names exist (case-insensitive)
452        let content_lower = ctx.content.to_lowercase();
453        !self
454            .config
455            .names
456            .iter()
457            .any(|name| content_lower.contains(&name.to_lowercase()))
458    }
459
460    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
461        let content = ctx.content;
462        if content.is_empty() || self.config.names.is_empty() || self.combined_pattern.is_none() {
463            return Ok(Vec::new());
464        }
465
466        // Early return: quick check if any of the configured names might be in content
467        let content_lower = content.to_lowercase();
468        let has_potential_matches = self.config.names.iter().any(|name| {
469            let name_lower = name.to_lowercase();
470            let name_no_dots = name_lower.replace('.', "");
471
472            // Check direct match
473            if content_lower.contains(&name_lower) || content_lower.contains(&name_no_dots) {
474                return true;
475            }
476
477            // Also check ASCII-normalized version
478            let ascii_normalized = Self::ascii_normalize(&name_lower);
479
480            if ascii_normalized != name_lower {
481                if content_lower.contains(&ascii_normalized) {
482                    return true;
483                }
484                let ascii_no_dots = ascii_normalized.replace('.', "");
485                if ascii_normalized != ascii_no_dots && content_lower.contains(&ascii_no_dots) {
486                    return true;
487                }
488            }
489
490            false
491        });
492
493        if !has_potential_matches {
494            return Ok(Vec::new());
495        }
496
497        let line_index = &ctx.line_index;
498        let violations = self.find_name_violations(content, ctx);
499
500        let warnings = violations
501            .into_iter()
502            .filter_map(|(line, column, found_name)| {
503                self.get_proper_name_for(&found_name).map(|proper_name| LintWarning {
504                    rule_name: Some(self.name().to_string()),
505                    line,
506                    column,
507                    end_line: line,
508                    end_column: column + found_name.len(),
509                    message: format!("Proper name '{found_name}' should be '{proper_name}'"),
510                    severity: Severity::Warning,
511                    fix: Some(Fix {
512                        range: line_index.line_col_to_byte_range(line, column),
513                        replacement: proper_name,
514                    }),
515                })
516            })
517            .collect();
518
519        Ok(warnings)
520    }
521
522    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
523        let content = ctx.content;
524        if content.is_empty() || self.config.names.is_empty() {
525            return Ok(content.to_string());
526        }
527
528        let violations = self.find_name_violations(content, ctx);
529        if violations.is_empty() {
530            return Ok(content.to_string());
531        }
532
533        // Process lines and build the fixed content
534        let mut fixed_lines = Vec::new();
535
536        // Group violations by line
537        let mut violations_by_line: HashMap<usize, Vec<(usize, String)>> = HashMap::new();
538        for (line_num, col_num, found_name) in violations {
539            violations_by_line
540                .entry(line_num)
541                .or_default()
542                .push((col_num, found_name));
543        }
544
545        // Sort violations within each line in reverse order
546        for violations in violations_by_line.values_mut() {
547            violations.sort_by_key(|b| std::cmp::Reverse(b.0));
548        }
549
550        // Process each line
551        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
552            let line_num = line_idx + 1;
553
554            if let Some(line_violations) = violations_by_line.get(&line_num) {
555                // This line has violations, fix them
556                let mut fixed_line = line_info.content(ctx.content).to_string();
557
558                for (col_num, found_name) in line_violations {
559                    if let Some(proper_name) = self.get_proper_name_for(found_name) {
560                        let start_col = col_num - 1; // Convert to 0-based
561                        let end_col = start_col + found_name.len();
562
563                        if end_col <= fixed_line.len()
564                            && fixed_line.is_char_boundary(start_col)
565                            && fixed_line.is_char_boundary(end_col)
566                        {
567                            fixed_line.replace_range(start_col..end_col, &proper_name);
568                        }
569                    }
570                }
571
572                fixed_lines.push(fixed_line);
573            } else {
574                // No violations on this line, keep it as is
575                fixed_lines.push(line_info.content(ctx.content).to_string());
576            }
577        }
578
579        // Join lines with newlines, preserving the original ending
580        let mut result = fixed_lines.join("\n");
581        if content.ends_with('\n') && !result.ends_with('\n') {
582            result.push('\n');
583        }
584        Ok(result)
585    }
586
    /// Exposes the rule as `&dyn Any` so callers can downcast to the concrete type.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
590
591    fn default_config_section(&self) -> Option<(String, toml::Value)> {
592        let json_value = serde_json::to_value(&self.config).ok()?;
593        Some((
594            self.name().to_string(),
595            crate::rule_config_serde::json_to_toml_value(&json_value)?,
596        ))
597    }
598
599    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
600    where
601        Self: Sized,
602    {
603        let rule_config = crate::rule_config_serde::load_rule_config::<MD044Config>(config);
604        Box::new(Self::from_config_struct(rule_config))
605    }
606}
607
608#[cfg(test)]
609mod tests {
610    use super::*;
611    use crate::lint_context::LintContext;
612
613    fn create_context(content: &str) -> LintContext<'_> {
614        LintContext::new(content, crate::config::MarkdownFlavor::Standard, None)
615    }
616
617    #[test]
618    fn test_correctly_capitalized_names() {
619        let rule = MD044ProperNames::new(
620            vec![
621                "JavaScript".to_string(),
622                "TypeScript".to_string(),
623                "Node.js".to_string(),
624            ],
625            true,
626        );
627
628        let content = "This document uses JavaScript, TypeScript, and Node.js correctly.";
629        let ctx = create_context(content);
630        let result = rule.check(&ctx).unwrap();
631        assert!(result.is_empty(), "Should not flag correctly capitalized names");
632    }
633
634    #[test]
635    fn test_incorrectly_capitalized_names() {
636        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "TypeScript".to_string()], true);
637
638        let content = "This document uses javascript and typescript incorrectly.";
639        let ctx = create_context(content);
640        let result = rule.check(&ctx).unwrap();
641
642        assert_eq!(result.len(), 2, "Should flag two incorrect capitalizations");
643        assert_eq!(result[0].message, "Proper name 'javascript' should be 'JavaScript'");
644        assert_eq!(result[0].line, 1);
645        assert_eq!(result[0].column, 20);
646        assert_eq!(result[1].message, "Proper name 'typescript' should be 'TypeScript'");
647        assert_eq!(result[1].line, 1);
648        assert_eq!(result[1].column, 35);
649    }
650
651    #[test]
652    fn test_names_at_beginning_of_sentences() {
653        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "Python".to_string()], true);
654
655        let content = "javascript is a great language. python is also popular.";
656        let ctx = create_context(content);
657        let result = rule.check(&ctx).unwrap();
658
659        assert_eq!(result.len(), 2, "Should flag names at beginning of sentences");
660        assert_eq!(result[0].line, 1);
661        assert_eq!(result[0].column, 1);
662        assert_eq!(result[1].line, 1);
663        assert_eq!(result[1].column, 33);
664    }
665
666    #[test]
667    fn test_names_in_code_blocks_checked_by_default() {
668        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
669
670        let content = r#"Here is some text with JavaScript.
671
672```javascript
673// This javascript should be checked
674const lang = "javascript";
675```
676
677But this javascript should be flagged."#;
678
679        let ctx = create_context(content);
680        let result = rule.check(&ctx).unwrap();
681
682        assert_eq!(result.len(), 3, "Should flag javascript inside and outside code blocks");
683        assert_eq!(result[0].line, 4);
684        assert_eq!(result[1].line, 5);
685        assert_eq!(result[2].line, 8);
686    }
687
688    #[test]
689    fn test_names_in_code_blocks_ignored_when_disabled() {
690        let rule = MD044ProperNames::new(
691            vec!["JavaScript".to_string()],
692            false, // code_blocks = false means skip code blocks
693        );
694
695        let content = r#"```
696javascript in code block
697```"#;
698
699        let ctx = create_context(content);
700        let result = rule.check(&ctx).unwrap();
701
702        assert_eq!(
703            result.len(),
704            0,
705            "Should not flag javascript in code blocks when code_blocks is false"
706        );
707    }
708
709    #[test]
710    fn test_names_in_inline_code_checked_by_default() {
711        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
712
713        let content = "This is `javascript` in inline code and javascript outside.";
714        let ctx = create_context(content);
715        let result = rule.check(&ctx).unwrap();
716
717        // When code_blocks=true, inline code should be checked
718        assert_eq!(result.len(), 2, "Should flag javascript inside and outside inline code");
719        assert_eq!(result[0].column, 10); // javascript in inline code
720        assert_eq!(result[1].column, 41); // javascript outside
721    }
722
723    #[test]
724    fn test_multiple_names_in_same_line() {
725        let rule = MD044ProperNames::new(
726            vec!["JavaScript".to_string(), "TypeScript".to_string(), "React".to_string()],
727            true,
728        );
729
730        let content = "I use javascript, typescript, and react in my projects.";
731        let ctx = create_context(content);
732        let result = rule.check(&ctx).unwrap();
733
734        assert_eq!(result.len(), 3, "Should flag all three incorrect names");
735        assert_eq!(result[0].message, "Proper name 'javascript' should be 'JavaScript'");
736        assert_eq!(result[1].message, "Proper name 'typescript' should be 'TypeScript'");
737        assert_eq!(result[2].message, "Proper name 'react' should be 'React'");
738    }
739
740    #[test]
741    fn test_case_sensitivity() {
742        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
743
744        let content = "JAVASCRIPT, Javascript, javascript, and JavaScript variations.";
745        let ctx = create_context(content);
746        let result = rule.check(&ctx).unwrap();
747
748        assert_eq!(result.len(), 3, "Should flag all incorrect case variations");
749        // JavaScript (correct) should not be flagged
750        assert!(result.iter().all(|w| w.message.contains("should be 'JavaScript'")));
751    }
752
753    #[test]
754    fn test_configuration_with_custom_name_list() {
755        let config = MD044Config {
756            names: vec!["GitHub".to_string(), "GitLab".to_string(), "DevOps".to_string()],
757            code_blocks: true,
758            html_elements: true,
759            html_comments: true,
760        };
761        let rule = MD044ProperNames::from_config_struct(config);
762
763        let content = "We use github, gitlab, and devops for our workflow.";
764        let ctx = create_context(content);
765        let result = rule.check(&ctx).unwrap();
766
767        assert_eq!(result.len(), 3, "Should flag all custom names");
768        assert_eq!(result[0].message, "Proper name 'github' should be 'GitHub'");
769        assert_eq!(result[1].message, "Proper name 'gitlab' should be 'GitLab'");
770        assert_eq!(result[2].message, "Proper name 'devops' should be 'DevOps'");
771    }
772
773    #[test]
774    fn test_empty_configuration() {
775        let rule = MD044ProperNames::new(vec![], true);
776
777        let content = "This has javascript and typescript but no configured names.";
778        let ctx = create_context(content);
779        let result = rule.check(&ctx).unwrap();
780
781        assert!(result.is_empty(), "Should not flag anything with empty configuration");
782    }
783
/// Proper names containing punctuation (`.` and `+`) are recognised, and only
/// the incorrectly written spellings produce warnings.
#[test]
fn test_names_with_special_characters() {
    let names: Vec<String> = ["Node.js", "ASP.NET", "C++"]
        .iter()
        .map(|name| name.to_string())
        .collect();
    let rule = MD044ProperNames::new(names, true);

    let ctx = create_context("We use nodejs, asp.net, ASP.NET, and c++ in our stack.");
    let warnings = rule.check(&ctx).unwrap();

    // Expected outcome for each word in the sentence:
    //   nodejs  -> flagged, matches Node.js as its dotless variation
    //   asp.net -> flagged, wrong case
    //   ASP.NET -> not flagged, already correct
    //   c++     -> flagged
    assert_eq!(warnings.len(), 3, "Should handle special characters correctly");

    // Order is not asserted here, only that each message is present.
    let reported = |expected: &str| warnings.iter().any(|w| w.message == expected);
    assert!(reported("Proper name 'nodejs' should be 'Node.js'"));
    assert!(reported("Proper name 'asp.net' should be 'ASP.NET'"));
    assert!(reported("Proper name 'c++' should be 'C++'"));
}
806
/// Configured names are matched as whole words: "JavaScript" must not trigger
/// the separate names "Java" and "Script", while standalone lowercase uses do.
#[test]
fn test_word_boundaries() {
    let names = vec![String::from("Java"), String::from("Script")];
    let rule = MD044ProperNames::new(names, true);

    let ctx = create_context("JavaScript is not java or script, but Java and Script are separate.");
    let warnings = rule.check(&ctx).unwrap();

    // Only the standalone lowercase "java" and "script" are expected.
    assert_eq!(warnings.len(), 2, "Should respect word boundaries");

    let columns: Vec<_> = warnings.iter().map(|w| w.column).collect();
    assert!(columns.contains(&19)); // column where "java" starts
    assert!(columns.contains(&27)); // column where "script" starts
}
820
/// `fix` rewrites every miscased name in a sentence to its configured
/// spelling, including the dotless `nodejs` form of `Node.js`.
#[test]
fn test_fix_method() {
    let names: Vec<String> = ["JavaScript", "TypeScript", "Node.js"]
        .iter()
        .map(|name| name.to_string())
        .collect();
    let rule = MD044ProperNames::new(names, true);

    let ctx = create_context("I love javascript, typescript, and nodejs!");
    let output = rule.fix(&ctx).unwrap();

    assert_eq!(output, "I love JavaScript, TypeScript, and Node.js!");
}
838
/// `fix` corrects every occurrence of a name on the same line, whether it was
/// written in lowercase or in all caps.
#[test]
fn test_fix_multiple_occurrences() {
    let rule = MD044ProperNames::new(vec![String::from("Python")], true);

    let ctx = create_context("python is great. I use python daily. PYTHON is powerful.");
    let output = rule.fix(&ctx).unwrap();

    assert_eq!(output, "Python is great. I use Python daily. Python is powerful.");
}
849
/// `fix` also rewrites names inside a fenced code block when the rule is built
/// with its second constructor argument set to `true`.
///
/// NOTE(review): despite the `by_default` suffix, the flag is passed explicitly
/// here; it is presumably the code-blocks switch — confirm against
/// `MD044ProperNames::new`.
#[test]
fn test_fix_checks_code_blocks_by_default() {
    let rule = MD044ProperNames::new(vec![String::from("JavaScript")], true);

    let input = concat!(
        "I love javascript.\n",
        "\n",
        "```\n",
        "const lang = \"javascript\";\n",
        "```\n",
        "\n",
        "More javascript here.",
    );
    let expected = concat!(
        "I love JavaScript.\n",
        "\n",
        "```\n",
        "const lang = \"JavaScript\";\n",
        "```\n",
        "\n",
        "More JavaScript here.",
    );

    let ctx = create_context(input);
    let output = rule.fix(&ctx).unwrap();

    assert_eq!(output, expected);
}
875
/// Warnings carry the line number of the line they were found on, and a line
/// with two miscased names yields two warnings.
#[test]
fn test_multiline_content() {
    let names = vec![String::from("Rust"), String::from("Python")];
    let rule = MD044ProperNames::new(names, true);

    let ctx = create_context(concat!(
        "First line with rust.\n",
        "Second line with python.\n",
        "Third line with RUST and PYTHON.",
    ));
    let warnings = rule.check(&ctx).unwrap();

    assert_eq!(warnings.len(), 4, "Should flag all incorrect occurrences");

    // One warning each on lines 1 and 2, two on line 3.
    let lines: Vec<_> = warnings.iter().map(|w| w.line).collect();
    assert_eq!(lines, [1, 2, 3, 3]);
}
893
/// The default configuration has no names and leaves code-block checking off.
#[test]
fn test_default_config() {
    let MD044Config {
        names, code_blocks, ..
    } = MD044Config::default();

    assert!(names.is_empty());
    // Default is false, i.e. code blocks are skipped.
    assert!(!code_blocks);
}
900
/// A rule configured with 50 generated names still reports each miscased
/// occurrence that appears in the text.
#[test]
fn test_performance_with_many_names() {
    // ProperName0 ..= ProperName49
    let names: Vec<String> = (0..50).map(|i| format!("ProperName{i}")).collect();
    let rule = MD044ProperNames::new(names, true);

    let ctx = create_context("This has propername0, propername25, and propername49 incorrectly.");
    let warnings = rule.check(&ctx).unwrap();

    assert_eq!(warnings.len(), 3, "Should handle many configured names efficiently");
}
916
/// A rule configured with 1000 names builds its combined pattern and checks
/// content without error, reporting both the first and the last generated name.
#[test]
fn test_large_name_count_performance() {
    // Guards against regex size limits: the test exists to confirm that
    // fancy-regex copes with a very large combined pattern.
    let names: Vec<String> = (0..1000).map(|index| format!("ProperName{index}")).collect();
    let rule = MD044ProperNames::new(names, true);

    // Building the combined pattern must have succeeded.
    assert!(rule.combined_pattern.is_some());

    // Checking content must not return an error.
    let ctx = create_context("This has propername0 and propername999 in it.");
    let warnings = rule.check(&ctx).unwrap();

    // Both miscased names are expected to be found.
    assert_eq!(warnings.len(), 2, "Should handle 1000 names without issues");
}
936
/// Running `check` twice on the same context yields the same single warning at
/// the same position.
#[test]
fn test_cache_behavior() {
    let rule = MD044ProperNames::new(vec![String::from("JavaScript")], true);
    let ctx = create_context("Using javascript here.");

    // Initial run.
    let first_run = rule.check(&ctx).unwrap();
    assert_eq!(first_run.len(), 1);

    // Repeat run on the identical context; presumably served from the rule's
    // cache, though that is not observable from this test.
    let second_run = rule.check(&ctx).unwrap();
    assert_eq!(second_run.len(), 1);

    // Both runs must point at the same location.
    let (first, second) = (&first_run[0], &second_run[0]);
    assert_eq!(first.line, second.line);
    assert_eq!(first.column, second.column);
}
956
/// With `html_comments: false`, a miscased name inside a single-line HTML
/// comment is skipped while the names on the surrounding lines are reported.
#[test]
fn test_html_comments_not_checked_when_disabled() {
    // Code blocks and HTML elements are checked; only HTML comments are excluded.
    let rule = MD044ProperNames::from_config_struct(MD044Config {
        names: vec![String::from("JavaScript")],
        code_blocks: true,
        html_elements: true,
        html_comments: false,
    });

    let ctx = create_context(concat!(
        "Regular javascript here.\n",
        "<!-- This javascript in HTML comment should be ignored -->\n",
        "More javascript outside.",
    ));
    let warnings = rule.check(&ctx).unwrap();

    assert_eq!(warnings.len(), 2, "Should only flag javascript outside HTML comments");

    // Line 2 holds the comment and must be absent.
    let lines: Vec<_> = warnings.iter().map(|w| w.line).collect();
    assert_eq!(lines, [1, 3]);
}
978
/// With `html_comments: true`, a miscased name inside an HTML comment is
/// reported along with the ones outside it.
#[test]
fn test_html_comments_checked_when_enabled() {
    // Every optional region is checked: code blocks, HTML elements and HTML comments.
    let rule = MD044ProperNames::from_config_struct(MD044Config {
        names: vec![String::from("JavaScript")],
        code_blocks: true,
        html_elements: true,
        html_comments: true,
    });

    let ctx = create_context(concat!(
        "Regular javascript here.\n",
        "<!-- This javascript in HTML comment should be checked -->\n",
        "More javascript outside.",
    ));
    let warnings = rule.check(&ctx).unwrap();

    assert_eq!(
        warnings.len(),
        3,
        "Should flag all javascript occurrences including in HTML comments"
    );
}
1002
/// With `html_comments: false`, names inside an HTML comment that spans
/// several lines are skipped; only the lines before and after it are reported.
#[test]
fn test_multiline_html_comments() {
    // Code blocks and HTML elements are checked; only HTML comments are excluded.
    let rule = MD044ProperNames::from_config_struct(MD044Config {
        names: vec![String::from("Python"), String::from("JavaScript")],
        code_blocks: true,
        html_elements: true,
        html_comments: false,
    });

    let ctx = create_context(concat!(
        "Regular python here.\n",
        "<!--\n",
        "This is a multiline comment\n",
        "with javascript and python\n",
        "that should be ignored\n",
        "-->\n",
        "More javascript outside.",
    ));
    let warnings = rule.check(&ctx).unwrap();

    assert_eq!(warnings.len(), 2, "Should only flag names outside HTML comments");

    let (before_comment, after_comment) = (&warnings[0], &warnings[1]);
    assert_eq!(before_comment.line, 1); // python
    assert_eq!(after_comment.line, 7); // javascript
}
1028
/// With `html_comments: false`, `fix` leaves the text of an HTML comment
/// untouched while correcting the names on the lines around it.
#[test]
fn test_fix_preserves_html_comments_when_disabled() {
    // Code blocks and HTML elements are checked; only HTML comments are excluded.
    let rule = MD044ProperNames::from_config_struct(MD044Config {
        names: vec![String::from("JavaScript")],
        code_blocks: true,
        html_elements: true,
        html_comments: false,
    });

    let input = concat!(
        "javascript here.\n",
        "<!-- javascript in comment -->\n",
        "More javascript.",
    );
    // The comment line is expected to come back unchanged.
    let expected = concat!(
        "JavaScript here.\n",
        "<!-- javascript in comment -->\n",
        "More JavaScript.",
    );

    let ctx = create_context(input);
    let output = rule.fix(&ctx).unwrap();

    assert_eq!(
        output, expected,
        "Should not fix names inside HTML comments when disabled"
    );
}
1055
/// Miscased names that appear inside inline links, reference-style links and
/// reference definitions are not reported; only the two standalone uses in
/// running text are.
#[test]
fn test_proper_names_in_links_not_flagged() {
    let rule = MD044ProperNames::new(
        vec!["JavaScript".to_string(), "Node.js".to_string(), "Python".to_string()],
        true,
    );

    let content = r#"Check this [javascript documentation](https://javascript.info) for info.

Visit [node.js homepage](https://nodejs.org) and [python tutorial](https://python.org).

Real javascript should be flagged.

Also see the [typescript guide][ts-ref] for more.

Real python should be flagged too.

[ts-ref]: https://typescript.org/handbook"#;

    let ctx = create_context(content);
    let result = rule.check(&ctx).unwrap();

    // Only the real standalone proper names should be flagged
    assert_eq!(
        result.len(),
        2,
        "Expected exactly 2 warnings for standalone proper names"
    );
    assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
    assert!(result[1].message.contains("'python' should be 'Python'"));
    // Should be on lines with standalone instances. `assert_eq!` is used so a
    // failure prints the actual line number instead of just `false`.
    assert_eq!(result[0].line, 5); // "Real javascript should be flagged."
    assert_eq!(result[1].line, 9); // "Real python should be flagged too."
}
1090
/// Miscased names inside an image's alt text, path and title are not
/// reported; only the standalone use in running text is.
#[test]
fn test_proper_names_in_images_not_flagged() {
    let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);

    let content = r#"Here is a ![javascript logo](javascript.png "javascript icon") image.

Real javascript should be flagged."#;

    let ctx = create_context(content);
    let result = rule.check(&ctx).unwrap();

    // Only the standalone proper name should be flagged
    assert_eq!(result.len(), 1, "Expected exactly 1 warning for standalone proper name");
    assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
    // `assert_eq!` is used so a failure prints the actual line number instead
    // of just `false`.
    assert_eq!(result[0].line, 3); // "Real javascript should be flagged."
}
1107
/// Miscased names inside a reference-style link and inside the URL of its
/// reference definition are not reported; only the standalone use in running
/// text is.
#[test]
fn test_proper_names_in_reference_definitions_not_flagged() {
    let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "TypeScript".to_string()], true);

    let content = r#"Check the [javascript guide][js-ref] for details.

Real javascript should be flagged.

[js-ref]: https://javascript.info/typescript/guide"#;

    let ctx = create_context(content);
    let result = rule.check(&ctx).unwrap();

    // Only the standalone proper name should be flagged
    assert_eq!(result.len(), 1, "Expected exactly 1 warning for standalone proper name");
    assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
    // `assert_eq!` is used so a failure prints the actual line number instead
    // of just `false`.
    assert_eq!(result[0].line, 3); // "Real javascript should be flagged."
}
1126}
rumdl_lib/rules/md044_proper_names.rs

rumdl_lib/rules/
md044_proper_names.rs