rumdl_lib/rules/
md044_proper_names.rs

1use crate::utils::fast_hash;
2use crate::utils::regex_cache::{escape_regex, get_cached_fancy_regex};
3
4use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, Severity};
5use fancy_regex::Regex;
6use std::collections::HashMap;
7use std::sync::LazyLock;
8use std::sync::{Arc, Mutex};
9
10mod md044_config;
11use md044_config::MD044Config;
12
/// Matches one HTML comment (`<!-- ... -->`). The body is matched lazily with
/// `[\s\S]*?`, so a comment may span several lines and stops at the first `-->`.
/// Compiled with `fancy_regex` (the `Regex` imported at the top of this file).
static HTML_COMMENT_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<!--([\s\S]*?)-->").unwrap());
/// Matches a reference definition on a single line: `[ref]: url "title"`.
/// Up to three leading spaces are allowed, and the title is optional and may be
/// wrapped in double or single quotes. Compiled with the plain `regex` crate.
static REF_DEF_REGEX: LazyLock<regex::Regex> = LazyLock::new(|| {
    regex::Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap()
});

/// One violation as `(line, column, found_name)`. In this file `line` is the
/// 1-based line number and `column` is the match's byte offset within the line
/// plus one; `found_name` is the text exactly as it appears in the document.
type WarningPosition = (usize, usize, String);
20
/// Rule MD044: Proper names should be capitalized
///
/// See [docs/md044.md](../../docs/md044.md) for full documentation, configuration, and examples.
///
/// This rule is triggered when proper names are not capitalized correctly in the document.
/// For example, if you have defined "JavaScript" as a proper name, the rule will flag any
/// occurrences of "javascript" or "Javascript" as violations.
///
/// ## Purpose
///
/// Ensuring consistent capitalization of proper names improves document quality and
/// professionalism. This is especially important for technical documentation where
/// product names, programming languages, and technologies often have specific
/// capitalization conventions.
///
/// ## Configuration Options
///
/// The rule supports the following configuration options:
///
/// ```yaml
/// MD044:
///   names: []                # List of proper names to check for correct capitalization
///   code-blocks: false       # Whether to check code blocks (default: false)
/// ```
///
/// Example configuration:
///
/// ```yaml
/// MD044:
///   names: ["JavaScript", "Node.js", "TypeScript"]
///   code-blocks: true
/// ```
///
/// The configuration struct also carries `html_elements` and `html_comments` flags
/// (both set to `true` by [`MD044ProperNames::new`]). Their serialized key names are
/// defined in the `md044_config` module and are not visible from this file.
///
/// ## Performance Optimizations
///
/// This rule implements several performance optimizations:
///
/// 1. **Regex Caching**: All name variants are combined into a single pattern string when
///    the rule is constructed; the compiled regex is obtained through the shared regex cache
/// 2. **Content Caching**: Caches results based on content hashing for repeated checks
/// 3. **Efficient Text Processing**: Cheap substring pre-checks skip documents and lines that
///    cannot contain any configured name before the regex is run
/// 4. **Smart Code Block Detection**: Efficiently identifies and optionally excludes code blocks
///
/// ## Edge Cases Handled
///
/// - **Word Boundaries**: Only matches complete words, not substrings within other words
/// - **Case Sensitivity**: Properly handles case-specific matching
/// - **Code Blocks**: Optionally checks code blocks (controlled by code-blocks setting)
/// - **Markdown Formatting**: Handles proper names within Markdown formatting elements
/// - **Spelling Variants**: Dotless forms (`nodejs` for `Node.js`) and forms with common
///   accented letters replaced by ASCII letters are treated as occurrences of the name
///
/// ## Fix Behavior
///
/// When fixing issues, this rule replaces incorrect capitalization with the correct form
/// as defined in the configuration.
///
#[derive(Clone)]
pub struct MD044ProperNames {
    // Rule settings: the proper names plus the code-block / HTML flags
    config: MD044Config,
    // Combined case-insensitive regex pattern string covering every name variant;
    // `None` when no names are configured
    combined_pattern: Option<String>,
    // Cache of violations keyed by a hash of the document content. Held in an `Arc`,
    // so clones of this rule share the same cache.
    content_cache: Arc<Mutex<HashMap<u64, Vec<WarningPosition>>>>,
}
83
84impl MD044ProperNames {
85    pub fn new(names: Vec<String>, code_blocks: bool) -> Self {
86        let config = MD044Config {
87            names,
88            code_blocks,
89            html_elements: true, // Default to checking HTML elements
90            html_comments: true, // Default to checking HTML comments
91        };
92        let combined_pattern = Self::create_combined_pattern(&config);
93        Self {
94            config,
95            combined_pattern,
96            content_cache: Arc::new(Mutex::new(HashMap::new())),
97        }
98    }
99
100    // Helper function for consistent ASCII normalization
101    fn ascii_normalize(s: &str) -> String {
102        s.replace(['é', 'è', 'ê', 'ë'], "e")
103            .replace(['à', 'á', 'â', 'ä', 'ã', 'å'], "a")
104            .replace(['ï', 'î', 'í', 'ì'], "i")
105            .replace(['ü', 'ú', 'ù', 'û'], "u")
106            .replace(['ö', 'ó', 'ò', 'ô', 'õ'], "o")
107            .replace('ñ', "n")
108            .replace('ç', "c")
109    }
110
111    pub fn from_config_struct(config: MD044Config) -> Self {
112        let combined_pattern = Self::create_combined_pattern(&config);
113        Self {
114            config,
115            combined_pattern,
116            content_cache: Arc::new(Mutex::new(HashMap::new())),
117        }
118    }
119
120    // Create a combined regex pattern for all proper names
121    fn create_combined_pattern(config: &MD044Config) -> Option<String> {
122        if config.names.is_empty() {
123            return None;
124        }
125
126        // Create patterns for all names and their variations
127        let mut patterns: Vec<String> = config
128            .names
129            .iter()
130            .flat_map(|name| {
131                let mut variations = vec![];
132                let lower_name = name.to_lowercase();
133
134                // Add the lowercase version
135                variations.push(escape_regex(&lower_name));
136
137                // Add version without dots
138                let lower_name_no_dots = lower_name.replace('.', "");
139                if lower_name != lower_name_no_dots {
140                    variations.push(escape_regex(&lower_name_no_dots));
141                }
142
143                // Add ASCII-normalized versions for common accented characters
144                let ascii_normalized = Self::ascii_normalize(&lower_name);
145
146                if ascii_normalized != lower_name {
147                    variations.push(escape_regex(&ascii_normalized));
148
149                    // Also add version without dots
150                    let ascii_no_dots = ascii_normalized.replace('.', "");
151                    if ascii_normalized != ascii_no_dots {
152                        variations.push(escape_regex(&ascii_no_dots));
153                    }
154                }
155
156                variations
157            })
158            .collect();
159
160        // Sort patterns by length (longest first) to avoid shorter patterns matching within longer ones
161        patterns.sort_by_key(|b| std::cmp::Reverse(b.len()));
162
163        // Combine all patterns into a single regex with capture groups
164        // Don't use \b as it doesn't work with Unicode - we'll check boundaries manually
165        Some(format!(r"(?i)({})", patterns.join("|")))
166    }
167
168    // Find all name violations in the content and return positions
169    fn find_name_violations(&self, content: &str, ctx: &crate::lint_context::LintContext) -> Vec<WarningPosition> {
170        // Early return: if no names configured or content is empty
171        if self.config.names.is_empty() || content.is_empty() || self.combined_pattern.is_none() {
172            return Vec::new();
173        }
174
175        // Early return: quick check if any of the configured names might be in content
176        let content_lower = content.to_lowercase();
177        let has_potential_matches = self.config.names.iter().any(|name| {
178            let name_lower = name.to_lowercase();
179            let name_no_dots = name_lower.replace('.', "");
180
181            // Check direct match
182            if content_lower.contains(&name_lower) || content_lower.contains(&name_no_dots) {
183                return true;
184            }
185
186            // Also check ASCII-normalized version
187            let ascii_normalized = Self::ascii_normalize(&name_lower);
188
189            if ascii_normalized != name_lower {
190                if content_lower.contains(&ascii_normalized) {
191                    return true;
192                }
193                let ascii_no_dots = ascii_normalized.replace('.', "");
194                if ascii_normalized != ascii_no_dots && content_lower.contains(&ascii_no_dots) {
195                    return true;
196                }
197            }
198
199            false
200        });
201
202        if !has_potential_matches {
203            return Vec::new();
204        }
205
206        // Check if we have cached results
207        let hash = fast_hash(content);
208        {
209            // Use a separate scope for borrowing to minimize lock time
210            if let Ok(cache) = self.content_cache.lock()
211                && let Some(cached) = cache.get(&hash)
212            {
213                return cached.clone();
214            }
215        }
216
217        let mut violations = Vec::new();
218
219        // Get the regex from global cache
220        let combined_regex = match &self.combined_pattern {
221            Some(pattern) => match get_cached_fancy_regex(pattern) {
222                Ok(regex) => regex,
223                Err(_) => return Vec::new(),
224            },
225            None => return Vec::new(),
226        };
227
228        // Use ctx.lines for better performance
229        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
230            let line_num = line_idx + 1;
231            let line = line_info.content(ctx.content);
232
233            // Skip code fence lines (```language or ~~~language)
234            let trimmed = line.trim_start();
235            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
236                continue;
237            }
238
239            // Skip if in code block (when code_blocks = false)
240            if !self.config.code_blocks && line_info.in_code_block {
241                continue;
242            }
243
244            // Skip if in HTML block (when html_elements = false)
245            if !self.config.html_elements && line_info.in_html_block {
246                continue;
247            }
248
249            // Check if we should skip HTML comments
250            let in_html_comment = if !self.config.html_comments {
251                // Check if this position is within an HTML comment
252                self.is_in_html_comment(content, line_info.byte_offset)
253            } else {
254                false
255            };
256
257            if in_html_comment {
258                continue;
259            }
260
261            // Skip JSX expressions and MDX comments (MDX flavor)
262            if line_info.in_jsx_expression || line_info.in_mdx_comment {
263                continue;
264            }
265
266            // Early return: skip lines that don't contain any potential matches
267            let line_lower = line.to_lowercase();
268            let has_line_matches = self.config.names.iter().any(|name| {
269                let name_lower = name.to_lowercase();
270                let name_no_dots = name_lower.replace('.', "");
271
272                // Check direct match
273                if line_lower.contains(&name_lower) || line_lower.contains(&name_no_dots) {
274                    return true;
275                }
276
277                // Also check ASCII-normalized version
278                let ascii_normalized = Self::ascii_normalize(&name_lower);
279                if ascii_normalized != name_lower {
280                    if line_lower.contains(&ascii_normalized) {
281                        return true;
282                    }
283                    let ascii_no_dots = ascii_normalized.replace('.', "");
284                    if ascii_normalized != ascii_no_dots && line_lower.contains(&ascii_no_dots) {
285                        return true;
286                    }
287                }
288
289                false
290            });
291
292            if !has_line_matches {
293                continue;
294            }
295
296            // Use the combined regex to find all matches in one pass
297            for cap_result in combined_regex.find_iter(line) {
298                match cap_result {
299                    Ok(cap) => {
300                        let found_name = &line[cap.start()..cap.end()];
301
302                        // Check word boundaries manually for Unicode support
303                        let start_pos = cap.start();
304                        let end_pos = cap.end();
305
306                        if !self.is_at_word_boundary(line, start_pos, true)
307                            || !self.is_at_word_boundary(line, end_pos, false)
308                        {
309                            continue; // Not at word boundary
310                        }
311
312                        // Skip if in inline code when code_blocks is false
313                        if !self.config.code_blocks {
314                            let byte_pos = line_info.byte_offset + cap.start();
315                            if ctx.is_in_code_block_or_span(byte_pos) {
316                                continue;
317                            }
318                        }
319
320                        // Skip if in link (inline links, reference links, or reference definitions)
321                        let byte_pos = line_info.byte_offset + cap.start();
322                        if self.is_in_link(ctx, byte_pos) {
323                            continue;
324                        }
325
326                        // Find which proper name this matches
327                        if let Some(proper_name) = self.get_proper_name_for(found_name) {
328                            // Only flag if it's not already correct
329                            if found_name != proper_name {
330                                violations.push((line_num, cap.start() + 1, found_name.to_string()));
331                            }
332                        }
333                    }
334                    Err(e) => {
335                        eprintln!("Regex execution error on line {line_num}: {e}");
336                    }
337                }
338            }
339        }
340
341        // Store in cache (ignore if mutex is poisoned)
342        if let Ok(mut cache) = self.content_cache.lock() {
343            cache.insert(hash, violations.clone());
344        }
345        violations
346    }
347
348    // Check if a byte position is within an HTML comment
349    fn is_in_html_comment(&self, content: &str, byte_pos: usize) -> bool {
350        for m in HTML_COMMENT_REGEX.find_iter(content).flatten() {
351            if m.start() <= byte_pos && byte_pos < m.end() {
352                return true;
353            }
354        }
355        false
356    }
357
358    /// Check if a byte position is within a link (inline links, reference links, or reference definitions)
359    fn is_in_link(&self, ctx: &crate::lint_context::LintContext, byte_pos: usize) -> bool {
360        // Check inline and reference links
361        for link in &ctx.links {
362            if link.byte_offset <= byte_pos && byte_pos < link.byte_end {
363                return true;
364            }
365        }
366
367        // Check images (which use similar syntax)
368        for image in &ctx.images {
369            if image.byte_offset <= byte_pos && byte_pos < image.byte_end {
370                return true;
371            }
372        }
373
374        // Check reference definitions [ref]: url "title" using regex pattern
375        for m in REF_DEF_REGEX.find_iter(ctx.content) {
376            if m.start() <= byte_pos && byte_pos < m.end() {
377                return true;
378            }
379        }
380
381        false
382    }
383
384    // Check if a character is a word boundary (handles Unicode)
385    fn is_word_boundary_char(c: char) -> bool {
386        !c.is_alphanumeric()
387    }
388
389    // Check if position is at a word boundary
390    fn is_at_word_boundary(&self, content: &str, pos: usize, is_start: bool) -> bool {
391        let chars: Vec<char> = content.chars().collect();
392        let char_indices: Vec<(usize, char)> = content.char_indices().collect();
393
394        // Find the character position
395        let char_pos = char_indices.iter().position(|(idx, _)| *idx == pos);
396        if char_pos.is_none() {
397            return true; // If we can't find position, assume boundary
398        }
399        let char_pos = char_pos.unwrap();
400
401        if is_start {
402            // Check character before position
403            if char_pos == 0 {
404                return true; // Start of string
405            }
406            Self::is_word_boundary_char(chars[char_pos - 1])
407        } else {
408            // Check character after position
409            if char_pos >= chars.len() {
410                return true; // End of string
411            }
412            Self::is_word_boundary_char(chars[char_pos])
413        }
414    }
415
416    // Get the proper name that should be used for a found name
417    fn get_proper_name_for(&self, found_name: &str) -> Option<String> {
418        let found_lower = found_name.to_lowercase();
419
420        // Iterate through the configured proper names
421        for name in &self.config.names {
422            let lower_name = name.to_lowercase();
423            let lower_name_no_dots = lower_name.replace('.', "");
424
425            // Direct match
426            if found_lower == lower_name || found_lower == lower_name_no_dots {
427                return Some(name.clone());
428            }
429
430            // Check ASCII-normalized version
431            let ascii_normalized = Self::ascii_normalize(&lower_name);
432
433            let ascii_no_dots = ascii_normalized.replace('.', "");
434
435            if found_lower == ascii_normalized || found_lower == ascii_no_dots {
436                return Some(name.clone());
437            }
438        }
439        None
440    }
441}
442
443impl Rule for MD044ProperNames {
    /// Identifier of this rule; also used as the `rule_name` of emitted warnings
    /// and as the key of the rule's default config section.
    fn name(&self) -> &'static str {
        "MD044"
    }
447
    /// One-line, human-readable summary of what the rule enforces.
    fn description(&self) -> &'static str {
        "Proper names should have the correct capitalization"
    }
451
452    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
453        if self.config.names.is_empty() {
454            return true;
455        }
456        // Quick check if any configured names exist (case-insensitive)
457        let content_lower = ctx.content.to_lowercase();
458        !self
459            .config
460            .names
461            .iter()
462            .any(|name| content_lower.contains(&name.to_lowercase()))
463    }
464
465    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
466        let content = ctx.content;
467        if content.is_empty() || self.config.names.is_empty() || self.combined_pattern.is_none() {
468            return Ok(Vec::new());
469        }
470
471        // Early return: quick check if any of the configured names might be in content
472        let content_lower = content.to_lowercase();
473        let has_potential_matches = self.config.names.iter().any(|name| {
474            let name_lower = name.to_lowercase();
475            let name_no_dots = name_lower.replace('.', "");
476
477            // Check direct match
478            if content_lower.contains(&name_lower) || content_lower.contains(&name_no_dots) {
479                return true;
480            }
481
482            // Also check ASCII-normalized version
483            let ascii_normalized = Self::ascii_normalize(&name_lower);
484
485            if ascii_normalized != name_lower {
486                if content_lower.contains(&ascii_normalized) {
487                    return true;
488                }
489                let ascii_no_dots = ascii_normalized.replace('.', "");
490                if ascii_normalized != ascii_no_dots && content_lower.contains(&ascii_no_dots) {
491                    return true;
492                }
493            }
494
495            false
496        });
497
498        if !has_potential_matches {
499            return Ok(Vec::new());
500        }
501
502        let line_index = &ctx.line_index;
503        let violations = self.find_name_violations(content, ctx);
504
505        let warnings = violations
506            .into_iter()
507            .filter_map(|(line, column, found_name)| {
508                self.get_proper_name_for(&found_name).map(|proper_name| LintWarning {
509                    rule_name: Some(self.name().to_string()),
510                    line,
511                    column,
512                    end_line: line,
513                    end_column: column + found_name.len(),
514                    message: format!("Proper name '{found_name}' should be '{proper_name}'"),
515                    severity: Severity::Warning,
516                    fix: Some(Fix {
517                        range: line_index.line_col_to_byte_range(line, column),
518                        replacement: proper_name,
519                    }),
520                })
521            })
522            .collect();
523
524        Ok(warnings)
525    }
526
527    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
528        let content = ctx.content;
529        if content.is_empty() || self.config.names.is_empty() {
530            return Ok(content.to_string());
531        }
532
533        let violations = self.find_name_violations(content, ctx);
534        if violations.is_empty() {
535            return Ok(content.to_string());
536        }
537
538        // Process lines and build the fixed content
539        let mut fixed_lines = Vec::new();
540
541        // Group violations by line
542        let mut violations_by_line: HashMap<usize, Vec<(usize, String)>> = HashMap::new();
543        for (line_num, col_num, found_name) in violations {
544            violations_by_line
545                .entry(line_num)
546                .or_default()
547                .push((col_num, found_name));
548        }
549
550        // Sort violations within each line in reverse order
551        for violations in violations_by_line.values_mut() {
552            violations.sort_by_key(|b| std::cmp::Reverse(b.0));
553        }
554
555        // Process each line
556        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
557            let line_num = line_idx + 1;
558
559            if let Some(line_violations) = violations_by_line.get(&line_num) {
560                // This line has violations, fix them
561                let mut fixed_line = line_info.content(ctx.content).to_string();
562
563                for (col_num, found_name) in line_violations {
564                    if let Some(proper_name) = self.get_proper_name_for(found_name) {
565                        let start_col = col_num - 1; // Convert to 0-based
566                        let end_col = start_col + found_name.len();
567
568                        if end_col <= fixed_line.len()
569                            && fixed_line.is_char_boundary(start_col)
570                            && fixed_line.is_char_boundary(end_col)
571                        {
572                            fixed_line.replace_range(start_col..end_col, &proper_name);
573                        }
574                    }
575                }
576
577                fixed_lines.push(fixed_line);
578            } else {
579                // No violations on this line, keep it as is
580                fixed_lines.push(line_info.content(ctx.content).to_string());
581            }
582        }
583
584        // Join lines with newlines, preserving the original ending
585        let mut result = fixed_lines.join("\n");
586        if content.ends_with('\n') && !result.ends_with('\n') {
587            result.push('\n');
588        }
589        Ok(result)
590    }
591
    /// Exposes the rule as `&dyn Any` so callers can downcast to the concrete type.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
595
596    fn default_config_section(&self) -> Option<(String, toml::Value)> {
597        let json_value = serde_json::to_value(&self.config).ok()?;
598        Some((
599            self.name().to_string(),
600            crate::rule_config_serde::json_to_toml_value(&json_value)?,
601        ))
602    }
603
604    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
605    where
606        Self: Sized,
607    {
608        let rule_config = crate::rule_config_serde::load_rule_config::<MD044Config>(config);
609        Box::new(Self::from_config_struct(rule_config))
610    }
611}
612
613#[cfg(test)]
614mod tests {
615    use super::*;
616    use crate::lint_context::LintContext;
617
618    fn create_context(content: &str) -> LintContext<'_> {
619        LintContext::new(content, crate::config::MarkdownFlavor::Standard, None)
620    }
621
622    #[test]
623    fn test_correctly_capitalized_names() {
624        let rule = MD044ProperNames::new(
625            vec![
626                "JavaScript".to_string(),
627                "TypeScript".to_string(),
628                "Node.js".to_string(),
629            ],
630            true,
631        );
632
633        let content = "This document uses JavaScript, TypeScript, and Node.js correctly.";
634        let ctx = create_context(content);
635        let result = rule.check(&ctx).unwrap();
636        assert!(result.is_empty(), "Should not flag correctly capitalized names");
637    }
638
639    #[test]
640    fn test_incorrectly_capitalized_names() {
641        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "TypeScript".to_string()], true);
642
643        let content = "This document uses javascript and typescript incorrectly.";
644        let ctx = create_context(content);
645        let result = rule.check(&ctx).unwrap();
646
647        assert_eq!(result.len(), 2, "Should flag two incorrect capitalizations");
648        assert_eq!(result[0].message, "Proper name 'javascript' should be 'JavaScript'");
649        assert_eq!(result[0].line, 1);
650        assert_eq!(result[0].column, 20);
651        assert_eq!(result[1].message, "Proper name 'typescript' should be 'TypeScript'");
652        assert_eq!(result[1].line, 1);
653        assert_eq!(result[1].column, 35);
654    }
655
656    #[test]
657    fn test_names_at_beginning_of_sentences() {
658        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "Python".to_string()], true);
659
660        let content = "javascript is a great language. python is also popular.";
661        let ctx = create_context(content);
662        let result = rule.check(&ctx).unwrap();
663
664        assert_eq!(result.len(), 2, "Should flag names at beginning of sentences");
665        assert_eq!(result[0].line, 1);
666        assert_eq!(result[0].column, 1);
667        assert_eq!(result[1].line, 1);
668        assert_eq!(result[1].column, 33);
669    }
670
671    #[test]
672    fn test_names_in_code_blocks_checked_by_default() {
673        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
674
675        let content = r#"Here is some text with JavaScript.
676
677```javascript
678// This javascript should be checked
679const lang = "javascript";
680```
681
682But this javascript should be flagged."#;
683
684        let ctx = create_context(content);
685        let result = rule.check(&ctx).unwrap();
686
687        assert_eq!(result.len(), 3, "Should flag javascript inside and outside code blocks");
688        assert_eq!(result[0].line, 4);
689        assert_eq!(result[1].line, 5);
690        assert_eq!(result[2].line, 8);
691    }
692
693    #[test]
694    fn test_names_in_code_blocks_ignored_when_disabled() {
695        let rule = MD044ProperNames::new(
696            vec!["JavaScript".to_string()],
697            false, // code_blocks = false means skip code blocks
698        );
699
700        let content = r#"```
701javascript in code block
702```"#;
703
704        let ctx = create_context(content);
705        let result = rule.check(&ctx).unwrap();
706
707        assert_eq!(
708            result.len(),
709            0,
710            "Should not flag javascript in code blocks when code_blocks is false"
711        );
712    }
713
714    #[test]
715    fn test_names_in_inline_code_checked_by_default() {
716        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
717
718        let content = "This is `javascript` in inline code and javascript outside.";
719        let ctx = create_context(content);
720        let result = rule.check(&ctx).unwrap();
721
722        // When code_blocks=true, inline code should be checked
723        assert_eq!(result.len(), 2, "Should flag javascript inside and outside inline code");
724        assert_eq!(result[0].column, 10); // javascript in inline code
725        assert_eq!(result[1].column, 41); // javascript outside
726    }
727
728    #[test]
729    fn test_multiple_names_in_same_line() {
730        let rule = MD044ProperNames::new(
731            vec!["JavaScript".to_string(), "TypeScript".to_string(), "React".to_string()],
732            true,
733        );
734
735        let content = "I use javascript, typescript, and react in my projects.";
736        let ctx = create_context(content);
737        let result = rule.check(&ctx).unwrap();
738
739        assert_eq!(result.len(), 3, "Should flag all three incorrect names");
740        assert_eq!(result[0].message, "Proper name 'javascript' should be 'JavaScript'");
741        assert_eq!(result[1].message, "Proper name 'typescript' should be 'TypeScript'");
742        assert_eq!(result[2].message, "Proper name 'react' should be 'React'");
743    }
744
745    #[test]
746    fn test_case_sensitivity() {
747        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
748
749        let content = "JAVASCRIPT, Javascript, javascript, and JavaScript variations.";
750        let ctx = create_context(content);
751        let result = rule.check(&ctx).unwrap();
752
753        assert_eq!(result.len(), 3, "Should flag all incorrect case variations");
754        // JavaScript (correct) should not be flagged
755        assert!(result.iter().all(|w| w.message.contains("should be 'JavaScript'")));
756    }
757
758    #[test]
759    fn test_configuration_with_custom_name_list() {
760        let config = MD044Config {
761            names: vec!["GitHub".to_string(), "GitLab".to_string(), "DevOps".to_string()],
762            code_blocks: true,
763            html_elements: true,
764            html_comments: true,
765        };
766        let rule = MD044ProperNames::from_config_struct(config);
767
768        let content = "We use github, gitlab, and devops for our workflow.";
769        let ctx = create_context(content);
770        let result = rule.check(&ctx).unwrap();
771
772        assert_eq!(result.len(), 3, "Should flag all custom names");
773        assert_eq!(result[0].message, "Proper name 'github' should be 'GitHub'");
774        assert_eq!(result[1].message, "Proper name 'gitlab' should be 'GitLab'");
775        assert_eq!(result[2].message, "Proper name 'devops' should be 'DevOps'");
776    }
777
778    #[test]
779    fn test_empty_configuration() {
780        let rule = MD044ProperNames::new(vec![], true);
781
782        let content = "This has javascript and typescript but no configured names.";
783        let ctx = create_context(content);
784        let result = rule.check(&ctx).unwrap();
785
786        assert!(result.is_empty(), "Should not flag anything with empty configuration");
787    }
788
789    #[test]
790    fn test_names_with_special_characters() {
791        let rule = MD044ProperNames::new(
792            vec!["Node.js".to_string(), "ASP.NET".to_string(), "C++".to_string()],
793            true,
794        );
795
796        let content = "We use nodejs, asp.net, ASP.NET, and c++ in our stack.";
797        let ctx = create_context(content);
798        let result = rule.check(&ctx).unwrap();
799
800        // nodejs should match Node.js (dotless variation)
801        // asp.net should be flagged (wrong case)
802        // ASP.NET should not be flagged (correct)
803        // c++ should be flagged
804        assert_eq!(result.len(), 3, "Should handle special characters correctly");
805
806        let messages: Vec<&str> = result.iter().map(|w| w.message.as_str()).collect();
807        assert!(messages.contains(&"Proper name 'nodejs' should be 'Node.js'"));
808        assert!(messages.contains(&"Proper name 'asp.net' should be 'ASP.NET'"));
809        assert!(messages.contains(&"Proper name 'c++' should be 'C++'"));
810    }
811
812    #[test]
813    fn test_word_boundaries() {
814        let rule = MD044ProperNames::new(vec!["Java".to_string(), "Script".to_string()], true);
815
816        let content = "JavaScript is not java or script, but Java and Script are separate.";
817        let ctx = create_context(content);
818        let result = rule.check(&ctx).unwrap();
819
820        // Should only flag lowercase "java" and "script" as separate words
821        assert_eq!(result.len(), 2, "Should respect word boundaries");
822        assert!(result.iter().any(|w| w.column == 19)); // "java" position
823        assert!(result.iter().any(|w| w.column == 27)); // "script" position
824    }
825
826    #[test]
827    fn test_fix_method() {
828        let rule = MD044ProperNames::new(
829            vec![
830                "JavaScript".to_string(),
831                "TypeScript".to_string(),
832                "Node.js".to_string(),
833            ],
834            true,
835        );
836
837        let content = "I love javascript, typescript, and nodejs!";
838        let ctx = create_context(content);
839        let fixed = rule.fix(&ctx).unwrap();
840
841        assert_eq!(fixed, "I love JavaScript, TypeScript, and Node.js!");
842    }
843
844    #[test]
845    fn test_fix_multiple_occurrences() {
846        let rule = MD044ProperNames::new(vec!["Python".to_string()], true);
847
848        let content = "python is great. I use python daily. PYTHON is powerful.";
849        let ctx = create_context(content);
850        let fixed = rule.fix(&ctx).unwrap();
851
852        assert_eq!(fixed, "Python is great. I use Python daily. Python is powerful.");
853    }
854
855    #[test]
856    fn test_fix_checks_code_blocks_by_default() {
857        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
858
859        let content = r#"I love javascript.
860
861```
862const lang = "javascript";
863```
864
865More javascript here."#;
866
867        let ctx = create_context(content);
868        let fixed = rule.fix(&ctx).unwrap();
869
870        let expected = r#"I love JavaScript.
871
872```
873const lang = "JavaScript";
874```
875
876More JavaScript here."#;
877
878        assert_eq!(fixed, expected);
879    }
880
881    #[test]
882    fn test_multiline_content() {
883        let rule = MD044ProperNames::new(vec!["Rust".to_string(), "Python".to_string()], true);
884
885        let content = r#"First line with rust.
886Second line with python.
887Third line with RUST and PYTHON."#;
888
889        let ctx = create_context(content);
890        let result = rule.check(&ctx).unwrap();
891
892        assert_eq!(result.len(), 4, "Should flag all incorrect occurrences");
893        assert_eq!(result[0].line, 1);
894        assert_eq!(result[1].line, 2);
895        assert_eq!(result[2].line, 3);
896        assert_eq!(result[3].line, 3);
897    }
898
899    #[test]
900    fn test_default_config() {
901        let config = MD044Config::default();
902        assert!(config.names.is_empty());
903        assert!(!config.code_blocks); // Default is false (skip code blocks)
904    }
905
906    #[test]
907    fn test_performance_with_many_names() {
908        let mut names = vec![];
909        for i in 0..50 {
910            names.push(format!("ProperName{i}"));
911        }
912
913        let rule = MD044ProperNames::new(names, true);
914
915        let content = "This has propername0, propername25, and propername49 incorrectly.";
916        let ctx = create_context(content);
917        let result = rule.check(&ctx).unwrap();
918
919        assert_eq!(result.len(), 3, "Should handle many configured names efficiently");
920    }
921
922    #[test]
923    fn test_large_name_count_performance() {
924        // Verify MD044 can handle large numbers of names without regex limitations
925        // This test confirms that fancy-regex handles large patterns well
926        let names = (0..1000).map(|i| format!("ProperName{i}")).collect::<Vec<_>>();
927
928        let rule = MD044ProperNames::new(names, true);
929
930        // The combined pattern should be created successfully
931        assert!(rule.combined_pattern.is_some());
932
933        // Should be able to check content without errors
934        let content = "This has propername0 and propername999 in it.";
935        let ctx = create_context(content);
936        let result = rule.check(&ctx).unwrap();
937
938        // Should detect both incorrect names
939        assert_eq!(result.len(), 2, "Should handle 1000 names without issues");
940    }
941
942    #[test]
943    fn test_cache_behavior() {
944        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
945
946        let content = "Using javascript here.";
947        let ctx = create_context(content);
948
949        // First check
950        let result1 = rule.check(&ctx).unwrap();
951        assert_eq!(result1.len(), 1);
952
953        // Second check should use cache
954        let result2 = rule.check(&ctx).unwrap();
955        assert_eq!(result2.len(), 1);
956
957        // Results should be identical
958        assert_eq!(result1[0].line, result2[0].line);
959        assert_eq!(result1[0].column, result2[0].column);
960    }
961
962    #[test]
963    fn test_html_comments_not_checked_when_disabled() {
964        let config = MD044Config {
965            names: vec!["JavaScript".to_string()],
966            code_blocks: true,    // Check code blocks
967            html_elements: true,  // Check HTML elements
968            html_comments: false, // Don't check HTML comments
969        };
970        let rule = MD044ProperNames::from_config_struct(config);
971
972        let content = r#"Regular javascript here.
973<!-- This javascript in HTML comment should be ignored -->
974More javascript outside."#;
975
976        let ctx = create_context(content);
977        let result = rule.check(&ctx).unwrap();
978
979        assert_eq!(result.len(), 2, "Should only flag javascript outside HTML comments");
980        assert_eq!(result[0].line, 1);
981        assert_eq!(result[1].line, 3);
982    }
983
984    #[test]
985    fn test_html_comments_checked_when_enabled() {
986        let config = MD044Config {
987            names: vec!["JavaScript".to_string()],
988            code_blocks: true,   // Check code blocks
989            html_elements: true, // Check HTML elements
990            html_comments: true, // Check HTML comments
991        };
992        let rule = MD044ProperNames::from_config_struct(config);
993
994        let content = r#"Regular javascript here.
995<!-- This javascript in HTML comment should be checked -->
996More javascript outside."#;
997
998        let ctx = create_context(content);
999        let result = rule.check(&ctx).unwrap();
1000
1001        assert_eq!(
1002            result.len(),
1003            3,
1004            "Should flag all javascript occurrences including in HTML comments"
1005        );
1006    }
1007
1008    #[test]
1009    fn test_multiline_html_comments() {
1010        let config = MD044Config {
1011            names: vec!["Python".to_string(), "JavaScript".to_string()],
1012            code_blocks: true,    // Check code blocks
1013            html_elements: true,  // Check HTML elements
1014            html_comments: false, // Don't check HTML comments
1015        };
1016        let rule = MD044ProperNames::from_config_struct(config);
1017
1018        let content = r#"Regular python here.
1019<!--
1020This is a multiline comment
1021with javascript and python
1022that should be ignored
1023-->
1024More javascript outside."#;
1025
1026        let ctx = create_context(content);
1027        let result = rule.check(&ctx).unwrap();
1028
1029        assert_eq!(result.len(), 2, "Should only flag names outside HTML comments");
1030        assert_eq!(result[0].line, 1); // python
1031        assert_eq!(result[1].line, 7); // javascript
1032    }
1033
1034    #[test]
1035    fn test_fix_preserves_html_comments_when_disabled() {
1036        let config = MD044Config {
1037            names: vec!["JavaScript".to_string()],
1038            code_blocks: true,    // Check code blocks
1039            html_elements: true,  // Check HTML elements
1040            html_comments: false, // Don't check HTML comments
1041        };
1042        let rule = MD044ProperNames::from_config_struct(config);
1043
1044        let content = r#"javascript here.
1045<!-- javascript in comment -->
1046More javascript."#;
1047
1048        let ctx = create_context(content);
1049        let fixed = rule.fix(&ctx).unwrap();
1050
1051        let expected = r#"JavaScript here.
1052<!-- javascript in comment -->
1053More JavaScript."#;
1054
1055        assert_eq!(
1056            fixed, expected,
1057            "Should not fix names inside HTML comments when disabled"
1058        );
1059    }
1060
1061    #[test]
1062    fn test_proper_names_in_links_not_flagged() {
1063        let rule = MD044ProperNames::new(
1064            vec!["JavaScript".to_string(), "Node.js".to_string(), "Python".to_string()],
1065            true,
1066        );
1067
1068        let content = r#"Check this [javascript documentation](https://javascript.info) for info.
1069
1070Visit [node.js homepage](https://nodejs.org) and [python tutorial](https://python.org).
1071
1072Real javascript should be flagged.
1073
1074Also see the [typescript guide][ts-ref] for more.
1075
1076Real python should be flagged too.
1077
1078[ts-ref]: https://typescript.org/handbook"#;
1079
1080        let ctx = create_context(content);
1081        let result = rule.check(&ctx).unwrap();
1082
1083        // Only the real standalone proper names should be flagged
1084        assert_eq!(
1085            result.len(),
1086            2,
1087            "Expected exactly 2 warnings for standalone proper names"
1088        );
1089        assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
1090        assert!(result[1].message.contains("'python' should be 'Python'"));
1091        // Should be on lines with standalone instances
1092        assert!(result[0].line == 5); // "Real javascript should be flagged."
1093        assert!(result[1].line == 9); // "Real python should be flagged too."
1094    }
1095
1096    #[test]
1097    fn test_proper_names_in_images_not_flagged() {
1098        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1099
1100        let content = r#"Here is a ![javascript logo](javascript.png "javascript icon") image.
1101
1102Real javascript should be flagged."#;
1103
1104        let ctx = create_context(content);
1105        let result = rule.check(&ctx).unwrap();
1106
1107        // Only the standalone proper name should be flagged
1108        assert_eq!(result.len(), 1, "Expected exactly 1 warning for standalone proper name");
1109        assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
1110        assert!(result[0].line == 3); // "Real javascript should be flagged."
1111    }
1112
1113    #[test]
1114    fn test_proper_names_in_reference_definitions_not_flagged() {
1115        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "TypeScript".to_string()], true);
1116
1117        let content = r#"Check the [javascript guide][js-ref] for details.
1118
1119Real javascript should be flagged.
1120
1121[js-ref]: https://javascript.info/typescript/guide"#;
1122
1123        let ctx = create_context(content);
1124        let result = rule.check(&ctx).unwrap();
1125
1126        // Only the standalone proper name should be flagged
1127        assert_eq!(result.len(), 1, "Expected exactly 1 warning for standalone proper name");
1128        assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
1129        assert!(result[0].line == 3); // "Real javascript should be flagged."
1130    }
1131}
rumdl_lib/rules/md044_proper_names.rs

rumdl_lib/rules/
md044_proper_names.rs