rumdl_lib/rules/
md044_proper_names.rs

1use crate::utils::fast_hash;
2use crate::utils::regex_cache::{escape_regex, get_cached_fancy_regex};
3
4use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, Severity};
5use fancy_regex::Regex;
6use lazy_static::lazy_static;
7use std::collections::HashMap;
8use std::sync::{Arc, Mutex};
9
10mod md044_config;
11use md044_config::MD044Config;
12
lazy_static! {
    /// Matches one complete HTML comment (`<!-- ... -->`); `[\s\S]*?` lets the
    /// body span several lines while stopping at the first closing `-->`.
    static ref HTML_COMMENT_REGEX: Regex = Regex::new(r"<!--([\s\S]*?)-->").unwrap();
    /// Matches a reference definition line of the form `[ref]: url "title"`:
    /// up to three leading spaces, a bracketed label, a URL without whitespace,
    /// and an optional single- or double-quoted title. `(?m)` anchors per line.
    static ref REF_DEF_REGEX: regex::Regex = regex::Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();
}
20
/// One violation as `(line, column, found_name)`: the 1-based line number, the
/// 1-based byte column of the match within that line, and the text as written.
type WarningPosition = (usize, usize, String);
22
23/// Rule MD044: Proper names should be capitalized
24///
25/// See [docs/md044.md](../../docs/md044.md) for full documentation, configuration, and examples.
26///
27/// This rule is triggered when proper names are not capitalized correctly in the document.
28/// For example, if you have defined "JavaScript" as a proper name, the rule will flag any
29/// occurrences of "javascript" or "Javascript" as violations.
30///
31/// ## Purpose
32///
33/// Ensuring consistent capitalization of proper names improves document quality and
34/// professionalism. This is especially important for technical documentation where
35/// product names, programming languages, and technologies often have specific
36/// capitalization conventions.
37///
38/// ## Configuration Options
39///
40/// The rule supports the following configuration options:
41///
42/// ```yaml
43/// MD044:
44///   names: []                # List of proper names to check for correct capitalization
45///   code-blocks: false       # Whether to check code blocks (default: false)
46/// ```
47///
48/// Example configuration:
49///
50/// ```yaml
51/// MD044:
52///   names: ["JavaScript", "Node.js", "TypeScript"]
53///   code-blocks: true
54/// ```
55///
56/// ## Performance Optimizations
57///
58/// This rule implements several performance optimizations:
59///
60/// 1. **Regex Caching**: Pre-compiles and caches regex patterns for each proper name
61/// 2. **Content Caching**: Caches results based on content hashing for repeated checks
62/// 3. **Efficient Text Processing**: Uses optimized algorithms to avoid redundant text processing
63/// 4. **Smart Code Block Detection**: Efficiently identifies and optionally excludes code blocks
64///
65/// ## Edge Cases Handled
66///
67/// - **Word Boundaries**: Only matches complete words, not substrings within other words
68/// - **Case Sensitivity**: Properly handles case-specific matching
69/// - **Code Blocks**: Optionally checks code blocks (controlled by code-blocks setting)
70/// - **Markdown Formatting**: Handles proper names within Markdown formatting elements
71///
72/// ## Fix Behavior
73///
74/// When fixing issues, this rule replaces incorrect capitalization with the correct form
75/// as defined in the configuration.
76///
#[derive(Clone)]
pub struct MD044ProperNames {
    /// Rule settings: the proper names plus the code-block and HTML-comment switches.
    config: MD044Config,
    /// Source of the combined case-insensitive regex covering every name variant;
    /// `None` when no names are configured.
    combined_pattern: Option<String>,
    /// Previously computed violations, keyed by a hash of the document content.
    /// Held behind `Arc`, so clones of the rule share one cache.
    content_cache: Arc<Mutex<HashMap<u64, Vec<WarningPosition>>>>,
}
85
86impl MD044ProperNames {
87    pub fn new(names: Vec<String>, code_blocks: bool) -> Self {
88        let config = MD044Config {
89            names,
90            code_blocks,
91            html_comments: true, // Default to checking HTML comments
92        };
93        let combined_pattern = Self::create_combined_pattern(&config);
94        Self {
95            config,
96            combined_pattern,
97            content_cache: Arc::new(Mutex::new(HashMap::new())),
98        }
99    }
100
101    // Helper function for consistent ASCII normalization
102    fn ascii_normalize(s: &str) -> String {
103        s.replace(['é', 'è', 'ê', 'ë'], "e")
104            .replace(['à', 'á', 'â', 'ä'], "a")
105            .replace(['ï', 'î', 'í', 'ì'], "i")
106            .replace(['ü', 'ú', 'ù', 'û'], "u")
107            .replace(['ö', 'ó', 'ò', 'ô'], "o")
108            .replace('ñ', "n")
109            .replace('ç', "c")
110    }
111
112    pub fn from_config_struct(config: MD044Config) -> Self {
113        let combined_pattern = Self::create_combined_pattern(&config);
114        Self {
115            config,
116            combined_pattern,
117            content_cache: Arc::new(Mutex::new(HashMap::new())),
118        }
119    }
120
121    // Create a combined regex pattern for all proper names
122    fn create_combined_pattern(config: &MD044Config) -> Option<String> {
123        if config.names.is_empty() {
124            return None;
125        }
126
127        // Create patterns for all names and their variations
128        let mut patterns: Vec<String> = config
129            .names
130            .iter()
131            .flat_map(|name| {
132                let mut variations = vec![];
133                let lower_name = name.to_lowercase();
134
135                // Add the lowercase version
136                variations.push(escape_regex(&lower_name));
137
138                // Add version without dots
139                let lower_name_no_dots = lower_name.replace('.', "");
140                if lower_name != lower_name_no_dots {
141                    variations.push(escape_regex(&lower_name_no_dots));
142                }
143
144                // Add ASCII-normalized versions for common accented characters
145                let ascii_normalized = Self::ascii_normalize(&lower_name);
146
147                if ascii_normalized != lower_name {
148                    variations.push(escape_regex(&ascii_normalized));
149
150                    // Also add version without dots
151                    let ascii_no_dots = ascii_normalized.replace('.', "");
152                    if ascii_normalized != ascii_no_dots {
153                        variations.push(escape_regex(&ascii_no_dots));
154                    }
155                }
156
157                variations
158            })
159            .collect();
160
161        // Sort patterns by length (longest first) to avoid shorter patterns matching within longer ones
162        patterns.sort_by_key(|b| std::cmp::Reverse(b.len()));
163
164        // Combine all patterns into a single regex with capture groups
165        // Don't use \b as it doesn't work with Unicode - we'll check boundaries manually
166        Some(format!(r"(?i)({})", patterns.join("|")))
167    }
168
169    // Find all name violations in the content and return positions
170    fn find_name_violations(&self, content: &str, ctx: &crate::lint_context::LintContext) -> Vec<WarningPosition> {
171        // Early return: if no names configured or content is empty
172        if self.config.names.is_empty() || content.is_empty() || self.combined_pattern.is_none() {
173            return Vec::new();
174        }
175
176        // Early return: quick check if any of the configured names might be in content
177        let content_lower = content.to_lowercase();
178        let has_potential_matches = self.config.names.iter().any(|name| {
179            let name_lower = name.to_lowercase();
180            let name_no_dots = name_lower.replace('.', "");
181
182            // Check direct match
183            if content_lower.contains(&name_lower) || content_lower.contains(&name_no_dots) {
184                return true;
185            }
186
187            // Also check ASCII-normalized version
188            let ascii_normalized = Self::ascii_normalize(&name_lower);
189
190            if ascii_normalized != name_lower {
191                if content_lower.contains(&ascii_normalized) {
192                    return true;
193                }
194                let ascii_no_dots = ascii_normalized.replace('.', "");
195                if ascii_normalized != ascii_no_dots && content_lower.contains(&ascii_no_dots) {
196                    return true;
197                }
198            }
199
200            false
201        });
202
203        if !has_potential_matches {
204            return Vec::new();
205        }
206
207        // Check if we have cached results
208        let hash = fast_hash(content);
209        {
210            // Use a separate scope for borrowing to minimize lock time
211            let cache = self.content_cache.lock().unwrap();
212            if let Some(cached) = cache.get(&hash) {
213                return cached.clone();
214            }
215        }
216
217        let mut violations = Vec::new();
218
219        // Get the regex from global cache
220        let combined_regex = match &self.combined_pattern {
221            Some(pattern) => match get_cached_fancy_regex(pattern) {
222                Ok(regex) => regex,
223                Err(_) => return Vec::new(),
224            },
225            None => return Vec::new(),
226        };
227
228        // Use ctx.lines for better performance
229        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
230            let line_num = line_idx + 1;
231            let line = &line_info.content;
232
233            // Skip code fence lines (```language or ~~~language)
234            let trimmed = line.trim_start();
235            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
236                continue;
237            }
238
239            // Skip if in code block (when code_blocks = false)
240            if !self.config.code_blocks && line_info.in_code_block {
241                continue;
242            }
243
244            // Check if we should skip HTML comments
245            let in_html_comment = if !self.config.html_comments {
246                // Check if this position is within an HTML comment
247                self.is_in_html_comment(content, line_info.byte_offset)
248            } else {
249                false
250            };
251
252            if in_html_comment {
253                continue;
254            }
255
256            // Early return: skip lines that don't contain any potential matches
257            let line_lower = line.to_lowercase();
258            let has_line_matches = self.config.names.iter().any(|name| {
259                let name_lower = name.to_lowercase();
260                let name_no_dots = name_lower.replace('.', "");
261
262                // Check direct match
263                if line_lower.contains(&name_lower) || line_lower.contains(&name_no_dots) {
264                    return true;
265                }
266
267                // Also check ASCII-normalized version
268                let ascii_normalized = Self::ascii_normalize(&name_lower);
269                if ascii_normalized != name_lower {
270                    if line_lower.contains(&ascii_normalized) {
271                        return true;
272                    }
273                    let ascii_no_dots = ascii_normalized.replace('.', "");
274                    if ascii_normalized != ascii_no_dots && line_lower.contains(&ascii_no_dots) {
275                        return true;
276                    }
277                }
278
279                false
280            });
281
282            if !has_line_matches {
283                continue;
284            }
285
286            // Use the combined regex to find all matches in one pass
287            for cap_result in combined_regex.find_iter(line) {
288                match cap_result {
289                    Ok(cap) => {
290                        let found_name = &line[cap.start()..cap.end()];
291
292                        // Check word boundaries manually for Unicode support
293                        let start_pos = cap.start();
294                        let end_pos = cap.end();
295
296                        if !self.is_at_word_boundary(line, start_pos, true)
297                            || !self.is_at_word_boundary(line, end_pos, false)
298                        {
299                            continue; // Not at word boundary
300                        }
301
302                        // Skip if in inline code when code_blocks is false
303                        if !self.config.code_blocks {
304                            let byte_pos = line_info.byte_offset + cap.start();
305                            if ctx.is_in_code_block_or_span(byte_pos) {
306                                continue;
307                            }
308                        }
309
310                        // Skip if in link (inline links, reference links, or reference definitions)
311                        let byte_pos = line_info.byte_offset + cap.start();
312                        if self.is_in_link(ctx, byte_pos) {
313                            continue;
314                        }
315
316                        // Find which proper name this matches
317                        if let Some(proper_name) = self.get_proper_name_for(found_name) {
318                            // Only flag if it's not already correct
319                            if found_name != proper_name {
320                                violations.push((line_num, cap.start() + 1, found_name.to_string()));
321                            }
322                        }
323                    }
324                    Err(e) => {
325                        eprintln!("Regex execution error on line {line_num}: {e}");
326                    }
327                }
328            }
329        }
330
331        // Store in cache
332        self.content_cache.lock().unwrap().insert(hash, violations.clone());
333        violations
334    }
335
336    // Check if a byte position is within an HTML comment
337    fn is_in_html_comment(&self, content: &str, byte_pos: usize) -> bool {
338        for m in HTML_COMMENT_REGEX.find_iter(content).flatten() {
339            if m.start() <= byte_pos && byte_pos < m.end() {
340                return true;
341            }
342        }
343        false
344    }
345
346    /// Check if a byte position is within a link (inline links, reference links, or reference definitions)
347    fn is_in_link(&self, ctx: &crate::lint_context::LintContext, byte_pos: usize) -> bool {
348        // Check inline and reference links
349        for link in &ctx.links {
350            if link.byte_offset <= byte_pos && byte_pos < link.byte_end {
351                return true;
352            }
353        }
354
355        // Check images (which use similar syntax)
356        for image in &ctx.images {
357            if image.byte_offset <= byte_pos && byte_pos < image.byte_end {
358                return true;
359            }
360        }
361
362        // Check reference definitions [ref]: url "title" using regex pattern
363        for m in REF_DEF_REGEX.find_iter(ctx.content) {
364            if m.start() <= byte_pos && byte_pos < m.end() {
365                return true;
366            }
367        }
368
369        false
370    }
371
    /// Returns `true` when `c` cannot be part of a word, i.e. it is not alphanumeric.
    ///
    /// Relies on `char::is_alphanumeric`, so non-ASCII letters and digits count
    /// as word characters too.
    fn is_word_boundary_char(c: char) -> bool {
        !c.is_alphanumeric()
    }
376
377    // Check if position is at a word boundary
378    fn is_at_word_boundary(&self, content: &str, pos: usize, is_start: bool) -> bool {
379        let chars: Vec<char> = content.chars().collect();
380        let char_indices: Vec<(usize, char)> = content.char_indices().collect();
381
382        // Find the character position
383        let char_pos = char_indices.iter().position(|(idx, _)| *idx == pos);
384        if char_pos.is_none() {
385            return true; // If we can't find position, assume boundary
386        }
387        let char_pos = char_pos.unwrap();
388
389        if is_start {
390            // Check character before position
391            if char_pos == 0 {
392                return true; // Start of string
393            }
394            Self::is_word_boundary_char(chars[char_pos - 1])
395        } else {
396            // Check character after position
397            if char_pos >= chars.len() {
398                return true; // End of string
399            }
400            Self::is_word_boundary_char(chars[char_pos])
401        }
402    }
403
404    // Get the proper name that should be used for a found name
405    fn get_proper_name_for(&self, found_name: &str) -> Option<String> {
406        let found_lower = found_name.to_lowercase();
407
408        // Iterate through the configured proper names
409        for name in &self.config.names {
410            let lower_name = name.to_lowercase();
411            let lower_name_no_dots = lower_name.replace('.', "");
412
413            // Direct match
414            if found_lower == lower_name || found_lower == lower_name_no_dots {
415                return Some(name.clone());
416            }
417
418            // Check ASCII-normalized version
419            let ascii_normalized = Self::ascii_normalize(&lower_name);
420
421            let ascii_no_dots = ascii_normalized.replace('.', "");
422
423            if found_lower == ascii_normalized || found_lower == ascii_no_dots {
424                return Some(name.clone());
425            }
426        }
427        None
428    }
429}
430
431impl Rule for MD044ProperNames {
    /// Identifier of this rule; used as the `rule_name` of emitted warnings and
    /// as the key of its configuration section.
    fn name(&self) -> &'static str {
        "MD044"
    }
435
    /// One-line, human-readable summary of what the rule enforces.
    fn description(&self) -> &'static str {
        "Proper names should have the correct capitalization"
    }
439
440    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
441        if self.config.names.is_empty() {
442            return true;
443        }
444        // Quick check if any configured names exist (case-insensitive)
445        let content_lower = ctx.content.to_lowercase();
446        !self
447            .config
448            .names
449            .iter()
450            .any(|name| content_lower.contains(&name.to_lowercase()))
451    }
452
453    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
454        let content = ctx.content;
455        if content.is_empty() || self.config.names.is_empty() || self.combined_pattern.is_none() {
456            return Ok(Vec::new());
457        }
458
459        // Early return: quick check if any of the configured names might be in content
460        let content_lower = content.to_lowercase();
461        let has_potential_matches = self.config.names.iter().any(|name| {
462            let name_lower = name.to_lowercase();
463            let name_no_dots = name_lower.replace('.', "");
464
465            // Check direct match
466            if content_lower.contains(&name_lower) || content_lower.contains(&name_no_dots) {
467                return true;
468            }
469
470            // Also check ASCII-normalized version
471            let ascii_normalized = Self::ascii_normalize(&name_lower);
472
473            if ascii_normalized != name_lower {
474                if content_lower.contains(&ascii_normalized) {
475                    return true;
476                }
477                let ascii_no_dots = ascii_normalized.replace('.', "");
478                if ascii_normalized != ascii_no_dots && content_lower.contains(&ascii_no_dots) {
479                    return true;
480                }
481            }
482
483            false
484        });
485
486        if !has_potential_matches {
487            return Ok(Vec::new());
488        }
489
490        let line_index = &ctx.line_index;
491        let violations = self.find_name_violations(content, ctx);
492
493        let warnings = violations
494            .into_iter()
495            .filter_map(|(line, column, found_name)| {
496                self.get_proper_name_for(&found_name).map(|proper_name| LintWarning {
497                    rule_name: Some(self.name().to_string()),
498                    line,
499                    column,
500                    end_line: line,
501                    end_column: column + found_name.len(),
502                    message: format!("Proper name '{found_name}' should be '{proper_name}'"),
503                    severity: Severity::Warning,
504                    fix: Some(Fix {
505                        range: line_index.line_col_to_byte_range(line, column),
506                        replacement: proper_name,
507                    }),
508                })
509            })
510            .collect();
511
512        Ok(warnings)
513    }
514
515    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
516        let content = ctx.content;
517        if content.is_empty() || self.config.names.is_empty() {
518            return Ok(content.to_string());
519        }
520
521        let violations = self.find_name_violations(content, ctx);
522        if violations.is_empty() {
523            return Ok(content.to_string());
524        }
525
526        // Process lines and build the fixed content
527        let mut fixed_lines = Vec::new();
528
529        // Group violations by line
530        let mut violations_by_line: HashMap<usize, Vec<(usize, String)>> = HashMap::new();
531        for (line_num, col_num, found_name) in violations {
532            violations_by_line
533                .entry(line_num)
534                .or_default()
535                .push((col_num, found_name));
536        }
537
538        // Sort violations within each line in reverse order
539        for violations in violations_by_line.values_mut() {
540            violations.sort_by_key(|b| std::cmp::Reverse(b.0));
541        }
542
543        // Process each line
544        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
545            let line_num = line_idx + 1;
546
547            if let Some(line_violations) = violations_by_line.get(&line_num) {
548                // This line has violations, fix them
549                let mut fixed_line = line_info.content.clone();
550
551                for (col_num, found_name) in line_violations {
552                    if let Some(proper_name) = self.get_proper_name_for(found_name) {
553                        let start_col = col_num - 1; // Convert to 0-based
554                        let end_col = start_col + found_name.len();
555
556                        if end_col <= fixed_line.len()
557                            && fixed_line.is_char_boundary(start_col)
558                            && fixed_line.is_char_boundary(end_col)
559                        {
560                            fixed_line.replace_range(start_col..end_col, &proper_name);
561                        }
562                    }
563                }
564
565                fixed_lines.push(fixed_line);
566            } else {
567                // No violations on this line, keep it as is
568                fixed_lines.push(line_info.content.clone());
569            }
570        }
571
572        // Join lines with newlines, preserving the original ending
573        let mut result = fixed_lines.join("\n");
574        if content.ends_with('\n') && !result.ends_with('\n') {
575            result.push('\n');
576        }
577        Ok(result)
578    }
579
    /// Exposes the rule as `&dyn Any` (presumably so callers can downcast to the
    /// concrete rule type — confirm against callers).
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
583
584    fn default_config_section(&self) -> Option<(String, toml::Value)> {
585        let json_value = serde_json::to_value(&self.config).ok()?;
586        Some((
587            self.name().to_string(),
588            crate::rule_config_serde::json_to_toml_value(&json_value)?,
589        ))
590    }
591
592    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
593    where
594        Self: Sized,
595    {
596        let rule_config = crate::rule_config_serde::load_rule_config::<MD044Config>(config);
597        Box::new(Self::from_config_struct(rule_config))
598    }
599}
600
601#[cfg(test)]
602mod tests {
603    use super::*;
604    use crate::lint_context::LintContext;
605
606    fn create_context(content: &str) -> LintContext<'_> {
607        LintContext::new(content, crate::config::MarkdownFlavor::Standard)
608    }
609
610    #[test]
611    fn test_correctly_capitalized_names() {
612        let rule = MD044ProperNames::new(
613            vec![
614                "JavaScript".to_string(),
615                "TypeScript".to_string(),
616                "Node.js".to_string(),
617            ],
618            true,
619        );
620
621        let content = "This document uses JavaScript, TypeScript, and Node.js correctly.";
622        let ctx = create_context(content);
623        let result = rule.check(&ctx).unwrap();
624        assert!(result.is_empty(), "Should not flag correctly capitalized names");
625    }
626
627    #[test]
628    fn test_incorrectly_capitalized_names() {
629        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "TypeScript".to_string()], true);
630
631        let content = "This document uses javascript and typescript incorrectly.";
632        let ctx = create_context(content);
633        let result = rule.check(&ctx).unwrap();
634
635        assert_eq!(result.len(), 2, "Should flag two incorrect capitalizations");
636        assert_eq!(result[0].message, "Proper name 'javascript' should be 'JavaScript'");
637        assert_eq!(result[0].line, 1);
638        assert_eq!(result[0].column, 20);
639        assert_eq!(result[1].message, "Proper name 'typescript' should be 'TypeScript'");
640        assert_eq!(result[1].line, 1);
641        assert_eq!(result[1].column, 35);
642    }
643
644    #[test]
645    fn test_names_at_beginning_of_sentences() {
646        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "Python".to_string()], true);
647
648        let content = "javascript is a great language. python is also popular.";
649        let ctx = create_context(content);
650        let result = rule.check(&ctx).unwrap();
651
652        assert_eq!(result.len(), 2, "Should flag names at beginning of sentences");
653        assert_eq!(result[0].line, 1);
654        assert_eq!(result[0].column, 1);
655        assert_eq!(result[1].line, 1);
656        assert_eq!(result[1].column, 33);
657    }
658
659    #[test]
660    fn test_names_in_code_blocks_checked_by_default() {
661        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
662
663        let content = r#"Here is some text with JavaScript.
664
665```javascript
666// This javascript should be checked
667const lang = "javascript";
668```
669
670But this javascript should be flagged."#;
671
672        let ctx = create_context(content);
673        let result = rule.check(&ctx).unwrap();
674
675        assert_eq!(result.len(), 3, "Should flag javascript inside and outside code blocks");
676        assert_eq!(result[0].line, 4);
677        assert_eq!(result[1].line, 5);
678        assert_eq!(result[2].line, 8);
679    }
680
681    #[test]
682    fn test_names_in_code_blocks_ignored_when_disabled() {
683        let rule = MD044ProperNames::new(
684            vec!["JavaScript".to_string()],
685            false, // code_blocks = false means skip code blocks
686        );
687
688        let content = r#"```
689javascript in code block
690```"#;
691
692        let ctx = create_context(content);
693        let result = rule.check(&ctx).unwrap();
694
695        assert_eq!(
696            result.len(),
697            0,
698            "Should not flag javascript in code blocks when code_blocks is false"
699        );
700    }
701
702    #[test]
703    fn test_names_in_inline_code_checked_by_default() {
704        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
705
706        let content = "This is `javascript` in inline code and javascript outside.";
707        let ctx = create_context(content);
708        let result = rule.check(&ctx).unwrap();
709
710        // When code_blocks=true, inline code should be checked
711        assert_eq!(result.len(), 2, "Should flag javascript inside and outside inline code");
712        assert_eq!(result[0].column, 10); // javascript in inline code
713        assert_eq!(result[1].column, 41); // javascript outside
714    }
715
716    #[test]
717    fn test_multiple_names_in_same_line() {
718        let rule = MD044ProperNames::new(
719            vec!["JavaScript".to_string(), "TypeScript".to_string(), "React".to_string()],
720            true,
721        );
722
723        let content = "I use javascript, typescript, and react in my projects.";
724        let ctx = create_context(content);
725        let result = rule.check(&ctx).unwrap();
726
727        assert_eq!(result.len(), 3, "Should flag all three incorrect names");
728        assert_eq!(result[0].message, "Proper name 'javascript' should be 'JavaScript'");
729        assert_eq!(result[1].message, "Proper name 'typescript' should be 'TypeScript'");
730        assert_eq!(result[2].message, "Proper name 'react' should be 'React'");
731    }
732
733    #[test]
734    fn test_case_sensitivity() {
735        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
736
737        let content = "JAVASCRIPT, Javascript, javascript, and JavaScript variations.";
738        let ctx = create_context(content);
739        let result = rule.check(&ctx).unwrap();
740
741        assert_eq!(result.len(), 3, "Should flag all incorrect case variations");
742        // JavaScript (correct) should not be flagged
743        assert!(result.iter().all(|w| w.message.contains("should be 'JavaScript'")));
744    }
745
746    #[test]
747    fn test_configuration_with_custom_name_list() {
748        let config = MD044Config {
749            names: vec!["GitHub".to_string(), "GitLab".to_string(), "DevOps".to_string()],
750            code_blocks: true,
751            html_comments: true,
752        };
753        let rule = MD044ProperNames::from_config_struct(config);
754
755        let content = "We use github, gitlab, and devops for our workflow.";
756        let ctx = create_context(content);
757        let result = rule.check(&ctx).unwrap();
758
759        assert_eq!(result.len(), 3, "Should flag all custom names");
760        assert_eq!(result[0].message, "Proper name 'github' should be 'GitHub'");
761        assert_eq!(result[1].message, "Proper name 'gitlab' should be 'GitLab'");
762        assert_eq!(result[2].message, "Proper name 'devops' should be 'DevOps'");
763    }
764
765    #[test]
766    fn test_empty_configuration() {
767        let rule = MD044ProperNames::new(vec![], true);
768
769        let content = "This has javascript and typescript but no configured names.";
770        let ctx = create_context(content);
771        let result = rule.check(&ctx).unwrap();
772
773        assert!(result.is_empty(), "Should not flag anything with empty configuration");
774    }
775
776    #[test]
777    fn test_names_with_special_characters() {
778        let rule = MD044ProperNames::new(
779            vec!["Node.js".to_string(), "ASP.NET".to_string(), "C++".to_string()],
780            true,
781        );
782
783        let content = "We use nodejs, asp.net, ASP.NET, and c++ in our stack.";
784        let ctx = create_context(content);
785        let result = rule.check(&ctx).unwrap();
786
787        // nodejs should match Node.js (dotless variation)
788        // asp.net should be flagged (wrong case)
789        // ASP.NET should not be flagged (correct)
790        // c++ should be flagged
791        assert_eq!(result.len(), 3, "Should handle special characters correctly");
792
793        let messages: Vec<&str> = result.iter().map(|w| w.message.as_str()).collect();
794        assert!(messages.contains(&"Proper name 'nodejs' should be 'Node.js'"));
795        assert!(messages.contains(&"Proper name 'asp.net' should be 'ASP.NET'"));
796        assert!(messages.contains(&"Proper name 'c++' should be 'C++'"));
797    }
798
799    #[test]
800    fn test_word_boundaries() {
801        let rule = MD044ProperNames::new(vec!["Java".to_string(), "Script".to_string()], true);
802
803        let content = "JavaScript is not java or script, but Java and Script are separate.";
804        let ctx = create_context(content);
805        let result = rule.check(&ctx).unwrap();
806
807        // Should only flag lowercase "java" and "script" as separate words
808        assert_eq!(result.len(), 2, "Should respect word boundaries");
809        assert!(result.iter().any(|w| w.column == 19)); // "java" position
810        assert!(result.iter().any(|w| w.column == 27)); // "script" position
811    }
812
813    #[test]
814    fn test_fix_method() {
815        let rule = MD044ProperNames::new(
816            vec![
817                "JavaScript".to_string(),
818                "TypeScript".to_string(),
819                "Node.js".to_string(),
820            ],
821            true,
822        );
823
824        let content = "I love javascript, typescript, and nodejs!";
825        let ctx = create_context(content);
826        let fixed = rule.fix(&ctx).unwrap();
827
828        assert_eq!(fixed, "I love JavaScript, TypeScript, and Node.js!");
829    }
830
831    #[test]
832    fn test_fix_multiple_occurrences() {
833        let rule = MD044ProperNames::new(vec!["Python".to_string()], true);
834
835        let content = "python is great. I use python daily. PYTHON is powerful.";
836        let ctx = create_context(content);
837        let fixed = rule.fix(&ctx).unwrap();
838
839        assert_eq!(fixed, "Python is great. I use Python daily. Python is powerful.");
840    }
841
842    #[test]
843    fn test_fix_checks_code_blocks_by_default() {
844        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
845
846        let content = r#"I love javascript.
847
848```
849const lang = "javascript";
850```
851
852More javascript here."#;
853
854        let ctx = create_context(content);
855        let fixed = rule.fix(&ctx).unwrap();
856
857        let expected = r#"I love JavaScript.
858
859```
860const lang = "JavaScript";
861```
862
863More JavaScript here."#;
864
865        assert_eq!(fixed, expected);
866    }
867
868    #[test]
869    fn test_multiline_content() {
870        let rule = MD044ProperNames::new(vec!["Rust".to_string(), "Python".to_string()], true);
871
872        let content = r#"First line with rust.
873Second line with python.
874Third line with RUST and PYTHON."#;
875
876        let ctx = create_context(content);
877        let result = rule.check(&ctx).unwrap();
878
879        assert_eq!(result.len(), 4, "Should flag all incorrect occurrences");
880        assert_eq!(result[0].line, 1);
881        assert_eq!(result[1].line, 2);
882        assert_eq!(result[2].line, 3);
883        assert_eq!(result[3].line, 3);
884    }
885
886    #[test]
887    fn test_default_config() {
888        let config = MD044Config::default();
889        assert!(config.names.is_empty());
890        assert!(!config.code_blocks); // Default is false (skip code blocks)
891    }
892
893    #[test]
894    fn test_performance_with_many_names() {
895        let mut names = vec![];
896        for i in 0..50 {
897            names.push(format!("ProperName{i}"));
898        }
899
900        let rule = MD044ProperNames::new(names, true);
901
902        let content = "This has propername0, propername25, and propername49 incorrectly.";
903        let ctx = create_context(content);
904        let result = rule.check(&ctx).unwrap();
905
906        assert_eq!(result.len(), 3, "Should handle many configured names efficiently");
907    }
908
909    #[test]
910    fn test_large_name_count_performance() {
911        // Verify MD044 can handle large numbers of names without regex limitations
912        // This test confirms that fancy-regex handles large patterns well
913        let names = (0..1000).map(|i| format!("ProperName{i}")).collect::<Vec<_>>();
914
915        let rule = MD044ProperNames::new(names, true);
916
917        // The combined pattern should be created successfully
918        assert!(rule.combined_pattern.is_some());
919
920        // Should be able to check content without errors
921        let content = "This has propername0 and propername999 in it.";
922        let ctx = create_context(content);
923        let result = rule.check(&ctx).unwrap();
924
925        // Should detect both incorrect names
926        assert_eq!(result.len(), 2, "Should handle 1000 names without issues");
927    }
928
929    #[test]
930    fn test_cache_behavior() {
931        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
932
933        let content = "Using javascript here.";
934        let ctx = create_context(content);
935
936        // First check
937        let result1 = rule.check(&ctx).unwrap();
938        assert_eq!(result1.len(), 1);
939
940        // Second check should use cache
941        let result2 = rule.check(&ctx).unwrap();
942        assert_eq!(result2.len(), 1);
943
944        // Results should be identical
945        assert_eq!(result1[0].line, result2[0].line);
946        assert_eq!(result1[0].column, result2[0].column);
947    }
948
949    #[test]
950    fn test_html_comments_not_checked_when_disabled() {
951        let config = MD044Config {
952            names: vec!["JavaScript".to_string()],
953            code_blocks: true,    // Check code blocks
954            html_comments: false, // Don't check HTML comments
955        };
956        let rule = MD044ProperNames::from_config_struct(config);
957
958        let content = r#"Regular javascript here.
959<!-- This javascript in HTML comment should be ignored -->
960More javascript outside."#;
961
962        let ctx = create_context(content);
963        let result = rule.check(&ctx).unwrap();
964
965        assert_eq!(result.len(), 2, "Should only flag javascript outside HTML comments");
966        assert_eq!(result[0].line, 1);
967        assert_eq!(result[1].line, 3);
968    }
969
970    #[test]
971    fn test_html_comments_checked_when_enabled() {
972        let config = MD044Config {
973            names: vec!["JavaScript".to_string()],
974            code_blocks: true,   // Check code blocks
975            html_comments: true, // Check HTML comments
976        };
977        let rule = MD044ProperNames::from_config_struct(config);
978
979        let content = r#"Regular javascript here.
980<!-- This javascript in HTML comment should be checked -->
981More javascript outside."#;
982
983        let ctx = create_context(content);
984        let result = rule.check(&ctx).unwrap();
985
986        assert_eq!(
987            result.len(),
988            3,
989            "Should flag all javascript occurrences including in HTML comments"
990        );
991    }
992
993    #[test]
994    fn test_multiline_html_comments() {
995        let config = MD044Config {
996            names: vec!["Python".to_string(), "JavaScript".to_string()],
997            code_blocks: true,    // Check code blocks
998            html_comments: false, // Don't check HTML comments
999        };
1000        let rule = MD044ProperNames::from_config_struct(config);
1001
1002        let content = r#"Regular python here.
1003<!--
1004This is a multiline comment
1005with javascript and python
1006that should be ignored
1007-->
1008More javascript outside."#;
1009
1010        let ctx = create_context(content);
1011        let result = rule.check(&ctx).unwrap();
1012
1013        assert_eq!(result.len(), 2, "Should only flag names outside HTML comments");
1014        assert_eq!(result[0].line, 1); // python
1015        assert_eq!(result[1].line, 7); // javascript
1016    }
1017
1018    #[test]
1019    fn test_fix_preserves_html_comments_when_disabled() {
1020        let config = MD044Config {
1021            names: vec!["JavaScript".to_string()],
1022            code_blocks: true,    // Check code blocks
1023            html_comments: false, // Don't check HTML comments
1024        };
1025        let rule = MD044ProperNames::from_config_struct(config);
1026
1027        let content = r#"javascript here.
1028<!-- javascript in comment -->
1029More javascript."#;
1030
1031        let ctx = create_context(content);
1032        let fixed = rule.fix(&ctx).unwrap();
1033
1034        let expected = r#"JavaScript here.
1035<!-- javascript in comment -->
1036More JavaScript."#;
1037
1038        assert_eq!(
1039            fixed, expected,
1040            "Should not fix names inside HTML comments when disabled"
1041        );
1042    }
1043
1044    #[test]
1045    fn test_proper_names_in_links_not_flagged() {
1046        let rule = MD044ProperNames::new(
1047            vec!["JavaScript".to_string(), "Node.js".to_string(), "Python".to_string()],
1048            true,
1049        );
1050
1051        let content = r#"Check this [javascript documentation](https://javascript.info) for info.
1052
1053Visit [node.js homepage](https://nodejs.org) and [python tutorial](https://python.org).
1054
1055Real javascript should be flagged.
1056
1057Also see the [typescript guide][ts-ref] for more.
1058
1059Real python should be flagged too.
1060
1061[ts-ref]: https://typescript.org/handbook"#;
1062
1063        let ctx = create_context(content);
1064        let result = rule.check(&ctx).unwrap();
1065
1066        // Only the real standalone proper names should be flagged
1067        assert_eq!(
1068            result.len(),
1069            2,
1070            "Expected exactly 2 warnings for standalone proper names"
1071        );
1072        assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
1073        assert!(result[1].message.contains("'python' should be 'Python'"));
1074        // Should be on lines with standalone instances
1075        assert!(result[0].line == 5); // "Real javascript should be flagged."
1076        assert!(result[1].line == 9); // "Real python should be flagged too."
1077    }
1078
1079    #[test]
1080    fn test_proper_names_in_images_not_flagged() {
1081        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1082
1083        let content = r#"Here is a ![javascript logo](javascript.png "javascript icon") image.
1084
1085Real javascript should be flagged."#;
1086
1087        let ctx = create_context(content);
1088        let result = rule.check(&ctx).unwrap();
1089
1090        // Only the standalone proper name should be flagged
1091        assert_eq!(result.len(), 1, "Expected exactly 1 warning for standalone proper name");
1092        assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
1093        assert!(result[0].line == 3); // "Real javascript should be flagged."
1094    }
1095
1096    #[test]
1097    fn test_proper_names_in_reference_definitions_not_flagged() {
1098        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "TypeScript".to_string()], true);
1099
1100        let content = r#"Check the [javascript guide][js-ref] for details.
1101
1102Real javascript should be flagged.
1103
1104[js-ref]: https://javascript.info/typescript/guide"#;
1105
1106        let ctx = create_context(content);
1107        let result = rule.check(&ctx).unwrap();
1108
1109        // Only the standalone proper name should be flagged
1110        assert_eq!(result.len(), 1, "Expected exactly 1 warning for standalone proper name");
1111        assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
1112        assert!(result[0].line == 3); // "Real javascript should be flagged."
1113    }
1114}
rumdl_lib/rules/md044_proper_names.rs

rumdl_lib/rules/
md044_proper_names.rs