rumdl_lib/rules/
md044_proper_names.rs

1use crate::utils::fast_hash;
2use crate::utils::range_utils::LineIndex;
3use crate::utils::regex_cache::{escape_regex, get_cached_fancy_regex};
4
5use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, Severity};
6use fancy_regex::Regex;
7use lazy_static::lazy_static;
8use std::collections::HashMap;
9use std::sync::{Arc, Mutex};
10
11mod md044_config;
12use md044_config::MD044Config;
13
lazy_static! {
    // Matches a complete HTML comment, `<!--` through the nearest `-->`.
    // `[\s\S]*?` is lazy and also matches newlines, so multi-line comments are covered.
    static ref HTML_COMMENT_REGEX: Regex = Regex::new(r"<!--([\s\S]*?)-->").unwrap();
    // Reference definition pattern - matches [ref]: url "title"
    // Multi-line mode: up to three leading spaces, a bracketed label, a colon,
    // a whitespace-free destination and an optional title in double or single quotes.
    static ref REF_DEF_REGEX: regex::Regex = regex::Regex::new(
        r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#
    ).unwrap();
}

/// A single violation as `(line, column, found_name)`.
///
/// As produced by `find_name_violations`, `line` is 1-based and `column` is the
/// 1-based byte offset of the match within that line.
type WarningPosition = (usize, usize, String); // (line, column, found_name)
23
/// Rule MD044: Proper names should be capitalized
///
/// See [docs/md044.md](../../docs/md044.md) for full documentation, configuration, and examples.
///
/// This rule is triggered when proper names are not capitalized correctly in the document.
/// For example, if you have defined "JavaScript" as a proper name, the rule will flag any
/// occurrences of "javascript" or "Javascript" as violations.
///
/// ## Purpose
///
/// Ensuring consistent capitalization of proper names improves document quality and
/// professionalism. This is especially important for technical documentation where
/// product names, programming languages, and technologies often have specific
/// capitalization conventions.
///
/// ## Configuration Options
///
/// The rule supports the following configuration options:
///
/// ```yaml
/// MD044:
///   names: []                # List of proper names to check for correct capitalization
///   code-blocks: false       # Whether to check code blocks (default: false)
/// ```
///
/// The configuration struct also carries an `html_comments` flag: when it is
/// `false`, lines that start inside an HTML comment are not checked. The
/// [`MD044ProperNames::new`] constructor always enables it. The exact key name
/// used in configuration files is defined by `MD044Config`.
///
/// Example configuration:
///
/// ```yaml
/// MD044:
///   names: ["JavaScript", "Node.js", "TypeScript"]
///   code-blocks: true
/// ```
///
/// ## Performance Optimizations
///
/// This rule implements several performance optimizations:
///
/// 1. **Regex Caching**: Builds one combined, case-insensitive pattern for all proper names
///    when the rule is constructed and fetches the compiled regex from the shared regex cache
/// 2. **Content Caching**: Caches results based on content hashing for repeated checks
/// 3. **Efficient Text Processing**: Uses cheap substring pre-checks to skip documents and
///    lines that cannot contain a configured name
/// 4. **Smart Code Block Detection**: Efficiently identifies and optionally excludes code blocks
///
/// ## Edge Cases Handled
///
/// - **Word Boundaries**: Only matches complete words, not substrings within other words
/// - **Case Sensitivity**: Properly handles case-specific matching
/// - **Code Blocks**: Optionally checks code blocks (controlled by code-blocks setting)
/// - **Markdown Formatting**: Handles proper names within Markdown formatting elements
/// - **Spelling Variants**: Dotless ("nodejs" for "Node.js") and accent-stripped spellings
///   are recognised and reported against the configured name
/// - **Links**: Matches inside links, images and reference definitions are not reported
///
/// ## Fix Behavior
///
/// When fixing issues, this rule replaces incorrect capitalization with the correct form
/// as defined in the configuration.
///
#[derive(Clone)]
pub struct MD044ProperNames {
    /// Rule settings: the proper names plus the code-block and HTML-comment switches.
    config: MD044Config,
    /// Source text of the combined regex covering every name and its variants;
    /// `None` when no names are configured.
    combined_pattern: Option<String>,
    /// Violations already computed, keyed by a hash of the document content.
    /// The map sits behind an `Arc`, so clones of the rule share the same cache.
    content_cache: Arc<Mutex<HashMap<u64, Vec<WarningPosition>>>>,
}
86
87impl MD044ProperNames {
88    pub fn new(names: Vec<String>, code_blocks: bool) -> Self {
89        let config = MD044Config {
90            names,
91            code_blocks,
92            html_comments: true, // Default to checking HTML comments
93        };
94        let combined_pattern = Self::create_combined_pattern(&config);
95        Self {
96            config,
97            combined_pattern,
98            content_cache: Arc::new(Mutex::new(HashMap::new())),
99        }
100    }
101
102    // Helper function for consistent ASCII normalization
103    fn ascii_normalize(s: &str) -> String {
104        s.replace(['é', 'è', 'ê', 'ë'], "e")
105            .replace(['à', 'á', 'â', 'ä'], "a")
106            .replace(['ï', 'î', 'í', 'ì'], "i")
107            .replace(['ü', 'ú', 'ù', 'û'], "u")
108            .replace(['ö', 'ó', 'ò', 'ô'], "o")
109            .replace('ñ', "n")
110            .replace('ç', "c")
111    }
112
113    pub fn from_config_struct(config: MD044Config) -> Self {
114        let combined_pattern = Self::create_combined_pattern(&config);
115        Self {
116            config,
117            combined_pattern,
118            content_cache: Arc::new(Mutex::new(HashMap::new())),
119        }
120    }
121
122    // Create a combined regex pattern for all proper names
123    fn create_combined_pattern(config: &MD044Config) -> Option<String> {
124        if config.names.is_empty() {
125            return None;
126        }
127
128        // Create patterns for all names and their variations
129        let mut patterns: Vec<String> = config
130            .names
131            .iter()
132            .flat_map(|name| {
133                let mut variations = vec![];
134                let lower_name = name.to_lowercase();
135
136                // Add the lowercase version
137                variations.push(escape_regex(&lower_name));
138
139                // Add version without dots
140                let lower_name_no_dots = lower_name.replace('.', "");
141                if lower_name != lower_name_no_dots {
142                    variations.push(escape_regex(&lower_name_no_dots));
143                }
144
145                // Add ASCII-normalized versions for common accented characters
146                let ascii_normalized = Self::ascii_normalize(&lower_name);
147
148                if ascii_normalized != lower_name {
149                    variations.push(escape_regex(&ascii_normalized));
150
151                    // Also add version without dots
152                    let ascii_no_dots = ascii_normalized.replace('.', "");
153                    if ascii_normalized != ascii_no_dots {
154                        variations.push(escape_regex(&ascii_no_dots));
155                    }
156                }
157
158                variations
159            })
160            .collect();
161
162        // Sort patterns by length (longest first) to avoid shorter patterns matching within longer ones
163        patterns.sort_by_key(|b| std::cmp::Reverse(b.len()));
164
165        // Combine all patterns into a single regex with capture groups
166        // Don't use \b as it doesn't work with Unicode - we'll check boundaries manually
167        Some(format!(r"(?i)({})", patterns.join("|")))
168    }
169
170    // Find all name violations in the content and return positions
171    fn find_name_violations(&self, content: &str, ctx: &crate::lint_context::LintContext) -> Vec<WarningPosition> {
172        // Early return: if no names configured or content is empty
173        if self.config.names.is_empty() || content.is_empty() || self.combined_pattern.is_none() {
174            return Vec::new();
175        }
176
177        // Early return: quick check if any of the configured names might be in content
178        let content_lower = content.to_lowercase();
179        let has_potential_matches = self.config.names.iter().any(|name| {
180            let name_lower = name.to_lowercase();
181            let name_no_dots = name_lower.replace('.', "");
182
183            // Check direct match
184            if content_lower.contains(&name_lower) || content_lower.contains(&name_no_dots) {
185                return true;
186            }
187
188            // Also check ASCII-normalized version
189            let ascii_normalized = Self::ascii_normalize(&name_lower);
190
191            if ascii_normalized != name_lower {
192                if content_lower.contains(&ascii_normalized) {
193                    return true;
194                }
195                let ascii_no_dots = ascii_normalized.replace('.', "");
196                if ascii_normalized != ascii_no_dots && content_lower.contains(&ascii_no_dots) {
197                    return true;
198                }
199            }
200
201            false
202        });
203
204        if !has_potential_matches {
205            return Vec::new();
206        }
207
208        // Check if we have cached results
209        let hash = fast_hash(content);
210        {
211            // Use a separate scope for borrowing to minimize lock time
212            let cache = self.content_cache.lock().unwrap();
213            if let Some(cached) = cache.get(&hash) {
214                return cached.clone();
215            }
216        }
217
218        let mut violations = Vec::new();
219
220        // Get the regex from global cache
221        let combined_regex = match &self.combined_pattern {
222            Some(pattern) => match get_cached_fancy_regex(pattern) {
223                Ok(regex) => regex,
224                Err(_) => return Vec::new(),
225            },
226            None => return Vec::new(),
227        };
228
229        // Use ctx.lines for better performance
230        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
231            let line_num = line_idx + 1;
232            let line = &line_info.content;
233
234            // Skip code fence lines (```language or ~~~language)
235            let trimmed = line.trim_start();
236            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
237                continue;
238            }
239
240            // Skip if in code block (when code_blocks = false)
241            if !self.config.code_blocks && line_info.in_code_block {
242                continue;
243            }
244
245            // Check if we should skip HTML comments
246            let in_html_comment = if !self.config.html_comments {
247                // Check if this position is within an HTML comment
248                self.is_in_html_comment(content, line_info.byte_offset)
249            } else {
250                false
251            };
252
253            if in_html_comment {
254                continue;
255            }
256
257            // Early return: skip lines that don't contain any potential matches
258            let line_lower = line.to_lowercase();
259            let has_line_matches = self.config.names.iter().any(|name| {
260                let name_lower = name.to_lowercase();
261                let name_no_dots = name_lower.replace('.', "");
262
263                // Check direct match
264                if line_lower.contains(&name_lower) || line_lower.contains(&name_no_dots) {
265                    return true;
266                }
267
268                // Also check ASCII-normalized version
269                let ascii_normalized = Self::ascii_normalize(&name_lower);
270                if ascii_normalized != name_lower {
271                    if line_lower.contains(&ascii_normalized) {
272                        return true;
273                    }
274                    let ascii_no_dots = ascii_normalized.replace('.', "");
275                    if ascii_normalized != ascii_no_dots && line_lower.contains(&ascii_no_dots) {
276                        return true;
277                    }
278                }
279
280                false
281            });
282
283            if !has_line_matches {
284                continue;
285            }
286
287            // Use the combined regex to find all matches in one pass
288            for cap_result in combined_regex.find_iter(line) {
289                match cap_result {
290                    Ok(cap) => {
291                        let found_name = &line[cap.start()..cap.end()];
292
293                        // Check word boundaries manually for Unicode support
294                        let start_pos = cap.start();
295                        let end_pos = cap.end();
296
297                        if !self.is_at_word_boundary(line, start_pos, true)
298                            || !self.is_at_word_boundary(line, end_pos, false)
299                        {
300                            continue; // Not at word boundary
301                        }
302
303                        // Skip if in inline code when code_blocks is false
304                        if !self.config.code_blocks {
305                            let byte_pos = line_info.byte_offset + cap.start();
306                            if ctx.is_in_code_block_or_span(byte_pos) {
307                                continue;
308                            }
309                        }
310
311                        // Skip if in link (inline links, reference links, or reference definitions)
312                        let byte_pos = line_info.byte_offset + cap.start();
313                        if self.is_in_link(ctx, byte_pos) {
314                            continue;
315                        }
316
317                        // Find which proper name this matches
318                        if let Some(proper_name) = self.get_proper_name_for(found_name) {
319                            // Only flag if it's not already correct
320                            if found_name != proper_name {
321                                violations.push((line_num, cap.start() + 1, found_name.to_string()));
322                            }
323                        }
324                    }
325                    Err(e) => {
326                        eprintln!("Regex execution error on line {line_num}: {e}");
327                    }
328                }
329            }
330        }
331
332        // Store in cache
333        self.content_cache.lock().unwrap().insert(hash, violations.clone());
334        violations
335    }
336
337    // Check if a byte position is within an HTML comment
338    fn is_in_html_comment(&self, content: &str, byte_pos: usize) -> bool {
339        for m in HTML_COMMENT_REGEX.find_iter(content).flatten() {
340            if m.start() <= byte_pos && byte_pos < m.end() {
341                return true;
342            }
343        }
344        false
345    }
346
347    /// Check if a byte position is within a link (inline links, reference links, or reference definitions)
348    fn is_in_link(&self, ctx: &crate::lint_context::LintContext, byte_pos: usize) -> bool {
349        // Check inline and reference links
350        for link in &ctx.links {
351            if link.byte_offset <= byte_pos && byte_pos < link.byte_end {
352                return true;
353            }
354        }
355
356        // Check images (which use similar syntax)
357        for image in &ctx.images {
358            if image.byte_offset <= byte_pos && byte_pos < image.byte_end {
359                return true;
360            }
361        }
362
363        // Check reference definitions [ref]: url "title" using regex pattern
364        for m in REF_DEF_REGEX.find_iter(ctx.content) {
365            if m.start() <= byte_pos && byte_pos < m.end() {
366                return true;
367            }
368        }
369
370        false
371    }
372
    /// Returns `true` when `c` can delimit a word, i.e. when it is not
    /// alphanumeric according to `char::is_alphanumeric` (Unicode-aware).
    ///
    /// Punctuation, whitespace and symbols such as `_` or `-` therefore all
    /// count as boundaries.
    fn is_word_boundary_char(c: char) -> bool {
        !c.is_alphanumeric()
    }
377
378    // Check if position is at a word boundary
379    fn is_at_word_boundary(&self, content: &str, pos: usize, is_start: bool) -> bool {
380        let chars: Vec<char> = content.chars().collect();
381        let char_indices: Vec<(usize, char)> = content.char_indices().collect();
382
383        // Find the character position
384        let char_pos = char_indices.iter().position(|(idx, _)| *idx == pos);
385        if char_pos.is_none() {
386            return true; // If we can't find position, assume boundary
387        }
388        let char_pos = char_pos.unwrap();
389
390        if is_start {
391            // Check character before position
392            if char_pos == 0 {
393                return true; // Start of string
394            }
395            Self::is_word_boundary_char(chars[char_pos - 1])
396        } else {
397            // Check character after position
398            if char_pos >= chars.len() {
399                return true; // End of string
400            }
401            Self::is_word_boundary_char(chars[char_pos])
402        }
403    }
404
405    // Get the proper name that should be used for a found name
406    fn get_proper_name_for(&self, found_name: &str) -> Option<String> {
407        let found_lower = found_name.to_lowercase();
408
409        // Iterate through the configured proper names
410        for name in &self.config.names {
411            let lower_name = name.to_lowercase();
412            let lower_name_no_dots = lower_name.replace('.', "");
413
414            // Direct match
415            if found_lower == lower_name || found_lower == lower_name_no_dots {
416                return Some(name.clone());
417            }
418
419            // Check ASCII-normalized version
420            let ascii_normalized = Self::ascii_normalize(&lower_name);
421
422            let ascii_no_dots = ascii_normalized.replace('.', "");
423
424            if found_lower == ascii_normalized || found_lower == ascii_no_dots {
425                return Some(name.clone());
426            }
427        }
428        None
429    }
430}
431
432impl Rule for MD044ProperNames {
    /// Returns the rule identifier, `"MD044"`.
    fn name(&self) -> &'static str {
        "MD044"
    }
436
    /// Returns the one-line, human-readable summary of the rule.
    fn description(&self) -> &'static str {
        "Proper names should have the correct capitalization"
    }
440
441    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
442        if self.config.names.is_empty() {
443            return true;
444        }
445        // Quick check if any configured names exist (case-insensitive)
446        let content_lower = ctx.content.to_lowercase();
447        !self
448            .config
449            .names
450            .iter()
451            .any(|name| content_lower.contains(&name.to_lowercase()))
452    }
453
454    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
455        let content = ctx.content;
456        if content.is_empty() || self.config.names.is_empty() || self.combined_pattern.is_none() {
457            return Ok(Vec::new());
458        }
459
460        // Early return: quick check if any of the configured names might be in content
461        let content_lower = content.to_lowercase();
462        let has_potential_matches = self.config.names.iter().any(|name| {
463            let name_lower = name.to_lowercase();
464            let name_no_dots = name_lower.replace('.', "");
465
466            // Check direct match
467            if content_lower.contains(&name_lower) || content_lower.contains(&name_no_dots) {
468                return true;
469            }
470
471            // Also check ASCII-normalized version
472            let ascii_normalized = Self::ascii_normalize(&name_lower);
473
474            if ascii_normalized != name_lower {
475                if content_lower.contains(&ascii_normalized) {
476                    return true;
477                }
478                let ascii_no_dots = ascii_normalized.replace('.', "");
479                if ascii_normalized != ascii_no_dots && content_lower.contains(&ascii_no_dots) {
480                    return true;
481                }
482            }
483
484            false
485        });
486
487        if !has_potential_matches {
488            return Ok(Vec::new());
489        }
490
491        let line_index = LineIndex::new(content.to_string());
492        let violations = self.find_name_violations(content, ctx);
493
494        let warnings = violations
495            .into_iter()
496            .filter_map(|(line, column, found_name)| {
497                self.get_proper_name_for(&found_name).map(|proper_name| LintWarning {
498                    rule_name: Some(self.name().to_string()),
499                    line,
500                    column,
501                    end_line: line,
502                    end_column: column + found_name.len(),
503                    message: format!("Proper name '{found_name}' should be '{proper_name}'"),
504                    severity: Severity::Warning,
505                    fix: Some(Fix {
506                        range: line_index.line_col_to_byte_range(line, column),
507                        replacement: proper_name,
508                    }),
509                })
510            })
511            .collect();
512
513        Ok(warnings)
514    }
515
516    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
517        let content = ctx.content;
518        if content.is_empty() || self.config.names.is_empty() {
519            return Ok(content.to_string());
520        }
521
522        let violations = self.find_name_violations(content, ctx);
523        if violations.is_empty() {
524            return Ok(content.to_string());
525        }
526
527        // Process lines and build the fixed content
528        let mut fixed_lines = Vec::new();
529
530        // Group violations by line
531        let mut violations_by_line: HashMap<usize, Vec<(usize, String)>> = HashMap::new();
532        for (line_num, col_num, found_name) in violations {
533            violations_by_line
534                .entry(line_num)
535                .or_default()
536                .push((col_num, found_name));
537        }
538
539        // Sort violations within each line in reverse order
540        for violations in violations_by_line.values_mut() {
541            violations.sort_by_key(|b| std::cmp::Reverse(b.0));
542        }
543
544        // Process each line
545        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
546            let line_num = line_idx + 1;
547
548            if let Some(line_violations) = violations_by_line.get(&line_num) {
549                // This line has violations, fix them
550                let mut fixed_line = line_info.content.clone();
551
552                for (col_num, found_name) in line_violations {
553                    if let Some(proper_name) = self.get_proper_name_for(found_name) {
554                        let start_col = col_num - 1; // Convert to 0-based
555                        let end_col = start_col + found_name.len();
556
557                        if end_col <= fixed_line.len()
558                            && fixed_line.is_char_boundary(start_col)
559                            && fixed_line.is_char_boundary(end_col)
560                        {
561                            fixed_line.replace_range(start_col..end_col, &proper_name);
562                        }
563                    }
564                }
565
566                fixed_lines.push(fixed_line);
567            } else {
568                // No violations on this line, keep it as is
569                fixed_lines.push(line_info.content.clone());
570            }
571        }
572
573        // Join lines with newlines, preserving the original ending
574        let mut result = fixed_lines.join("\n");
575        if content.ends_with('\n') && !result.ends_with('\n') {
576            result.push('\n');
577        }
578        Ok(result)
579    }
580
    /// Exposes the rule as `&dyn Any` so callers can downcast it to the concrete type.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
584
585    fn default_config_section(&self) -> Option<(String, toml::Value)> {
586        let json_value = serde_json::to_value(&self.config).ok()?;
587        Some((
588            self.name().to_string(),
589            crate::rule_config_serde::json_to_toml_value(&json_value)?,
590        ))
591    }
592
593    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
594    where
595        Self: Sized,
596    {
597        let rule_config = crate::rule_config_serde::load_rule_config::<MD044Config>(config);
598        Box::new(Self::from_config_struct(rule_config))
599    }
600}
601
602#[cfg(test)]
603mod tests {
604    use super::*;
605    use crate::lint_context::LintContext;
606
607    fn create_context(content: &str) -> LintContext<'_> {
608        LintContext::new(content, crate::config::MarkdownFlavor::Standard)
609    }
610
611    #[test]
612    fn test_correctly_capitalized_names() {
613        let rule = MD044ProperNames::new(
614            vec![
615                "JavaScript".to_string(),
616                "TypeScript".to_string(),
617                "Node.js".to_string(),
618            ],
619            true,
620        );
621
622        let content = "This document uses JavaScript, TypeScript, and Node.js correctly.";
623        let ctx = create_context(content);
624        let result = rule.check(&ctx).unwrap();
625        assert!(result.is_empty(), "Should not flag correctly capitalized names");
626    }
627
628    #[test]
629    fn test_incorrectly_capitalized_names() {
630        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "TypeScript".to_string()], true);
631
632        let content = "This document uses javascript and typescript incorrectly.";
633        let ctx = create_context(content);
634        let result = rule.check(&ctx).unwrap();
635
636        assert_eq!(result.len(), 2, "Should flag two incorrect capitalizations");
637        assert_eq!(result[0].message, "Proper name 'javascript' should be 'JavaScript'");
638        assert_eq!(result[0].line, 1);
639        assert_eq!(result[0].column, 20);
640        assert_eq!(result[1].message, "Proper name 'typescript' should be 'TypeScript'");
641        assert_eq!(result[1].line, 1);
642        assert_eq!(result[1].column, 35);
643    }
644
645    #[test]
646    fn test_names_at_beginning_of_sentences() {
647        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "Python".to_string()], true);
648
649        let content = "javascript is a great language. python is also popular.";
650        let ctx = create_context(content);
651        let result = rule.check(&ctx).unwrap();
652
653        assert_eq!(result.len(), 2, "Should flag names at beginning of sentences");
654        assert_eq!(result[0].line, 1);
655        assert_eq!(result[0].column, 1);
656        assert_eq!(result[1].line, 1);
657        assert_eq!(result[1].column, 33);
658    }
659
660    #[test]
661    fn test_names_in_code_blocks_checked_by_default() {
662        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
663
664        let content = r#"Here is some text with JavaScript.
665
666```javascript
667// This javascript should be checked
668const lang = "javascript";
669```
670
671But this javascript should be flagged."#;
672
673        let ctx = create_context(content);
674        let result = rule.check(&ctx).unwrap();
675
676        assert_eq!(result.len(), 3, "Should flag javascript inside and outside code blocks");
677        assert_eq!(result[0].line, 4);
678        assert_eq!(result[1].line, 5);
679        assert_eq!(result[2].line, 8);
680    }
681
682    #[test]
683    fn test_names_in_code_blocks_ignored_when_disabled() {
684        let rule = MD044ProperNames::new(
685            vec!["JavaScript".to_string()],
686            false, // code_blocks = false means skip code blocks
687        );
688
689        let content = r#"```
690javascript in code block
691```"#;
692
693        let ctx = create_context(content);
694        let result = rule.check(&ctx).unwrap();
695
696        assert_eq!(
697            result.len(),
698            0,
699            "Should not flag javascript in code blocks when code_blocks is false"
700        );
701    }
702
703    #[test]
704    fn test_names_in_inline_code_checked_by_default() {
705        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
706
707        let content = "This is `javascript` in inline code and javascript outside.";
708        let ctx = create_context(content);
709        let result = rule.check(&ctx).unwrap();
710
711        // When code_blocks=true, inline code should be checked
712        assert_eq!(result.len(), 2, "Should flag javascript inside and outside inline code");
713        assert_eq!(result[0].column, 10); // javascript in inline code
714        assert_eq!(result[1].column, 41); // javascript outside
715    }
716
717    #[test]
718    fn test_multiple_names_in_same_line() {
719        let rule = MD044ProperNames::new(
720            vec!["JavaScript".to_string(), "TypeScript".to_string(), "React".to_string()],
721            true,
722        );
723
724        let content = "I use javascript, typescript, and react in my projects.";
725        let ctx = create_context(content);
726        let result = rule.check(&ctx).unwrap();
727
728        assert_eq!(result.len(), 3, "Should flag all three incorrect names");
729        assert_eq!(result[0].message, "Proper name 'javascript' should be 'JavaScript'");
730        assert_eq!(result[1].message, "Proper name 'typescript' should be 'TypeScript'");
731        assert_eq!(result[2].message, "Proper name 'react' should be 'React'");
732    }
733
734    #[test]
735    fn test_case_sensitivity() {
736        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
737
738        let content = "JAVASCRIPT, Javascript, javascript, and JavaScript variations.";
739        let ctx = create_context(content);
740        let result = rule.check(&ctx).unwrap();
741
742        assert_eq!(result.len(), 3, "Should flag all incorrect case variations");
743        // JavaScript (correct) should not be flagged
744        assert!(result.iter().all(|w| w.message.contains("should be 'JavaScript'")));
745    }
746
747    #[test]
748    fn test_configuration_with_custom_name_list() {
749        let config = MD044Config {
750            names: vec!["GitHub".to_string(), "GitLab".to_string(), "DevOps".to_string()],
751            code_blocks: true,
752            html_comments: true,
753        };
754        let rule = MD044ProperNames::from_config_struct(config);
755
756        let content = "We use github, gitlab, and devops for our workflow.";
757        let ctx = create_context(content);
758        let result = rule.check(&ctx).unwrap();
759
760        assert_eq!(result.len(), 3, "Should flag all custom names");
761        assert_eq!(result[0].message, "Proper name 'github' should be 'GitHub'");
762        assert_eq!(result[1].message, "Proper name 'gitlab' should be 'GitLab'");
763        assert_eq!(result[2].message, "Proper name 'devops' should be 'DevOps'");
764    }
765
766    #[test]
767    fn test_empty_configuration() {
768        let rule = MD044ProperNames::new(vec![], true);
769
770        let content = "This has javascript and typescript but no configured names.";
771        let ctx = create_context(content);
772        let result = rule.check(&ctx).unwrap();
773
774        assert!(result.is_empty(), "Should not flag anything with empty configuration");
775    }
776
777    #[test]
778    fn test_names_with_special_characters() {
779        let rule = MD044ProperNames::new(
780            vec!["Node.js".to_string(), "ASP.NET".to_string(), "C++".to_string()],
781            true,
782        );
783
784        let content = "We use nodejs, asp.net, ASP.NET, and c++ in our stack.";
785        let ctx = create_context(content);
786        let result = rule.check(&ctx).unwrap();
787
788        // nodejs should match Node.js (dotless variation)
789        // asp.net should be flagged (wrong case)
790        // ASP.NET should not be flagged (correct)
791        // c++ should be flagged
792        assert_eq!(result.len(), 3, "Should handle special characters correctly");
793
794        let messages: Vec<&str> = result.iter().map(|w| w.message.as_str()).collect();
795        assert!(messages.contains(&"Proper name 'nodejs' should be 'Node.js'"));
796        assert!(messages.contains(&"Proper name 'asp.net' should be 'ASP.NET'"));
797        assert!(messages.contains(&"Proper name 'c++' should be 'C++'"));
798    }
799
800    #[test]
801    fn test_word_boundaries() {
802        let rule = MD044ProperNames::new(vec!["Java".to_string(), "Script".to_string()], true);
803
804        let content = "JavaScript is not java or script, but Java and Script are separate.";
805        let ctx = create_context(content);
806        let result = rule.check(&ctx).unwrap();
807
808        // Should only flag lowercase "java" and "script" as separate words
809        assert_eq!(result.len(), 2, "Should respect word boundaries");
810        assert!(result.iter().any(|w| w.column == 19)); // "java" position
811        assert!(result.iter().any(|w| w.column == 27)); // "script" position
812    }
813
814    #[test]
815    fn test_fix_method() {
816        let rule = MD044ProperNames::new(
817            vec![
818                "JavaScript".to_string(),
819                "TypeScript".to_string(),
820                "Node.js".to_string(),
821            ],
822            true,
823        );
824
825        let content = "I love javascript, typescript, and nodejs!";
826        let ctx = create_context(content);
827        let fixed = rule.fix(&ctx).unwrap();
828
829        assert_eq!(fixed, "I love JavaScript, TypeScript, and Node.js!");
830    }
831
832    #[test]
833    fn test_fix_multiple_occurrences() {
834        let rule = MD044ProperNames::new(vec!["Python".to_string()], true);
835
836        let content = "python is great. I use python daily. PYTHON is powerful.";
837        let ctx = create_context(content);
838        let fixed = rule.fix(&ctx).unwrap();
839
840        assert_eq!(fixed, "Python is great. I use Python daily. Python is powerful.");
841    }
842
843    #[test]
844    fn test_fix_checks_code_blocks_by_default() {
845        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
846
847        let content = r#"I love javascript.
848
849```
850const lang = "javascript";
851```
852
853More javascript here."#;
854
855        let ctx = create_context(content);
856        let fixed = rule.fix(&ctx).unwrap();
857
858        let expected = r#"I love JavaScript.
859
860```
861const lang = "JavaScript";
862```
863
864More JavaScript here."#;
865
866        assert_eq!(fixed, expected);
867    }
868
869    #[test]
870    fn test_multiline_content() {
871        let rule = MD044ProperNames::new(vec!["Rust".to_string(), "Python".to_string()], true);
872
873        let content = r#"First line with rust.
874Second line with python.
875Third line with RUST and PYTHON."#;
876
877        let ctx = create_context(content);
878        let result = rule.check(&ctx).unwrap();
879
880        assert_eq!(result.len(), 4, "Should flag all incorrect occurrences");
881        assert_eq!(result[0].line, 1);
882        assert_eq!(result[1].line, 2);
883        assert_eq!(result[2].line, 3);
884        assert_eq!(result[3].line, 3);
885    }
886
887    #[test]
888    fn test_default_config() {
889        let config = MD044Config::default();
890        assert!(config.names.is_empty());
891        assert!(!config.code_blocks); // Default is false (skip code blocks)
892    }
893
894    #[test]
895    fn test_performance_with_many_names() {
896        let mut names = vec![];
897        for i in 0..50 {
898            names.push(format!("ProperName{i}"));
899        }
900
901        let rule = MD044ProperNames::new(names, true);
902
903        let content = "This has propername0, propername25, and propername49 incorrectly.";
904        let ctx = create_context(content);
905        let result = rule.check(&ctx).unwrap();
906
907        assert_eq!(result.len(), 3, "Should handle many configured names efficiently");
908    }
909
910    #[test]
911    fn test_large_name_count_performance() {
912        // Verify MD044 can handle large numbers of names without regex limitations
913        // This test confirms that fancy-regex handles large patterns well
914        let names = (0..1000).map(|i| format!("ProperName{i}")).collect::<Vec<_>>();
915
916        let rule = MD044ProperNames::new(names, true);
917
918        // The combined pattern should be created successfully
919        assert!(rule.combined_pattern.is_some());
920
921        // Should be able to check content without errors
922        let content = "This has propername0 and propername999 in it.";
923        let ctx = create_context(content);
924        let result = rule.check(&ctx).unwrap();
925
926        // Should detect both incorrect names
927        assert_eq!(result.len(), 2, "Should handle 1000 names without issues");
928    }
929
930    #[test]
931    fn test_cache_behavior() {
932        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
933
934        let content = "Using javascript here.";
935        let ctx = create_context(content);
936
937        // First check
938        let result1 = rule.check(&ctx).unwrap();
939        assert_eq!(result1.len(), 1);
940
941        // Second check should use cache
942        let result2 = rule.check(&ctx).unwrap();
943        assert_eq!(result2.len(), 1);
944
945        // Results should be identical
946        assert_eq!(result1[0].line, result2[0].line);
947        assert_eq!(result1[0].column, result2[0].column);
948    }
949
950    #[test]
951    fn test_html_comments_not_checked_when_disabled() {
952        let config = MD044Config {
953            names: vec!["JavaScript".to_string()],
954            code_blocks: true,    // Check code blocks
955            html_comments: false, // Don't check HTML comments
956        };
957        let rule = MD044ProperNames::from_config_struct(config);
958
959        let content = r#"Regular javascript here.
960<!-- This javascript in HTML comment should be ignored -->
961More javascript outside."#;
962
963        let ctx = create_context(content);
964        let result = rule.check(&ctx).unwrap();
965
966        assert_eq!(result.len(), 2, "Should only flag javascript outside HTML comments");
967        assert_eq!(result[0].line, 1);
968        assert_eq!(result[1].line, 3);
969    }
970
971    #[test]
972    fn test_html_comments_checked_when_enabled() {
973        let config = MD044Config {
974            names: vec!["JavaScript".to_string()],
975            code_blocks: true,   // Check code blocks
976            html_comments: true, // Check HTML comments
977        };
978        let rule = MD044ProperNames::from_config_struct(config);
979
980        let content = r#"Regular javascript here.
981<!-- This javascript in HTML comment should be checked -->
982More javascript outside."#;
983
984        let ctx = create_context(content);
985        let result = rule.check(&ctx).unwrap();
986
987        assert_eq!(
988            result.len(),
989            3,
990            "Should flag all javascript occurrences including in HTML comments"
991        );
992    }
993
994    #[test]
995    fn test_multiline_html_comments() {
996        let config = MD044Config {
997            names: vec!["Python".to_string(), "JavaScript".to_string()],
998            code_blocks: true,    // Check code blocks
999            html_comments: false, // Don't check HTML comments
1000        };
1001        let rule = MD044ProperNames::from_config_struct(config);
1002
1003        let content = r#"Regular python here.
1004<!--
1005This is a multiline comment
1006with javascript and python
1007that should be ignored
1008-->
1009More javascript outside."#;
1010
1011        let ctx = create_context(content);
1012        let result = rule.check(&ctx).unwrap();
1013
1014        assert_eq!(result.len(), 2, "Should only flag names outside HTML comments");
1015        assert_eq!(result[0].line, 1); // python
1016        assert_eq!(result[1].line, 7); // javascript
1017    }
1018
1019    #[test]
1020    fn test_fix_preserves_html_comments_when_disabled() {
1021        let config = MD044Config {
1022            names: vec!["JavaScript".to_string()],
1023            code_blocks: true,    // Check code blocks
1024            html_comments: false, // Don't check HTML comments
1025        };
1026        let rule = MD044ProperNames::from_config_struct(config);
1027
1028        let content = r#"javascript here.
1029<!-- javascript in comment -->
1030More javascript."#;
1031
1032        let ctx = create_context(content);
1033        let fixed = rule.fix(&ctx).unwrap();
1034
1035        let expected = r#"JavaScript here.
1036<!-- javascript in comment -->
1037More JavaScript."#;
1038
1039        assert_eq!(
1040            fixed, expected,
1041            "Should not fix names inside HTML comments when disabled"
1042        );
1043    }
1044
1045    #[test]
1046    fn test_proper_names_in_links_not_flagged() {
1047        let rule = MD044ProperNames::new(
1048            vec!["JavaScript".to_string(), "Node.js".to_string(), "Python".to_string()],
1049            true,
1050        );
1051
1052        let content = r#"Check this [javascript documentation](https://javascript.info) for info.
1053
1054Visit [node.js homepage](https://nodejs.org) and [python tutorial](https://python.org).
1055
1056Real javascript should be flagged.
1057
1058Also see the [typescript guide][ts-ref] for more.
1059
1060Real python should be flagged too.
1061
1062[ts-ref]: https://typescript.org/handbook"#;
1063
1064        let ctx = create_context(content);
1065        let result = rule.check(&ctx).unwrap();
1066
1067        // Only the real standalone proper names should be flagged
1068        assert_eq!(
1069            result.len(),
1070            2,
1071            "Expected exactly 2 warnings for standalone proper names"
1072        );
1073        assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
1074        assert!(result[1].message.contains("'python' should be 'Python'"));
1075        // Should be on lines with standalone instances
1076        assert!(result[0].line == 5); // "Real javascript should be flagged."
1077        assert!(result[1].line == 9); // "Real python should be flagged too."
1078    }
1079
1080    #[test]
1081    fn test_proper_names_in_images_not_flagged() {
1082        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1083
1084        let content = r#"Here is a ![javascript logo](javascript.png "javascript icon") image.
1085
1086Real javascript should be flagged."#;
1087
1088        let ctx = create_context(content);
1089        let result = rule.check(&ctx).unwrap();
1090
1091        // Only the standalone proper name should be flagged
1092        assert_eq!(result.len(), 1, "Expected exactly 1 warning for standalone proper name");
1093        assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
1094        assert!(result[0].line == 3); // "Real javascript should be flagged."
1095    }
1096
1097    #[test]
1098    fn test_proper_names_in_reference_definitions_not_flagged() {
1099        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "TypeScript".to_string()], true);
1100
1101        let content = r#"Check the [javascript guide][js-ref] for details.
1102
1103Real javascript should be flagged.
1104
1105[js-ref]: https://javascript.info/typescript/guide"#;
1106
1107        let ctx = create_context(content);
1108        let result = rule.check(&ctx).unwrap();
1109
1110        // Only the standalone proper name should be flagged
1111        assert_eq!(result.len(), 1, "Expected exactly 1 warning for standalone proper name");
1112        assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
1113        assert!(result[0].line == 3); // "Real javascript should be flagged."
1114    }
1115}
rumdl_lib/rules/md044_proper_names.rs

rumdl_lib/rules/
md044_proper_names.rs