Skip to main content

rumdl_lib/rules/
md044_proper_names.rs

1use crate::utils::fast_hash;
2use crate::utils::regex_cache::{escape_regex, get_cached_fancy_regex};
3
4use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, Severity};
5use std::collections::{HashMap, HashSet};
6use std::sync::{Arc, Mutex};
7
8mod md044_config;
9use md044_config::MD044Config;
10
11type WarningPosition = (usize, usize, String); // (line, column, found_name)
12
13/// Rule MD044: Proper names should be capitalized
14///
15/// See [docs/md044.md](../../docs/md044.md) for full documentation, configuration, and examples.
16///
17/// This rule is triggered when proper names are not capitalized correctly in the document.
18/// For example, if you have defined "JavaScript" as a proper name, the rule will flag any
19/// occurrences of "javascript" or "Javascript" as violations.
20///
21/// ## Purpose
22///
23/// Ensuring consistent capitalization of proper names improves document quality and
24/// professionalism. This is especially important for technical documentation where
25/// product names, programming languages, and technologies often have specific
26/// capitalization conventions.
27///
28/// ## Configuration Options
29///
30/// The rule supports the following configuration options:
31///
32/// ```yaml
33/// MD044:
34///   names: []                # List of proper names to check for correct capitalization
35///   code-blocks: false       # Whether to check code blocks (default: false)
36/// ```
37///
38/// Example configuration:
39///
40/// ```yaml
41/// MD044:
42///   names: ["JavaScript", "Node.js", "TypeScript"]
43///   code-blocks: true
44/// ```
45///
46/// ## Performance Optimizations
47///
48/// This rule implements several performance optimizations:
49///
50/// 1. **Regex Caching**: Pre-compiles and caches regex patterns for each proper name
51/// 2. **Content Caching**: Caches results based on content hashing for repeated checks
52/// 3. **Efficient Text Processing**: Uses optimized algorithms to avoid redundant text processing
53/// 4. **Smart Code Block Detection**: Efficiently identifies and optionally excludes code blocks
54///
55/// ## Edge Cases Handled
56///
57/// - **Word Boundaries**: Only matches complete words, not substrings within other words
58/// - **Case Sensitivity**: Properly handles case-specific matching
59/// - **Code Blocks**: Optionally checks code blocks (controlled by code-blocks setting)
60/// - **Markdown Formatting**: Handles proper names within Markdown formatting elements
61///
62/// ## Fix Behavior
63///
64/// When fixing issues, this rule replaces incorrect capitalization with the correct form
65/// as defined in the configuration.
66///
67#[derive(Clone)]
68pub struct MD044ProperNames {
69    config: MD044Config,
70    // Cache the combined regex pattern string
71    combined_pattern: Option<String>,
72    // Precomputed lowercase name variants for fast pre-checks
73    name_variants: Vec<String>,
74    // Cache for name violations by content hash
75    content_cache: Arc<Mutex<HashMap<u64, Vec<WarningPosition>>>>,
76}
77
78impl MD044ProperNames {
79    pub fn new(names: Vec<String>, code_blocks: bool) -> Self {
80        let config = MD044Config {
81            names,
82            code_blocks,
83            html_elements: true, // Default to checking HTML elements
84            html_comments: true, // Default to checking HTML comments
85        };
86        let combined_pattern = Self::create_combined_pattern(&config);
87        let name_variants = Self::build_name_variants(&config);
88        Self {
89            config,
90            combined_pattern,
91            name_variants,
92            content_cache: Arc::new(Mutex::new(HashMap::new())),
93        }
94    }
95
96    // Helper function for consistent ASCII normalization
97    fn ascii_normalize(s: &str) -> String {
98        s.replace(['é', 'è', 'ê', 'ë'], "e")
99            .replace(['à', 'á', 'â', 'ä', 'ã', 'å'], "a")
100            .replace(['ï', 'î', 'í', 'ì'], "i")
101            .replace(['ü', 'ú', 'ù', 'û'], "u")
102            .replace(['ö', 'ó', 'ò', 'ô', 'õ'], "o")
103            .replace('ñ', "n")
104            .replace('ç', "c")
105    }
106
107    pub fn from_config_struct(config: MD044Config) -> Self {
108        let combined_pattern = Self::create_combined_pattern(&config);
109        let name_variants = Self::build_name_variants(&config);
110        Self {
111            config,
112            combined_pattern,
113            name_variants,
114            content_cache: Arc::new(Mutex::new(HashMap::new())),
115        }
116    }
117
118    // Create a combined regex pattern for all proper names
119    fn create_combined_pattern(config: &MD044Config) -> Option<String> {
120        if config.names.is_empty() {
121            return None;
122        }
123
124        // Create patterns for all names and their variations
125        let mut patterns: Vec<String> = config
126            .names
127            .iter()
128            .flat_map(|name| {
129                let mut variations = vec![];
130                let lower_name = name.to_lowercase();
131
132                // Add the lowercase version
133                variations.push(escape_regex(&lower_name));
134
135                // Add version without dots
136                let lower_name_no_dots = lower_name.replace('.', "");
137                if lower_name != lower_name_no_dots {
138                    variations.push(escape_regex(&lower_name_no_dots));
139                }
140
141                // Add ASCII-normalized versions for common accented characters
142                let ascii_normalized = Self::ascii_normalize(&lower_name);
143
144                if ascii_normalized != lower_name {
145                    variations.push(escape_regex(&ascii_normalized));
146
147                    // Also add version without dots
148                    let ascii_no_dots = ascii_normalized.replace('.', "");
149                    if ascii_normalized != ascii_no_dots {
150                        variations.push(escape_regex(&ascii_no_dots));
151                    }
152                }
153
154                variations
155            })
156            .collect();
157
158        // Sort patterns by length (longest first) to avoid shorter patterns matching within longer ones
159        patterns.sort_by_key(|b| std::cmp::Reverse(b.len()));
160
161        // Combine all patterns into a single regex with capture groups
162        // Don't use \b as it doesn't work with Unicode - we'll check boundaries manually
163        Some(format!(r"(?i)({})", patterns.join("|")))
164    }
165
166    fn build_name_variants(config: &MD044Config) -> Vec<String> {
167        let mut variants = HashSet::new();
168        for name in &config.names {
169            let lower_name = name.to_lowercase();
170            variants.insert(lower_name.clone());
171
172            let lower_no_dots = lower_name.replace('.', "");
173            if lower_name != lower_no_dots {
174                variants.insert(lower_no_dots);
175            }
176
177            let ascii_normalized = Self::ascii_normalize(&lower_name);
178            if ascii_normalized != lower_name {
179                variants.insert(ascii_normalized.clone());
180
181                let ascii_no_dots = ascii_normalized.replace('.', "");
182                if ascii_normalized != ascii_no_dots {
183                    variants.insert(ascii_no_dots);
184                }
185            }
186        }
187
188        variants.into_iter().collect()
189    }
190
191    // Find all name violations in the content and return positions.
192    // `content_lower` is the pre-computed lowercase version of `content` to avoid redundant allocations.
193    fn find_name_violations(
194        &self,
195        content: &str,
196        ctx: &crate::lint_context::LintContext,
197        content_lower: &str,
198    ) -> Vec<WarningPosition> {
199        // Early return: if no names configured or content is empty
200        if self.config.names.is_empty() || content.is_empty() || self.combined_pattern.is_none() {
201            return Vec::new();
202        }
203
204        // Early return: quick check if any of the configured names might be in content
205        let has_potential_matches = self.name_variants.iter().any(|name| content_lower.contains(name));
206
207        if !has_potential_matches {
208            return Vec::new();
209        }
210
211        // Check if we have cached results
212        let hash = fast_hash(content);
213        {
214            // Use a separate scope for borrowing to minimize lock time
215            if let Ok(cache) = self.content_cache.lock()
216                && let Some(cached) = cache.get(&hash)
217            {
218                return cached.clone();
219            }
220        }
221
222        let mut violations = Vec::new();
223
224        // Get the regex from global cache
225        let combined_regex = match &self.combined_pattern {
226            Some(pattern) => match get_cached_fancy_regex(pattern) {
227                Ok(regex) => regex,
228                Err(_) => return Vec::new(),
229            },
230            None => return Vec::new(),
231        };
232
233        // Use ctx.lines for better performance
234        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
235            let line_num = line_idx + 1;
236            let line = line_info.content(ctx.content);
237
238            // Skip code fence lines (```language or ~~~language)
239            let trimmed = line.trim_start();
240            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
241                continue;
242            }
243
244            // Skip if in code block (when code_blocks = false)
245            if !self.config.code_blocks && line_info.in_code_block {
246                continue;
247            }
248
249            // Skip if in HTML block (when html_elements = false)
250            if !self.config.html_elements && line_info.in_html_block {
251                continue;
252            }
253
254            // Skip HTML comments using pre-computed line flag
255            if !self.config.html_comments && line_info.in_html_comment {
256                continue;
257            }
258
259            // Skip JSX expressions and MDX comments (MDX flavor)
260            if line_info.in_jsx_expression || line_info.in_mdx_comment {
261                continue;
262            }
263
264            // Skip Obsidian comments (Obsidian flavor)
265            if line_info.in_obsidian_comment {
266                continue;
267            }
268
269            // Early return: skip lines that don't contain any potential matches
270            let line_lower = line.to_lowercase();
271            let has_line_matches = self.name_variants.iter().any(|name| line_lower.contains(name));
272
273            if !has_line_matches {
274                continue;
275            }
276
277            // Use the combined regex to find all matches in one pass
278            for cap_result in combined_regex.find_iter(line) {
279                match cap_result {
280                    Ok(cap) => {
281                        let found_name = &line[cap.start()..cap.end()];
282
283                        // Check word boundaries manually for Unicode support
284                        let start_pos = cap.start();
285                        let end_pos = cap.end();
286
287                        if !Self::is_at_word_boundary(line, start_pos, true)
288                            || !Self::is_at_word_boundary(line, end_pos, false)
289                        {
290                            continue; // Not at word boundary
291                        }
292
293                        // Skip if in inline code when code_blocks is false
294                        if !self.config.code_blocks {
295                            let byte_pos = line_info.byte_offset + cap.start();
296                            if ctx.is_in_code_block_or_span(byte_pos) {
297                                continue;
298                            }
299                        }
300
301                        // Skip if in link URL or reference definition
302                        let byte_pos = line_info.byte_offset + cap.start();
303                        if Self::is_in_link(ctx, byte_pos) {
304                            continue;
305                        }
306
307                        // Find which proper name this matches
308                        if let Some(proper_name) = self.get_proper_name_for(found_name) {
309                            // Only flag if it's not already correct
310                            if found_name != proper_name {
311                                violations.push((line_num, cap.start() + 1, found_name.to_string()));
312                            }
313                        }
314                    }
315                    Err(e) => {
316                        eprintln!("Regex execution error on line {line_num}: {e}");
317                    }
318                }
319            }
320        }
321
322        // Store in cache (ignore if mutex is poisoned)
323        if let Ok(mut cache) = self.content_cache.lock() {
324            cache.insert(hash, violations.clone());
325        }
326        violations
327    }
328
329    /// Check if a byte position is within a link URL (not link text)
330    ///
331    /// Link text should be checked for proper names, but URLs should be skipped.
332    /// For `[text](url)` - check text, skip url
333    /// For `[text][ref]` - check text, skip reference portion
334    /// For `[[text]]` (WikiLinks) - check text, skip brackets
335    fn is_in_link(ctx: &crate::lint_context::LintContext, byte_pos: usize) -> bool {
336        use pulldown_cmark::LinkType;
337
338        // Binary search links (sorted by byte_offset) to find candidate containing byte_pos
339        let link_idx = ctx.links.partition_point(|link| link.byte_offset <= byte_pos);
340        if link_idx > 0 {
341            let link = &ctx.links[link_idx - 1];
342            if byte_pos < link.byte_end {
343                // WikiLinks [[text]] start with '[[', regular links [text] start with '['
344                let text_start = if matches!(link.link_type, LinkType::WikiLink { .. }) {
345                    link.byte_offset + 2
346                } else {
347                    link.byte_offset + 1
348                };
349                let text_end = text_start + link.text.len();
350
351                // If position is within the text portion, don't skip
352                if byte_pos >= text_start && byte_pos < text_end {
353                    return false;
354                }
355                // Position is in the URL/reference portion, skip it
356                return true;
357            }
358        }
359
360        // Binary search images (sorted by byte_offset) to find candidate containing byte_pos
361        let image_idx = ctx.images.partition_point(|img| img.byte_offset <= byte_pos);
362        if image_idx > 0 {
363            let image = &ctx.images[image_idx - 1];
364            if byte_pos < image.byte_end {
365                // Image starts with '![' so alt text starts at byte_offset + 2
366                let alt_start = image.byte_offset + 2;
367                let alt_end = alt_start + image.alt_text.len();
368
369                // If position is within the alt text portion, don't skip
370                if byte_pos >= alt_start && byte_pos < alt_end {
371                    return false;
372                }
373                // Position is in the URL/reference portion, skip it
374                return true;
375            }
376        }
377
378        // Check pre-computed reference definitions
379        ctx.is_in_reference_def(byte_pos)
380    }
381
382    // Check if a character is a word boundary (handles Unicode)
383    fn is_word_boundary_char(c: char) -> bool {
384        !c.is_alphanumeric()
385    }
386
387    // Check if position is at a word boundary using O(1) byte-level lookups
388    fn is_at_word_boundary(content: &str, pos: usize, is_start: bool) -> bool {
389        if is_start {
390            if pos == 0 {
391                return true;
392            }
393            // Get the character immediately before `pos`
394            match content[..pos].chars().next_back() {
395                None => true,
396                Some(c) => Self::is_word_boundary_char(c),
397            }
398        } else {
399            if pos >= content.len() {
400                return true;
401            }
402            // Get the character at `pos`
403            match content[pos..].chars().next() {
404                None => true,
405                Some(c) => Self::is_word_boundary_char(c),
406            }
407        }
408    }
409
410    // Get the proper name that should be used for a found name
411    fn get_proper_name_for(&self, found_name: &str) -> Option<String> {
412        let found_lower = found_name.to_lowercase();
413
414        // Iterate through the configured proper names
415        for name in &self.config.names {
416            let lower_name = name.to_lowercase();
417            let lower_name_no_dots = lower_name.replace('.', "");
418
419            // Direct match
420            if found_lower == lower_name || found_lower == lower_name_no_dots {
421                return Some(name.clone());
422            }
423
424            // Check ASCII-normalized version
425            let ascii_normalized = Self::ascii_normalize(&lower_name);
426
427            let ascii_no_dots = ascii_normalized.replace('.', "");
428
429            if found_lower == ascii_normalized || found_lower == ascii_no_dots {
430                return Some(name.clone());
431            }
432        }
433        None
434    }
435}
436
437impl Rule for MD044ProperNames {
438    fn name(&self) -> &'static str {
439        "MD044"
440    }
441
442    fn description(&self) -> &'static str {
443        "Proper names should have the correct capitalization"
444    }
445
446    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
447        if self.config.names.is_empty() {
448            return true;
449        }
450        // Quick check if any configured name variants exist (case-insensitive)
451        let content_lower = if ctx.content.is_ascii() {
452            ctx.content.to_ascii_lowercase()
453        } else {
454            ctx.content.to_lowercase()
455        };
456        !self.name_variants.iter().any(|name| content_lower.contains(name))
457    }
458
459    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
460        let content = ctx.content;
461        if content.is_empty() || self.config.names.is_empty() || self.combined_pattern.is_none() {
462            return Ok(Vec::new());
463        }
464
465        // Compute lowercase content once and reuse across all checks
466        let content_lower = if content.is_ascii() {
467            content.to_ascii_lowercase()
468        } else {
469            content.to_lowercase()
470        };
471
472        // Early return: use pre-computed name_variants for the quick check
473        let has_potential_matches = self.name_variants.iter().any(|name| content_lower.contains(name));
474
475        if !has_potential_matches {
476            return Ok(Vec::new());
477        }
478
479        let line_index = &ctx.line_index;
480        let violations = self.find_name_violations(content, ctx, &content_lower);
481
482        let warnings = violations
483            .into_iter()
484            .filter_map(|(line, column, found_name)| {
485                self.get_proper_name_for(&found_name).map(|proper_name| LintWarning {
486                    rule_name: Some(self.name().to_string()),
487                    line,
488                    column,
489                    end_line: line,
490                    end_column: column + found_name.len(),
491                    message: format!("Proper name '{found_name}' should be '{proper_name}'"),
492                    severity: Severity::Warning,
493                    fix: Some(Fix {
494                        range: line_index.line_col_to_byte_range(line, column),
495                        replacement: proper_name,
496                    }),
497                })
498            })
499            .collect();
500
501        Ok(warnings)
502    }
503
504    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
505        let content = ctx.content;
506        if content.is_empty() || self.config.names.is_empty() {
507            return Ok(content.to_string());
508        }
509
510        let content_lower = if content.is_ascii() {
511            content.to_ascii_lowercase()
512        } else {
513            content.to_lowercase()
514        };
515        let violations = self.find_name_violations(content, ctx, &content_lower);
516        if violations.is_empty() {
517            return Ok(content.to_string());
518        }
519
520        // Process lines and build the fixed content
521        let mut fixed_lines = Vec::new();
522
523        // Group violations by line
524        let mut violations_by_line: HashMap<usize, Vec<(usize, String)>> = HashMap::new();
525        for (line_num, col_num, found_name) in violations {
526            violations_by_line
527                .entry(line_num)
528                .or_default()
529                .push((col_num, found_name));
530        }
531
532        // Sort violations within each line in reverse order
533        for violations in violations_by_line.values_mut() {
534            violations.sort_by_key(|b| std::cmp::Reverse(b.0));
535        }
536
537        // Process each line
538        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
539            let line_num = line_idx + 1;
540
541            if let Some(line_violations) = violations_by_line.get(&line_num) {
542                // This line has violations, fix them
543                let mut fixed_line = line_info.content(ctx.content).to_string();
544
545                for (col_num, found_name) in line_violations {
546                    if let Some(proper_name) = self.get_proper_name_for(found_name) {
547                        let start_col = col_num - 1; // Convert to 0-based
548                        let end_col = start_col + found_name.len();
549
550                        if end_col <= fixed_line.len()
551                            && fixed_line.is_char_boundary(start_col)
552                            && fixed_line.is_char_boundary(end_col)
553                        {
554                            fixed_line.replace_range(start_col..end_col, &proper_name);
555                        }
556                    }
557                }
558
559                fixed_lines.push(fixed_line);
560            } else {
561                // No violations on this line, keep it as is
562                fixed_lines.push(line_info.content(ctx.content).to_string());
563            }
564        }
565
566        // Join lines with newlines, preserving the original ending
567        let mut result = fixed_lines.join("\n");
568        if content.ends_with('\n') && !result.ends_with('\n') {
569            result.push('\n');
570        }
571        Ok(result)
572    }
573
574    fn as_any(&self) -> &dyn std::any::Any {
575        self
576    }
577
578    fn default_config_section(&self) -> Option<(String, toml::Value)> {
579        let json_value = serde_json::to_value(&self.config).ok()?;
580        Some((
581            self.name().to_string(),
582            crate::rule_config_serde::json_to_toml_value(&json_value)?,
583        ))
584    }
585
586    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
587    where
588        Self: Sized,
589    {
590        let rule_config = crate::rule_config_serde::load_rule_config::<MD044Config>(config);
591        Box::new(Self::from_config_struct(rule_config))
592    }
593}
594
595#[cfg(test)]
596mod tests {
597    use super::*;
598    use crate::lint_context::LintContext;
599
600    fn create_context(content: &str) -> LintContext<'_> {
601        LintContext::new(content, crate::config::MarkdownFlavor::Standard, None)
602    }
603
604    #[test]
605    fn test_correctly_capitalized_names() {
606        let rule = MD044ProperNames::new(
607            vec![
608                "JavaScript".to_string(),
609                "TypeScript".to_string(),
610                "Node.js".to_string(),
611            ],
612            true,
613        );
614
615        let content = "This document uses JavaScript, TypeScript, and Node.js correctly.";
616        let ctx = create_context(content);
617        let result = rule.check(&ctx).unwrap();
618        assert!(result.is_empty(), "Should not flag correctly capitalized names");
619    }
620
621    #[test]
622    fn test_incorrectly_capitalized_names() {
623        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "TypeScript".to_string()], true);
624
625        let content = "This document uses javascript and typescript incorrectly.";
626        let ctx = create_context(content);
627        let result = rule.check(&ctx).unwrap();
628
629        assert_eq!(result.len(), 2, "Should flag two incorrect capitalizations");
630        assert_eq!(result[0].message, "Proper name 'javascript' should be 'JavaScript'");
631        assert_eq!(result[0].line, 1);
632        assert_eq!(result[0].column, 20);
633        assert_eq!(result[1].message, "Proper name 'typescript' should be 'TypeScript'");
634        assert_eq!(result[1].line, 1);
635        assert_eq!(result[1].column, 35);
636    }
637
638    #[test]
639    fn test_names_at_beginning_of_sentences() {
640        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "Python".to_string()], true);
641
642        let content = "javascript is a great language. python is also popular.";
643        let ctx = create_context(content);
644        let result = rule.check(&ctx).unwrap();
645
646        assert_eq!(result.len(), 2, "Should flag names at beginning of sentences");
647        assert_eq!(result[0].line, 1);
648        assert_eq!(result[0].column, 1);
649        assert_eq!(result[1].line, 1);
650        assert_eq!(result[1].column, 33);
651    }
652
653    #[test]
654    fn test_names_in_code_blocks_checked_by_default() {
655        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
656
657        let content = r#"Here is some text with JavaScript.
658
659```javascript
660// This javascript should be checked
661const lang = "javascript";
662```
663
664But this javascript should be flagged."#;
665
666        let ctx = create_context(content);
667        let result = rule.check(&ctx).unwrap();
668
669        assert_eq!(result.len(), 3, "Should flag javascript inside and outside code blocks");
670        assert_eq!(result[0].line, 4);
671        assert_eq!(result[1].line, 5);
672        assert_eq!(result[2].line, 8);
673    }
674
675    #[test]
676    fn test_names_in_code_blocks_ignored_when_disabled() {
677        let rule = MD044ProperNames::new(
678            vec!["JavaScript".to_string()],
679            false, // code_blocks = false means skip code blocks
680        );
681
682        let content = r#"```
683javascript in code block
684```"#;
685
686        let ctx = create_context(content);
687        let result = rule.check(&ctx).unwrap();
688
689        assert_eq!(
690            result.len(),
691            0,
692            "Should not flag javascript in code blocks when code_blocks is false"
693        );
694    }
695
696    #[test]
697    fn test_names_in_inline_code_checked_by_default() {
698        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
699
700        let content = "This is `javascript` in inline code and javascript outside.";
701        let ctx = create_context(content);
702        let result = rule.check(&ctx).unwrap();
703
704        // When code_blocks=true, inline code should be checked
705        assert_eq!(result.len(), 2, "Should flag javascript inside and outside inline code");
706        assert_eq!(result[0].column, 10); // javascript in inline code
707        assert_eq!(result[1].column, 41); // javascript outside
708    }
709
710    #[test]
711    fn test_multiple_names_in_same_line() {
712        let rule = MD044ProperNames::new(
713            vec!["JavaScript".to_string(), "TypeScript".to_string(), "React".to_string()],
714            true,
715        );
716
717        let content = "I use javascript, typescript, and react in my projects.";
718        let ctx = create_context(content);
719        let result = rule.check(&ctx).unwrap();
720
721        assert_eq!(result.len(), 3, "Should flag all three incorrect names");
722        assert_eq!(result[0].message, "Proper name 'javascript' should be 'JavaScript'");
723        assert_eq!(result[1].message, "Proper name 'typescript' should be 'TypeScript'");
724        assert_eq!(result[2].message, "Proper name 'react' should be 'React'");
725    }
726
727    #[test]
728    fn test_case_sensitivity() {
729        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
730
731        let content = "JAVASCRIPT, Javascript, javascript, and JavaScript variations.";
732        let ctx = create_context(content);
733        let result = rule.check(&ctx).unwrap();
734
735        assert_eq!(result.len(), 3, "Should flag all incorrect case variations");
736        // JavaScript (correct) should not be flagged
737        assert!(result.iter().all(|w| w.message.contains("should be 'JavaScript'")));
738    }
739
740    #[test]
741    fn test_configuration_with_custom_name_list() {
742        let config = MD044Config {
743            names: vec!["GitHub".to_string(), "GitLab".to_string(), "DevOps".to_string()],
744            code_blocks: true,
745            html_elements: true,
746            html_comments: true,
747        };
748        let rule = MD044ProperNames::from_config_struct(config);
749
750        let content = "We use github, gitlab, and devops for our workflow.";
751        let ctx = create_context(content);
752        let result = rule.check(&ctx).unwrap();
753
754        assert_eq!(result.len(), 3, "Should flag all custom names");
755        assert_eq!(result[0].message, "Proper name 'github' should be 'GitHub'");
756        assert_eq!(result[1].message, "Proper name 'gitlab' should be 'GitLab'");
757        assert_eq!(result[2].message, "Proper name 'devops' should be 'DevOps'");
758    }
759
760    #[test]
761    fn test_empty_configuration() {
762        let rule = MD044ProperNames::new(vec![], true);
763
764        let content = "This has javascript and typescript but no configured names.";
765        let ctx = create_context(content);
766        let result = rule.check(&ctx).unwrap();
767
768        assert!(result.is_empty(), "Should not flag anything with empty configuration");
769    }
770
771    #[test]
772    fn test_names_with_special_characters() {
773        let rule = MD044ProperNames::new(
774            vec!["Node.js".to_string(), "ASP.NET".to_string(), "C++".to_string()],
775            true,
776        );
777
778        let content = "We use nodejs, asp.net, ASP.NET, and c++ in our stack.";
779        let ctx = create_context(content);
780        let result = rule.check(&ctx).unwrap();
781
782        // nodejs should match Node.js (dotless variation)
783        // asp.net should be flagged (wrong case)
784        // ASP.NET should not be flagged (correct)
785        // c++ should be flagged
786        assert_eq!(result.len(), 3, "Should handle special characters correctly");
787
788        let messages: Vec<&str> = result.iter().map(|w| w.message.as_str()).collect();
789        assert!(messages.contains(&"Proper name 'nodejs' should be 'Node.js'"));
790        assert!(messages.contains(&"Proper name 'asp.net' should be 'ASP.NET'"));
791        assert!(messages.contains(&"Proper name 'c++' should be 'C++'"));
792    }
793
794    #[test]
795    fn test_word_boundaries() {
796        let rule = MD044ProperNames::new(vec!["Java".to_string(), "Script".to_string()], true);
797
798        let content = "JavaScript is not java or script, but Java and Script are separate.";
799        let ctx = create_context(content);
800        let result = rule.check(&ctx).unwrap();
801
802        // Should only flag lowercase "java" and "script" as separate words
803        assert_eq!(result.len(), 2, "Should respect word boundaries");
804        assert!(result.iter().any(|w| w.column == 19)); // "java" position
805        assert!(result.iter().any(|w| w.column == 27)); // "script" position
806    }
807
808    #[test]
809    fn test_fix_method() {
810        let rule = MD044ProperNames::new(
811            vec![
812                "JavaScript".to_string(),
813                "TypeScript".to_string(),
814                "Node.js".to_string(),
815            ],
816            true,
817        );
818
819        let content = "I love javascript, typescript, and nodejs!";
820        let ctx = create_context(content);
821        let fixed = rule.fix(&ctx).unwrap();
822
823        assert_eq!(fixed, "I love JavaScript, TypeScript, and Node.js!");
824    }
825
826    #[test]
827    fn test_fix_multiple_occurrences() {
828        let rule = MD044ProperNames::new(vec!["Python".to_string()], true);
829
830        let content = "python is great. I use python daily. PYTHON is powerful.";
831        let ctx = create_context(content);
832        let fixed = rule.fix(&ctx).unwrap();
833
834        assert_eq!(fixed, "Python is great. I use Python daily. Python is powerful.");
835    }
836
837    #[test]
838    fn test_fix_checks_code_blocks_by_default() {
839        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
840
841        let content = r#"I love javascript.
842
843```
844const lang = "javascript";
845```
846
847More javascript here."#;
848
849        let ctx = create_context(content);
850        let fixed = rule.fix(&ctx).unwrap();
851
852        let expected = r#"I love JavaScript.
853
854```
855const lang = "JavaScript";
856```
857
858More JavaScript here."#;
859
860        assert_eq!(fixed, expected);
861    }
862
863    #[test]
864    fn test_multiline_content() {
865        let rule = MD044ProperNames::new(vec!["Rust".to_string(), "Python".to_string()], true);
866
867        let content = r#"First line with rust.
868Second line with python.
869Third line with RUST and PYTHON."#;
870
871        let ctx = create_context(content);
872        let result = rule.check(&ctx).unwrap();
873
874        assert_eq!(result.len(), 4, "Should flag all incorrect occurrences");
875        assert_eq!(result[0].line, 1);
876        assert_eq!(result[1].line, 2);
877        assert_eq!(result[2].line, 3);
878        assert_eq!(result[3].line, 3);
879    }
880
881    #[test]
882    fn test_default_config() {
883        let config = MD044Config::default();
884        assert!(config.names.is_empty());
885        assert!(!config.code_blocks); // Default is false (skip code blocks)
886    }
887
888    #[test]
889    fn test_performance_with_many_names() {
890        let mut names = vec![];
891        for i in 0..50 {
892            names.push(format!("ProperName{i}"));
893        }
894
895        let rule = MD044ProperNames::new(names, true);
896
897        let content = "This has propername0, propername25, and propername49 incorrectly.";
898        let ctx = create_context(content);
899        let result = rule.check(&ctx).unwrap();
900
901        assert_eq!(result.len(), 3, "Should handle many configured names efficiently");
902    }
903
904    #[test]
905    fn test_large_name_count_performance() {
906        // Verify MD044 can handle large numbers of names without regex limitations
907        // This test confirms that fancy-regex handles large patterns well
908        let names = (0..1000).map(|i| format!("ProperName{i}")).collect::<Vec<_>>();
909
910        let rule = MD044ProperNames::new(names, true);
911
912        // The combined pattern should be created successfully
913        assert!(rule.combined_pattern.is_some());
914
915        // Should be able to check content without errors
916        let content = "This has propername0 and propername999 in it.";
917        let ctx = create_context(content);
918        let result = rule.check(&ctx).unwrap();
919
920        // Should detect both incorrect names
921        assert_eq!(result.len(), 2, "Should handle 1000 names without issues");
922    }
923
924    #[test]
925    fn test_cache_behavior() {
926        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
927
928        let content = "Using javascript here.";
929        let ctx = create_context(content);
930
931        // First check
932        let result1 = rule.check(&ctx).unwrap();
933        assert_eq!(result1.len(), 1);
934
935        // Second check should use cache
936        let result2 = rule.check(&ctx).unwrap();
937        assert_eq!(result2.len(), 1);
938
939        // Results should be identical
940        assert_eq!(result1[0].line, result2[0].line);
941        assert_eq!(result1[0].column, result2[0].column);
942    }
943
944    #[test]
945    fn test_html_comments_not_checked_when_disabled() {
946        let config = MD044Config {
947            names: vec!["JavaScript".to_string()],
948            code_blocks: true,    // Check code blocks
949            html_elements: true,  // Check HTML elements
950            html_comments: false, // Don't check HTML comments
951        };
952        let rule = MD044ProperNames::from_config_struct(config);
953
954        let content = r#"Regular javascript here.
955<!-- This javascript in HTML comment should be ignored -->
956More javascript outside."#;
957
958        let ctx = create_context(content);
959        let result = rule.check(&ctx).unwrap();
960
961        assert_eq!(result.len(), 2, "Should only flag javascript outside HTML comments");
962        assert_eq!(result[0].line, 1);
963        assert_eq!(result[1].line, 3);
964    }
965
966    #[test]
967    fn test_html_comments_checked_when_enabled() {
968        let config = MD044Config {
969            names: vec!["JavaScript".to_string()],
970            code_blocks: true,   // Check code blocks
971            html_elements: true, // Check HTML elements
972            html_comments: true, // Check HTML comments
973        };
974        let rule = MD044ProperNames::from_config_struct(config);
975
976        let content = r#"Regular javascript here.
977<!-- This javascript in HTML comment should be checked -->
978More javascript outside."#;
979
980        let ctx = create_context(content);
981        let result = rule.check(&ctx).unwrap();
982
983        assert_eq!(
984            result.len(),
985            3,
986            "Should flag all javascript occurrences including in HTML comments"
987        );
988    }
989
990    #[test]
991    fn test_multiline_html_comments() {
992        let config = MD044Config {
993            names: vec!["Python".to_string(), "JavaScript".to_string()],
994            code_blocks: true,    // Check code blocks
995            html_elements: true,  // Check HTML elements
996            html_comments: false, // Don't check HTML comments
997        };
998        let rule = MD044ProperNames::from_config_struct(config);
999
1000        let content = r#"Regular python here.
1001<!--
1002This is a multiline comment
1003with javascript and python
1004that should be ignored
1005-->
1006More javascript outside."#;
1007
1008        let ctx = create_context(content);
1009        let result = rule.check(&ctx).unwrap();
1010
1011        assert_eq!(result.len(), 2, "Should only flag names outside HTML comments");
1012        assert_eq!(result[0].line, 1); // python
1013        assert_eq!(result[1].line, 7); // javascript
1014    }
1015
1016    #[test]
1017    fn test_fix_preserves_html_comments_when_disabled() {
1018        let config = MD044Config {
1019            names: vec!["JavaScript".to_string()],
1020            code_blocks: true,    // Check code blocks
1021            html_elements: true,  // Check HTML elements
1022            html_comments: false, // Don't check HTML comments
1023        };
1024        let rule = MD044ProperNames::from_config_struct(config);
1025
1026        let content = r#"javascript here.
1027<!-- javascript in comment -->
1028More javascript."#;
1029
1030        let ctx = create_context(content);
1031        let fixed = rule.fix(&ctx).unwrap();
1032
1033        let expected = r#"JavaScript here.
1034<!-- javascript in comment -->
1035More JavaScript."#;
1036
1037        assert_eq!(
1038            fixed, expected,
1039            "Should not fix names inside HTML comments when disabled"
1040        );
1041    }
1042
1043    #[test]
1044    fn test_proper_names_in_link_text_are_flagged() {
1045        let rule = MD044ProperNames::new(
1046            vec!["JavaScript".to_string(), "Node.js".to_string(), "Python".to_string()],
1047            true,
1048        );
1049
1050        let content = r#"Check this [javascript documentation](https://javascript.info) for info.
1051
1052Visit [node.js homepage](https://nodejs.org) and [python tutorial](https://python.org).
1053
1054Real javascript should be flagged.
1055
1056Also see the [typescript guide][ts-ref] for more.
1057
1058Real python should be flagged too.
1059
1060[ts-ref]: https://typescript.org/handbook"#;
1061
1062        let ctx = create_context(content);
1063        let result = rule.check(&ctx).unwrap();
1064
1065        // Link text should be checked, URLs should not be checked
1066        // Line 1: [javascript documentation] - "javascript" should be flagged
1067        // Line 3: [node.js homepage] - "node.js" should be flagged (matches "Node.js")
1068        // Line 3: [python tutorial] - "python" should be flagged
1069        // Line 5: standalone javascript
1070        // Line 9: standalone python
1071        assert_eq!(result.len(), 5, "Expected 5 warnings: 3 in link text + 2 standalone");
1072
1073        // Verify line numbers for link text warnings
1074        let line_1_warnings: Vec<_> = result.iter().filter(|w| w.line == 1).collect();
1075        assert_eq!(line_1_warnings.len(), 1);
1076        assert!(
1077            line_1_warnings[0]
1078                .message
1079                .contains("'javascript' should be 'JavaScript'")
1080        );
1081
1082        let line_3_warnings: Vec<_> = result.iter().filter(|w| w.line == 3).collect();
1083        assert_eq!(line_3_warnings.len(), 2); // node.js and python
1084
1085        // Standalone warnings
1086        assert!(result.iter().any(|w| w.line == 5 && w.message.contains("'javascript'")));
1087        assert!(result.iter().any(|w| w.line == 9 && w.message.contains("'python'")));
1088    }
1089
1090    #[test]
1091    fn test_link_urls_not_flagged() {
1092        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1093
1094        // URL contains "javascript" but should NOT be flagged
1095        let content = r#"[Link Text](https://javascript.info/guide)"#;
1096
1097        let ctx = create_context(content);
1098        let result = rule.check(&ctx).unwrap();
1099
1100        // URL should not be checked
1101        assert!(result.is_empty(), "URLs should not be checked for proper names");
1102    }
1103
1104    #[test]
1105    fn test_proper_names_in_image_alt_text_are_flagged() {
1106        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1107
1108        let content = r#"Here is a ![javascript logo](javascript.png "javascript icon") image.
1109
1110Real javascript should be flagged."#;
1111
1112        let ctx = create_context(content);
1113        let result = rule.check(&ctx).unwrap();
1114
1115        // Image alt text should be checked, URL and title should not be checked
1116        // Line 1: ![javascript logo] - "javascript" should be flagged
1117        // Line 3: standalone javascript
1118        assert_eq!(result.len(), 2, "Expected 2 warnings: 1 in alt text + 1 standalone");
1119        assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
1120        assert!(result[0].line == 1); // "![javascript logo]"
1121        assert!(result[1].message.contains("'javascript' should be 'JavaScript'"));
1122        assert!(result[1].line == 3); // "Real javascript should be flagged."
1123    }
1124
1125    #[test]
1126    fn test_image_urls_not_flagged() {
1127        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1128
1129        // URL contains "javascript" but should NOT be flagged
1130        let content = r#"![Logo](https://javascript.info/logo.png)"#;
1131
1132        let ctx = create_context(content);
1133        let result = rule.check(&ctx).unwrap();
1134
1135        // Image URL should not be checked
1136        assert!(result.is_empty(), "Image URLs should not be checked for proper names");
1137    }
1138
1139    #[test]
1140    fn test_reference_link_text_flagged_but_definition_not() {
1141        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "TypeScript".to_string()], true);
1142
1143        let content = r#"Check the [javascript guide][js-ref] for details.
1144
1145Real javascript should be flagged.
1146
1147[js-ref]: https://javascript.info/typescript/guide"#;
1148
1149        let ctx = create_context(content);
1150        let result = rule.check(&ctx).unwrap();
1151
1152        // Link text should be checked, reference definitions should not
1153        // Line 1: [javascript guide] - should be flagged
1154        // Line 3: standalone javascript - should be flagged
1155        // Line 5: reference definition - should NOT be flagged
1156        assert_eq!(result.len(), 2, "Expected 2 warnings: 1 in link text + 1 standalone");
1157        assert!(result.iter().any(|w| w.line == 1 && w.message.contains("'javascript'")));
1158        assert!(result.iter().any(|w| w.line == 3 && w.message.contains("'javascript'")));
1159    }
1160
1161    #[test]
1162    fn test_reference_definitions_not_flagged() {
1163        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1164
1165        // Reference definition should NOT be flagged
1166        let content = r#"[js-ref]: https://javascript.info/guide"#;
1167
1168        let ctx = create_context(content);
1169        let result = rule.check(&ctx).unwrap();
1170
1171        // Reference definition URLs should not be checked
1172        assert!(result.is_empty(), "Reference definitions should not be checked");
1173    }
1174
1175    #[test]
1176    fn test_wikilinks_text_is_flagged() {
1177        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1178
1179        // WikiLinks [[destination]] should have their text checked
1180        let content = r#"[[javascript]]
1181
1182Regular javascript here.
1183
1184[[JavaScript|display text]]"#;
1185
1186        let ctx = create_context(content);
1187        let result = rule.check(&ctx).unwrap();
1188
1189        // Line 1: [[javascript]] - should be flagged (WikiLink text)
1190        // Line 3: standalone javascript - should be flagged
1191        // Line 5: [[JavaScript|display text]] - correct capitalization, no flag
1192        assert_eq!(result.len(), 2, "Expected 2 warnings: 1 in WikiLink + 1 standalone");
1193        assert!(
1194            result
1195                .iter()
1196                .any(|w| w.line == 1 && w.column == 3 && w.message.contains("'javascript'"))
1197        );
1198        assert!(result.iter().any(|w| w.line == 3 && w.message.contains("'javascript'")));
1199    }
1200}