rumdl_lib/rules/
md044_proper_names.rs

1use crate::utils::fast_hash;
2use crate::utils::regex_cache::{escape_regex, get_cached_fancy_regex};
3
4use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, Severity};
5use fancy_regex::Regex;
6use std::collections::{HashMap, HashSet};
7use std::sync::LazyLock;
8use std::sync::{Arc, Mutex};
9
10mod md044_config;
11use md044_config::MD044Config;
12
// Matches a complete HTML comment `<!-- ... -->`. The body is matched lazily and
// `[\s\S]` lets it span line breaks, so multi-line comments are one match.
static HTML_COMMENT_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<!--([\s\S]*?)-->").unwrap());
// Reference definition pattern - matches [ref]: url "title"
// Up to three leading spaces are allowed, and the title (double- or single-quoted)
// is optional. Multi-line mode anchors the pattern to individual lines.
static REF_DEF_REGEX: LazyLock<regex::Regex> = LazyLock::new(|| {
    regex::Regex::new(r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*([^\s]+)(?:\s+(?:"([^"]*)"|'([^']*)'))?$"#).unwrap()
});

// One located violation. `line` is 1-based, `column` is the 1-based byte column of
// the match within its line, and `found_name` is the text exactly as written.
type WarningPosition = (usize, usize, String); // (line, column, found_name)
20
21/// Rule MD044: Proper names should be capitalized
22///
23/// See [docs/md044.md](../../docs/md044.md) for full documentation, configuration, and examples.
24///
25/// This rule is triggered when proper names are not capitalized correctly in the document.
26/// For example, if you have defined "JavaScript" as a proper name, the rule will flag any
27/// occurrences of "javascript" or "Javascript" as violations.
28///
29/// ## Purpose
30///
31/// Ensuring consistent capitalization of proper names improves document quality and
32/// professionalism. This is especially important for technical documentation where
33/// product names, programming languages, and technologies often have specific
34/// capitalization conventions.
35///
36/// ## Configuration Options
37///
38/// The rule supports the following configuration options:
39///
40/// ```yaml
41/// MD044:
42///   names: []                # List of proper names to check for correct capitalization
43///   code-blocks: false       # Whether to check code blocks (default: false)
44/// ```
45///
46/// Example configuration:
47///
48/// ```yaml
49/// MD044:
50///   names: ["JavaScript", "Node.js", "TypeScript"]
51///   code-blocks: true
52/// ```
53///
54/// ## Performance Optimizations
55///
56/// This rule implements several performance optimizations:
57///
58/// 1. **Regex Caching**: Pre-compiles and caches regex patterns for each proper name
59/// 2. **Content Caching**: Caches results based on content hashing for repeated checks
60/// 3. **Efficient Text Processing**: Uses optimized algorithms to avoid redundant text processing
61/// 4. **Smart Code Block Detection**: Efficiently identifies and optionally excludes code blocks
62///
63/// ## Edge Cases Handled
64///
65/// - **Word Boundaries**: Only matches complete words, not substrings within other words
66/// - **Case Sensitivity**: Properly handles case-specific matching
67/// - **Code Blocks**: Optionally checks code blocks (controlled by code-blocks setting)
68/// - **Markdown Formatting**: Handles proper names within Markdown formatting elements
69///
70/// ## Fix Behavior
71///
72/// When fixing issues, this rule replaces incorrect capitalization with the correct form
73/// as defined in the configuration.
74///
#[derive(Clone)]
pub struct MD044ProperNames {
    // Rule settings: the proper names plus the code-block / HTML toggles
    config: MD044Config,
    // Cache the combined regex pattern string
    // (`None` when no names are configured; the compiled regex is fetched by this string)
    combined_pattern: Option<String>,
    // Precomputed lowercase name variants for fast pre-checks
    // (lowercase, dot-less and ASCII-folded spellings of every configured name)
    name_variants: Vec<String>,
    // Cache for name violations by content hash
    // Wrapped in `Arc<Mutex<..>>`, so clones of the rule share the same cache
    content_cache: Arc<Mutex<HashMap<u64, Vec<WarningPosition>>>>,
}
85
86impl MD044ProperNames {
87    pub fn new(names: Vec<String>, code_blocks: bool) -> Self {
88        let config = MD044Config {
89            names,
90            code_blocks,
91            html_elements: true, // Default to checking HTML elements
92            html_comments: true, // Default to checking HTML comments
93        };
94        let combined_pattern = Self::create_combined_pattern(&config);
95        let name_variants = Self::build_name_variants(&config);
96        Self {
97            config,
98            combined_pattern,
99            name_variants,
100            content_cache: Arc::new(Mutex::new(HashMap::new())),
101        }
102    }
103
104    // Helper function for consistent ASCII normalization
105    fn ascii_normalize(s: &str) -> String {
106        s.replace(['é', 'è', 'ê', 'ë'], "e")
107            .replace(['à', 'á', 'â', 'ä', 'ã', 'å'], "a")
108            .replace(['ï', 'î', 'í', 'ì'], "i")
109            .replace(['ü', 'ú', 'ù', 'û'], "u")
110            .replace(['ö', 'ó', 'ò', 'ô', 'õ'], "o")
111            .replace('ñ', "n")
112            .replace('ç', "c")
113    }
114
115    pub fn from_config_struct(config: MD044Config) -> Self {
116        let combined_pattern = Self::create_combined_pattern(&config);
117        let name_variants = Self::build_name_variants(&config);
118        Self {
119            config,
120            combined_pattern,
121            name_variants,
122            content_cache: Arc::new(Mutex::new(HashMap::new())),
123        }
124    }
125
126    // Create a combined regex pattern for all proper names
127    fn create_combined_pattern(config: &MD044Config) -> Option<String> {
128        if config.names.is_empty() {
129            return None;
130        }
131
132        // Create patterns for all names and their variations
133        let mut patterns: Vec<String> = config
134            .names
135            .iter()
136            .flat_map(|name| {
137                let mut variations = vec![];
138                let lower_name = name.to_lowercase();
139
140                // Add the lowercase version
141                variations.push(escape_regex(&lower_name));
142
143                // Add version without dots
144                let lower_name_no_dots = lower_name.replace('.', "");
145                if lower_name != lower_name_no_dots {
146                    variations.push(escape_regex(&lower_name_no_dots));
147                }
148
149                // Add ASCII-normalized versions for common accented characters
150                let ascii_normalized = Self::ascii_normalize(&lower_name);
151
152                if ascii_normalized != lower_name {
153                    variations.push(escape_regex(&ascii_normalized));
154
155                    // Also add version without dots
156                    let ascii_no_dots = ascii_normalized.replace('.', "");
157                    if ascii_normalized != ascii_no_dots {
158                        variations.push(escape_regex(&ascii_no_dots));
159                    }
160                }
161
162                variations
163            })
164            .collect();
165
166        // Sort patterns by length (longest first) to avoid shorter patterns matching within longer ones
167        patterns.sort_by_key(|b| std::cmp::Reverse(b.len()));
168
169        // Combine all patterns into a single regex with capture groups
170        // Don't use \b as it doesn't work with Unicode - we'll check boundaries manually
171        Some(format!(r"(?i)({})", patterns.join("|")))
172    }
173
174    fn build_name_variants(config: &MD044Config) -> Vec<String> {
175        let mut variants = HashSet::new();
176        for name in &config.names {
177            let lower_name = name.to_lowercase();
178            variants.insert(lower_name.clone());
179
180            let lower_no_dots = lower_name.replace('.', "");
181            if lower_name != lower_no_dots {
182                variants.insert(lower_no_dots);
183            }
184
185            let ascii_normalized = Self::ascii_normalize(&lower_name);
186            if ascii_normalized != lower_name {
187                variants.insert(ascii_normalized.clone());
188
189                let ascii_no_dots = ascii_normalized.replace('.', "");
190                if ascii_normalized != ascii_no_dots {
191                    variants.insert(ascii_no_dots);
192                }
193            }
194        }
195
196        variants.into_iter().collect()
197    }
198
199    // Find all name violations in the content and return positions
200    fn find_name_violations(&self, content: &str, ctx: &crate::lint_context::LintContext) -> Vec<WarningPosition> {
201        // Early return: if no names configured or content is empty
202        if self.config.names.is_empty() || content.is_empty() || self.combined_pattern.is_none() {
203            return Vec::new();
204        }
205
206        // Early return: quick check if any of the configured names might be in content
207        let content_lower = if content.is_ascii() {
208            content.to_ascii_lowercase()
209        } else {
210            content.to_lowercase()
211        };
212        let has_potential_matches = self.name_variants.iter().any(|name| content_lower.contains(name));
213
214        if !has_potential_matches {
215            return Vec::new();
216        }
217
218        // Check if we have cached results
219        let hash = fast_hash(content);
220        {
221            // Use a separate scope for borrowing to minimize lock time
222            if let Ok(cache) = self.content_cache.lock()
223                && let Some(cached) = cache.get(&hash)
224            {
225                return cached.clone();
226            }
227        }
228
229        let mut violations = Vec::new();
230
231        // Get the regex from global cache
232        let combined_regex = match &self.combined_pattern {
233            Some(pattern) => match get_cached_fancy_regex(pattern) {
234                Ok(regex) => regex,
235                Err(_) => return Vec::new(),
236            },
237            None => return Vec::new(),
238        };
239
240        // Use ctx.lines for better performance
241        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
242            let line_num = line_idx + 1;
243            let line = line_info.content(ctx.content);
244
245            // Skip code fence lines (```language or ~~~language)
246            let trimmed = line.trim_start();
247            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
248                continue;
249            }
250
251            // Skip if in code block (when code_blocks = false)
252            if !self.config.code_blocks && line_info.in_code_block {
253                continue;
254            }
255
256            // Skip if in HTML block (when html_elements = false)
257            if !self.config.html_elements && line_info.in_html_block {
258                continue;
259            }
260
261            // Check if we should skip HTML comments
262            let in_html_comment = if !self.config.html_comments {
263                // Check if this position is within an HTML comment
264                self.is_in_html_comment(content, line_info.byte_offset)
265            } else {
266                false
267            };
268
269            if in_html_comment {
270                continue;
271            }
272
273            // Skip JSX expressions and MDX comments (MDX flavor)
274            if line_info.in_jsx_expression || line_info.in_mdx_comment {
275                continue;
276            }
277
278            // Skip Obsidian comments (Obsidian flavor)
279            if line_info.in_obsidian_comment {
280                continue;
281            }
282
283            // Early return: skip lines that don't contain any potential matches
284            let line_lower = line.to_lowercase();
285            let has_line_matches = self.name_variants.iter().any(|name| line_lower.contains(name));
286
287            if !has_line_matches {
288                continue;
289            }
290
291            // Use the combined regex to find all matches in one pass
292            for cap_result in combined_regex.find_iter(line) {
293                match cap_result {
294                    Ok(cap) => {
295                        let found_name = &line[cap.start()..cap.end()];
296
297                        // Check word boundaries manually for Unicode support
298                        let start_pos = cap.start();
299                        let end_pos = cap.end();
300
301                        if !self.is_at_word_boundary(line, start_pos, true)
302                            || !self.is_at_word_boundary(line, end_pos, false)
303                        {
304                            continue; // Not at word boundary
305                        }
306
307                        // Skip if in inline code when code_blocks is false
308                        if !self.config.code_blocks {
309                            let byte_pos = line_info.byte_offset + cap.start();
310                            if ctx.is_in_code_block_or_span(byte_pos) {
311                                continue;
312                            }
313                        }
314
315                        // Skip if in link (inline links, reference links, or reference definitions)
316                        let byte_pos = line_info.byte_offset + cap.start();
317                        if self.is_in_link(ctx, byte_pos) {
318                            continue;
319                        }
320
321                        // Find which proper name this matches
322                        if let Some(proper_name) = self.get_proper_name_for(found_name) {
323                            // Only flag if it's not already correct
324                            if found_name != proper_name {
325                                violations.push((line_num, cap.start() + 1, found_name.to_string()));
326                            }
327                        }
328                    }
329                    Err(e) => {
330                        eprintln!("Regex execution error on line {line_num}: {e}");
331                    }
332                }
333            }
334        }
335
336        // Store in cache (ignore if mutex is poisoned)
337        if let Ok(mut cache) = self.content_cache.lock() {
338            cache.insert(hash, violations.clone());
339        }
340        violations
341    }
342
343    // Check if a byte position is within an HTML comment
344    fn is_in_html_comment(&self, content: &str, byte_pos: usize) -> bool {
345        for m in HTML_COMMENT_REGEX.find_iter(content).flatten() {
346            if m.start() <= byte_pos && byte_pos < m.end() {
347                return true;
348            }
349        }
350        false
351    }
352
353    /// Check if a byte position is within a link URL (not link text)
354    ///
355    /// Link text should be checked for proper names, but URLs should be skipped.
356    /// For `[text](url)` - check text, skip url
357    /// For `[text][ref]` - check text, skip reference portion
358    /// For `[[text]]` (WikiLinks) - check text, skip brackets
359    fn is_in_link(&self, ctx: &crate::lint_context::LintContext, byte_pos: usize) -> bool {
360        use pulldown_cmark::LinkType;
361
362        // Check inline and reference links - only skip if position is in URL portion, not text portion
363        for link in &ctx.links {
364            if link.byte_offset <= byte_pos && byte_pos < link.byte_end {
365                // Calculate where the link text starts based on link type
366                // WikiLinks [[text]] start with '[[' so text is at byte_offset + 2
367                // Regular links [text] start with '[' so text is at byte_offset + 1
368                let text_start = if matches!(link.link_type, LinkType::WikiLink { .. }) {
369                    link.byte_offset + 2
370                } else {
371                    link.byte_offset + 1
372                };
373                let text_end = text_start + link.text.len();
374
375                // If position is within the text portion, don't skip (return false)
376                if byte_pos >= text_start && byte_pos < text_end {
377                    return false;
378                }
379
380                // Position is in the URL/reference portion, skip it
381                return true;
382            }
383        }
384
385        // Check images - only skip URL portion, not alt text
386        for image in &ctx.images {
387            if image.byte_offset <= byte_pos && byte_pos < image.byte_end {
388                // Image starts with '![' so alt text starts at byte_offset + 2
389                let alt_start = image.byte_offset + 2;
390                let alt_end = alt_start + image.alt_text.len();
391
392                // If position is within the alt text portion, don't skip (return false)
393                if byte_pos >= alt_start && byte_pos < alt_end {
394                    return false;
395                }
396
397                // Position is in the URL/reference portion, skip it
398                return true;
399            }
400        }
401
402        // Check reference definitions [ref]: url "title" using regex pattern
403        // Skip the entire reference definition line
404        for m in REF_DEF_REGEX.find_iter(ctx.content) {
405            if m.start() <= byte_pos && byte_pos < m.end() {
406                return true;
407            }
408        }
409
410        false
411    }
412
413    // Check if a character is a word boundary (handles Unicode)
414    fn is_word_boundary_char(c: char) -> bool {
415        !c.is_alphanumeric()
416    }
417
418    // Check if position is at a word boundary
419    fn is_at_word_boundary(&self, content: &str, pos: usize, is_start: bool) -> bool {
420        let chars: Vec<char> = content.chars().collect();
421        let char_indices: Vec<(usize, char)> = content.char_indices().collect();
422
423        // Find the character position
424        let char_pos = char_indices.iter().position(|(idx, _)| *idx == pos);
425        if char_pos.is_none() {
426            return true; // If we can't find position, assume boundary
427        }
428        let char_pos = char_pos.unwrap();
429
430        if is_start {
431            // Check character before position
432            if char_pos == 0 {
433                return true; // Start of string
434            }
435            Self::is_word_boundary_char(chars[char_pos - 1])
436        } else {
437            // Check character after position
438            if char_pos >= chars.len() {
439                return true; // End of string
440            }
441            Self::is_word_boundary_char(chars[char_pos])
442        }
443    }
444
445    // Get the proper name that should be used for a found name
446    fn get_proper_name_for(&self, found_name: &str) -> Option<String> {
447        let found_lower = found_name.to_lowercase();
448
449        // Iterate through the configured proper names
450        for name in &self.config.names {
451            let lower_name = name.to_lowercase();
452            let lower_name_no_dots = lower_name.replace('.', "");
453
454            // Direct match
455            if found_lower == lower_name || found_lower == lower_name_no_dots {
456                return Some(name.clone());
457            }
458
459            // Check ASCII-normalized version
460            let ascii_normalized = Self::ascii_normalize(&lower_name);
461
462            let ascii_no_dots = ascii_normalized.replace('.', "");
463
464            if found_lower == ascii_normalized || found_lower == ascii_no_dots {
465                return Some(name.clone());
466            }
467        }
468        None
469    }
470}
471
472impl Rule for MD044ProperNames {
    /// Returns the rule identifier, `"MD044"`.
    fn name(&self) -> &'static str {
        "MD044"
    }
476
    /// Returns the one-line, human-readable summary of what the rule enforces.
    fn description(&self) -> &'static str {
        "Proper names should have the correct capitalization"
    }
480
481    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
482        if self.config.names.is_empty() {
483            return true;
484        }
485        // Quick check if any configured names exist (case-insensitive)
486        let content_lower = ctx.content.to_lowercase();
487        !self
488            .config
489            .names
490            .iter()
491            .any(|name| content_lower.contains(&name.to_lowercase()))
492    }
493
494    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
495        let content = ctx.content;
496        if content.is_empty() || self.config.names.is_empty() || self.combined_pattern.is_none() {
497            return Ok(Vec::new());
498        }
499
500        // Early return: quick check if any of the configured names might be in content
501        let content_lower = content.to_lowercase();
502        let has_potential_matches = self.config.names.iter().any(|name| {
503            let name_lower = name.to_lowercase();
504            let name_no_dots = name_lower.replace('.', "");
505
506            // Check direct match
507            if content_lower.contains(&name_lower) || content_lower.contains(&name_no_dots) {
508                return true;
509            }
510
511            // Also check ASCII-normalized version
512            let ascii_normalized = Self::ascii_normalize(&name_lower);
513
514            if ascii_normalized != name_lower {
515                if content_lower.contains(&ascii_normalized) {
516                    return true;
517                }
518                let ascii_no_dots = ascii_normalized.replace('.', "");
519                if ascii_normalized != ascii_no_dots && content_lower.contains(&ascii_no_dots) {
520                    return true;
521                }
522            }
523
524            false
525        });
526
527        if !has_potential_matches {
528            return Ok(Vec::new());
529        }
530
531        let line_index = &ctx.line_index;
532        let violations = self.find_name_violations(content, ctx);
533
534        let warnings = violations
535            .into_iter()
536            .filter_map(|(line, column, found_name)| {
537                self.get_proper_name_for(&found_name).map(|proper_name| LintWarning {
538                    rule_name: Some(self.name().to_string()),
539                    line,
540                    column,
541                    end_line: line,
542                    end_column: column + found_name.len(),
543                    message: format!("Proper name '{found_name}' should be '{proper_name}'"),
544                    severity: Severity::Warning,
545                    fix: Some(Fix {
546                        range: line_index.line_col_to_byte_range(line, column),
547                        replacement: proper_name,
548                    }),
549                })
550            })
551            .collect();
552
553        Ok(warnings)
554    }
555
556    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
557        let content = ctx.content;
558        if content.is_empty() || self.config.names.is_empty() {
559            return Ok(content.to_string());
560        }
561
562        let violations = self.find_name_violations(content, ctx);
563        if violations.is_empty() {
564            return Ok(content.to_string());
565        }
566
567        // Process lines and build the fixed content
568        let mut fixed_lines = Vec::new();
569
570        // Group violations by line
571        let mut violations_by_line: HashMap<usize, Vec<(usize, String)>> = HashMap::new();
572        for (line_num, col_num, found_name) in violations {
573            violations_by_line
574                .entry(line_num)
575                .or_default()
576                .push((col_num, found_name));
577        }
578
579        // Sort violations within each line in reverse order
580        for violations in violations_by_line.values_mut() {
581            violations.sort_by_key(|b| std::cmp::Reverse(b.0));
582        }
583
584        // Process each line
585        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
586            let line_num = line_idx + 1;
587
588            if let Some(line_violations) = violations_by_line.get(&line_num) {
589                // This line has violations, fix them
590                let mut fixed_line = line_info.content(ctx.content).to_string();
591
592                for (col_num, found_name) in line_violations {
593                    if let Some(proper_name) = self.get_proper_name_for(found_name) {
594                        let start_col = col_num - 1; // Convert to 0-based
595                        let end_col = start_col + found_name.len();
596
597                        if end_col <= fixed_line.len()
598                            && fixed_line.is_char_boundary(start_col)
599                            && fixed_line.is_char_boundary(end_col)
600                        {
601                            fixed_line.replace_range(start_col..end_col, &proper_name);
602                        }
603                    }
604                }
605
606                fixed_lines.push(fixed_line);
607            } else {
608                // No violations on this line, keep it as is
609                fixed_lines.push(line_info.content(ctx.content).to_string());
610            }
611        }
612
613        // Join lines with newlines, preserving the original ending
614        let mut result = fixed_lines.join("\n");
615        if content.ends_with('\n') && !result.ends_with('\n') {
616            result.push('\n');
617        }
618        Ok(result)
619    }
620
    /// Exposes the rule as `&dyn Any` so callers can downcast to the concrete type.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
624
625    fn default_config_section(&self) -> Option<(String, toml::Value)> {
626        let json_value = serde_json::to_value(&self.config).ok()?;
627        Some((
628            self.name().to_string(),
629            crate::rule_config_serde::json_to_toml_value(&json_value)?,
630        ))
631    }
632
633    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
634    where
635        Self: Sized,
636    {
637        let rule_config = crate::rule_config_serde::load_rule_config::<MD044Config>(config);
638        Box::new(Self::from_config_struct(rule_config))
639    }
640}
641
642#[cfg(test)]
643mod tests {
644    use super::*;
645    use crate::lint_context::LintContext;
646
    /// Test helper: wraps `content` in a `LintContext` using the Standard
    /// Markdown flavor, passing `None` as the third constructor argument.
    fn create_context(content: &str) -> LintContext<'_> {
        LintContext::new(content, crate::config::MarkdownFlavor::Standard, None)
    }
650
651    #[test]
652    fn test_correctly_capitalized_names() {
653        let rule = MD044ProperNames::new(
654            vec![
655                "JavaScript".to_string(),
656                "TypeScript".to_string(),
657                "Node.js".to_string(),
658            ],
659            true,
660        );
661
662        let content = "This document uses JavaScript, TypeScript, and Node.js correctly.";
663        let ctx = create_context(content);
664        let result = rule.check(&ctx).unwrap();
665        assert!(result.is_empty(), "Should not flag correctly capitalized names");
666    }
667
668    #[test]
669    fn test_incorrectly_capitalized_names() {
670        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "TypeScript".to_string()], true);
671
672        let content = "This document uses javascript and typescript incorrectly.";
673        let ctx = create_context(content);
674        let result = rule.check(&ctx).unwrap();
675
676        assert_eq!(result.len(), 2, "Should flag two incorrect capitalizations");
677        assert_eq!(result[0].message, "Proper name 'javascript' should be 'JavaScript'");
678        assert_eq!(result[0].line, 1);
679        assert_eq!(result[0].column, 20);
680        assert_eq!(result[1].message, "Proper name 'typescript' should be 'TypeScript'");
681        assert_eq!(result[1].line, 1);
682        assert_eq!(result[1].column, 35);
683    }
684
685    #[test]
686    fn test_names_at_beginning_of_sentences() {
687        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "Python".to_string()], true);
688
689        let content = "javascript is a great language. python is also popular.";
690        let ctx = create_context(content);
691        let result = rule.check(&ctx).unwrap();
692
693        assert_eq!(result.len(), 2, "Should flag names at beginning of sentences");
694        assert_eq!(result[0].line, 1);
695        assert_eq!(result[0].column, 1);
696        assert_eq!(result[1].line, 1);
697        assert_eq!(result[1].column, 33);
698    }
699
700    #[test]
701    fn test_names_in_code_blocks_checked_by_default() {
702        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
703
704        let content = r#"Here is some text with JavaScript.
705
706```javascript
707// This javascript should be checked
708const lang = "javascript";
709```
710
711But this javascript should be flagged."#;
712
713        let ctx = create_context(content);
714        let result = rule.check(&ctx).unwrap();
715
716        assert_eq!(result.len(), 3, "Should flag javascript inside and outside code blocks");
717        assert_eq!(result[0].line, 4);
718        assert_eq!(result[1].line, 5);
719        assert_eq!(result[2].line, 8);
720    }
721
722    #[test]
723    fn test_names_in_code_blocks_ignored_when_disabled() {
724        let rule = MD044ProperNames::new(
725            vec!["JavaScript".to_string()],
726            false, // code_blocks = false means skip code blocks
727        );
728
729        let content = r#"```
730javascript in code block
731```"#;
732
733        let ctx = create_context(content);
734        let result = rule.check(&ctx).unwrap();
735
736        assert_eq!(
737            result.len(),
738            0,
739            "Should not flag javascript in code blocks when code_blocks is false"
740        );
741    }
742
743    #[test]
744    fn test_names_in_inline_code_checked_by_default() {
745        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
746
747        let content = "This is `javascript` in inline code and javascript outside.";
748        let ctx = create_context(content);
749        let result = rule.check(&ctx).unwrap();
750
751        // When code_blocks=true, inline code should be checked
752        assert_eq!(result.len(), 2, "Should flag javascript inside and outside inline code");
753        assert_eq!(result[0].column, 10); // javascript in inline code
754        assert_eq!(result[1].column, 41); // javascript outside
755    }
756
757    #[test]
758    fn test_multiple_names_in_same_line() {
759        let rule = MD044ProperNames::new(
760            vec!["JavaScript".to_string(), "TypeScript".to_string(), "React".to_string()],
761            true,
762        );
763
764        let content = "I use javascript, typescript, and react in my projects.";
765        let ctx = create_context(content);
766        let result = rule.check(&ctx).unwrap();
767
768        assert_eq!(result.len(), 3, "Should flag all three incorrect names");
769        assert_eq!(result[0].message, "Proper name 'javascript' should be 'JavaScript'");
770        assert_eq!(result[1].message, "Proper name 'typescript' should be 'TypeScript'");
771        assert_eq!(result[2].message, "Proper name 'react' should be 'React'");
772    }
773
774    #[test]
775    fn test_case_sensitivity() {
776        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
777
778        let content = "JAVASCRIPT, Javascript, javascript, and JavaScript variations.";
779        let ctx = create_context(content);
780        let result = rule.check(&ctx).unwrap();
781
782        assert_eq!(result.len(), 3, "Should flag all incorrect case variations");
783        // JavaScript (correct) should not be flagged
784        assert!(result.iter().all(|w| w.message.contains("should be 'JavaScript'")));
785    }
786
787    #[test]
788    fn test_configuration_with_custom_name_list() {
789        let config = MD044Config {
790            names: vec!["GitHub".to_string(), "GitLab".to_string(), "DevOps".to_string()],
791            code_blocks: true,
792            html_elements: true,
793            html_comments: true,
794        };
795        let rule = MD044ProperNames::from_config_struct(config);
796
797        let content = "We use github, gitlab, and devops for our workflow.";
798        let ctx = create_context(content);
799        let result = rule.check(&ctx).unwrap();
800
801        assert_eq!(result.len(), 3, "Should flag all custom names");
802        assert_eq!(result[0].message, "Proper name 'github' should be 'GitHub'");
803        assert_eq!(result[1].message, "Proper name 'gitlab' should be 'GitLab'");
804        assert_eq!(result[2].message, "Proper name 'devops' should be 'DevOps'");
805    }
806
807    #[test]
808    fn test_empty_configuration() {
809        let rule = MD044ProperNames::new(vec![], true);
810
811        let content = "This has javascript and typescript but no configured names.";
812        let ctx = create_context(content);
813        let result = rule.check(&ctx).unwrap();
814
815        assert!(result.is_empty(), "Should not flag anything with empty configuration");
816    }
817
818    #[test]
819    fn test_names_with_special_characters() {
820        let rule = MD044ProperNames::new(
821            vec!["Node.js".to_string(), "ASP.NET".to_string(), "C++".to_string()],
822            true,
823        );
824
825        let content = "We use nodejs, asp.net, ASP.NET, and c++ in our stack.";
826        let ctx = create_context(content);
827        let result = rule.check(&ctx).unwrap();
828
829        // nodejs should match Node.js (dotless variation)
830        // asp.net should be flagged (wrong case)
831        // ASP.NET should not be flagged (correct)
832        // c++ should be flagged
833        assert_eq!(result.len(), 3, "Should handle special characters correctly");
834
835        let messages: Vec<&str> = result.iter().map(|w| w.message.as_str()).collect();
836        assert!(messages.contains(&"Proper name 'nodejs' should be 'Node.js'"));
837        assert!(messages.contains(&"Proper name 'asp.net' should be 'ASP.NET'"));
838        assert!(messages.contains(&"Proper name 'c++' should be 'C++'"));
839    }
840
841    #[test]
842    fn test_word_boundaries() {
843        let rule = MD044ProperNames::new(vec!["Java".to_string(), "Script".to_string()], true);
844
845        let content = "JavaScript is not java or script, but Java and Script are separate.";
846        let ctx = create_context(content);
847        let result = rule.check(&ctx).unwrap();
848
849        // Should only flag lowercase "java" and "script" as separate words
850        assert_eq!(result.len(), 2, "Should respect word boundaries");
851        assert!(result.iter().any(|w| w.column == 19)); // "java" position
852        assert!(result.iter().any(|w| w.column == 27)); // "script" position
853    }
854
855    #[test]
856    fn test_fix_method() {
857        let rule = MD044ProperNames::new(
858            vec![
859                "JavaScript".to_string(),
860                "TypeScript".to_string(),
861                "Node.js".to_string(),
862            ],
863            true,
864        );
865
866        let content = "I love javascript, typescript, and nodejs!";
867        let ctx = create_context(content);
868        let fixed = rule.fix(&ctx).unwrap();
869
870        assert_eq!(fixed, "I love JavaScript, TypeScript, and Node.js!");
871    }
872
873    #[test]
874    fn test_fix_multiple_occurrences() {
875        let rule = MD044ProperNames::new(vec!["Python".to_string()], true);
876
877        let content = "python is great. I use python daily. PYTHON is powerful.";
878        let ctx = create_context(content);
879        let fixed = rule.fix(&ctx).unwrap();
880
881        assert_eq!(fixed, "Python is great. I use Python daily. Python is powerful.");
882    }
883
884    #[test]
885    fn test_fix_checks_code_blocks_by_default() {
886        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
887
888        let content = r#"I love javascript.
889
890```
891const lang = "javascript";
892```
893
894More javascript here."#;
895
896        let ctx = create_context(content);
897        let fixed = rule.fix(&ctx).unwrap();
898
899        let expected = r#"I love JavaScript.
900
901```
902const lang = "JavaScript";
903```
904
905More JavaScript here."#;
906
907        assert_eq!(fixed, expected);
908    }
909
910    #[test]
911    fn test_multiline_content() {
912        let rule = MD044ProperNames::new(vec!["Rust".to_string(), "Python".to_string()], true);
913
914        let content = r#"First line with rust.
915Second line with python.
916Third line with RUST and PYTHON."#;
917
918        let ctx = create_context(content);
919        let result = rule.check(&ctx).unwrap();
920
921        assert_eq!(result.len(), 4, "Should flag all incorrect occurrences");
922        assert_eq!(result[0].line, 1);
923        assert_eq!(result[1].line, 2);
924        assert_eq!(result[2].line, 3);
925        assert_eq!(result[3].line, 3);
926    }
927
928    #[test]
929    fn test_default_config() {
930        let config = MD044Config::default();
931        assert!(config.names.is_empty());
932        assert!(!config.code_blocks); // Default is false (skip code blocks)
933    }
934
935    #[test]
936    fn test_performance_with_many_names() {
937        let mut names = vec![];
938        for i in 0..50 {
939            names.push(format!("ProperName{i}"));
940        }
941
942        let rule = MD044ProperNames::new(names, true);
943
944        let content = "This has propername0, propername25, and propername49 incorrectly.";
945        let ctx = create_context(content);
946        let result = rule.check(&ctx).unwrap();
947
948        assert_eq!(result.len(), 3, "Should handle many configured names efficiently");
949    }
950
951    #[test]
952    fn test_large_name_count_performance() {
953        // Verify MD044 can handle large numbers of names without regex limitations
954        // This test confirms that fancy-regex handles large patterns well
955        let names = (0..1000).map(|i| format!("ProperName{i}")).collect::<Vec<_>>();
956
957        let rule = MD044ProperNames::new(names, true);
958
959        // The combined pattern should be created successfully
960        assert!(rule.combined_pattern.is_some());
961
962        // Should be able to check content without errors
963        let content = "This has propername0 and propername999 in it.";
964        let ctx = create_context(content);
965        let result = rule.check(&ctx).unwrap();
966
967        // Should detect both incorrect names
968        assert_eq!(result.len(), 2, "Should handle 1000 names without issues");
969    }
970
971    #[test]
972    fn test_cache_behavior() {
973        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
974
975        let content = "Using javascript here.";
976        let ctx = create_context(content);
977
978        // First check
979        let result1 = rule.check(&ctx).unwrap();
980        assert_eq!(result1.len(), 1);
981
982        // Second check should use cache
983        let result2 = rule.check(&ctx).unwrap();
984        assert_eq!(result2.len(), 1);
985
986        // Results should be identical
987        assert_eq!(result1[0].line, result2[0].line);
988        assert_eq!(result1[0].column, result2[0].column);
989    }
990
991    #[test]
992    fn test_html_comments_not_checked_when_disabled() {
993        let config = MD044Config {
994            names: vec!["JavaScript".to_string()],
995            code_blocks: true,    // Check code blocks
996            html_elements: true,  // Check HTML elements
997            html_comments: false, // Don't check HTML comments
998        };
999        let rule = MD044ProperNames::from_config_struct(config);
1000
1001        let content = r#"Regular javascript here.
1002<!-- This javascript in HTML comment should be ignored -->
1003More javascript outside."#;
1004
1005        let ctx = create_context(content);
1006        let result = rule.check(&ctx).unwrap();
1007
1008        assert_eq!(result.len(), 2, "Should only flag javascript outside HTML comments");
1009        assert_eq!(result[0].line, 1);
1010        assert_eq!(result[1].line, 3);
1011    }
1012
1013    #[test]
1014    fn test_html_comments_checked_when_enabled() {
1015        let config = MD044Config {
1016            names: vec!["JavaScript".to_string()],
1017            code_blocks: true,   // Check code blocks
1018            html_elements: true, // Check HTML elements
1019            html_comments: true, // Check HTML comments
1020        };
1021        let rule = MD044ProperNames::from_config_struct(config);
1022
1023        let content = r#"Regular javascript here.
1024<!-- This javascript in HTML comment should be checked -->
1025More javascript outside."#;
1026
1027        let ctx = create_context(content);
1028        let result = rule.check(&ctx).unwrap();
1029
1030        assert_eq!(
1031            result.len(),
1032            3,
1033            "Should flag all javascript occurrences including in HTML comments"
1034        );
1035    }
1036
1037    #[test]
1038    fn test_multiline_html_comments() {
1039        let config = MD044Config {
1040            names: vec!["Python".to_string(), "JavaScript".to_string()],
1041            code_blocks: true,    // Check code blocks
1042            html_elements: true,  // Check HTML elements
1043            html_comments: false, // Don't check HTML comments
1044        };
1045        let rule = MD044ProperNames::from_config_struct(config);
1046
1047        let content = r#"Regular python here.
1048<!--
1049This is a multiline comment
1050with javascript and python
1051that should be ignored
1052-->
1053More javascript outside."#;
1054
1055        let ctx = create_context(content);
1056        let result = rule.check(&ctx).unwrap();
1057
1058        assert_eq!(result.len(), 2, "Should only flag names outside HTML comments");
1059        assert_eq!(result[0].line, 1); // python
1060        assert_eq!(result[1].line, 7); // javascript
1061    }
1062
1063    #[test]
1064    fn test_fix_preserves_html_comments_when_disabled() {
1065        let config = MD044Config {
1066            names: vec!["JavaScript".to_string()],
1067            code_blocks: true,    // Check code blocks
1068            html_elements: true,  // Check HTML elements
1069            html_comments: false, // Don't check HTML comments
1070        };
1071        let rule = MD044ProperNames::from_config_struct(config);
1072
1073        let content = r#"javascript here.
1074<!-- javascript in comment -->
1075More javascript."#;
1076
1077        let ctx = create_context(content);
1078        let fixed = rule.fix(&ctx).unwrap();
1079
1080        let expected = r#"JavaScript here.
1081<!-- javascript in comment -->
1082More JavaScript."#;
1083
1084        assert_eq!(
1085            fixed, expected,
1086            "Should not fix names inside HTML comments when disabled"
1087        );
1088    }
1089
1090    #[test]
1091    fn test_proper_names_in_link_text_are_flagged() {
1092        let rule = MD044ProperNames::new(
1093            vec!["JavaScript".to_string(), "Node.js".to_string(), "Python".to_string()],
1094            true,
1095        );
1096
1097        let content = r#"Check this [javascript documentation](https://javascript.info) for info.
1098
1099Visit [node.js homepage](https://nodejs.org) and [python tutorial](https://python.org).
1100
1101Real javascript should be flagged.
1102
1103Also see the [typescript guide][ts-ref] for more.
1104
1105Real python should be flagged too.
1106
1107[ts-ref]: https://typescript.org/handbook"#;
1108
1109        let ctx = create_context(content);
1110        let result = rule.check(&ctx).unwrap();
1111
1112        // Link text should be checked, URLs should not be checked
1113        // Line 1: [javascript documentation] - "javascript" should be flagged
1114        // Line 3: [node.js homepage] - "node.js" should be flagged (matches "Node.js")
1115        // Line 3: [python tutorial] - "python" should be flagged
1116        // Line 5: standalone javascript
1117        // Line 9: standalone python
1118        assert_eq!(result.len(), 5, "Expected 5 warnings: 3 in link text + 2 standalone");
1119
1120        // Verify line numbers for link text warnings
1121        let line_1_warnings: Vec<_> = result.iter().filter(|w| w.line == 1).collect();
1122        assert_eq!(line_1_warnings.len(), 1);
1123        assert!(
1124            line_1_warnings[0]
1125                .message
1126                .contains("'javascript' should be 'JavaScript'")
1127        );
1128
1129        let line_3_warnings: Vec<_> = result.iter().filter(|w| w.line == 3).collect();
1130        assert_eq!(line_3_warnings.len(), 2); // node.js and python
1131
1132        // Standalone warnings
1133        assert!(result.iter().any(|w| w.line == 5 && w.message.contains("'javascript'")));
1134        assert!(result.iter().any(|w| w.line == 9 && w.message.contains("'python'")));
1135    }
1136
1137    #[test]
1138    fn test_link_urls_not_flagged() {
1139        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1140
1141        // URL contains "javascript" but should NOT be flagged
1142        let content = r#"[Link Text](https://javascript.info/guide)"#;
1143
1144        let ctx = create_context(content);
1145        let result = rule.check(&ctx).unwrap();
1146
1147        // URL should not be checked
1148        assert!(result.is_empty(), "URLs should not be checked for proper names");
1149    }
1150
1151    #[test]
1152    fn test_proper_names_in_image_alt_text_are_flagged() {
1153        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1154
1155        let content = r#"Here is a ![javascript logo](javascript.png "javascript icon") image.
1156
1157Real javascript should be flagged."#;
1158
1159        let ctx = create_context(content);
1160        let result = rule.check(&ctx).unwrap();
1161
1162        // Image alt text should be checked, URL and title should not be checked
1163        // Line 1: ![javascript logo] - "javascript" should be flagged
1164        // Line 3: standalone javascript
1165        assert_eq!(result.len(), 2, "Expected 2 warnings: 1 in alt text + 1 standalone");
1166        assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
1167        assert!(result[0].line == 1); // "![javascript logo]"
1168        assert!(result[1].message.contains("'javascript' should be 'JavaScript'"));
1169        assert!(result[1].line == 3); // "Real javascript should be flagged."
1170    }
1171
1172    #[test]
1173    fn test_image_urls_not_flagged() {
1174        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1175
1176        // URL contains "javascript" but should NOT be flagged
1177        let content = r#"![Logo](https://javascript.info/logo.png)"#;
1178
1179        let ctx = create_context(content);
1180        let result = rule.check(&ctx).unwrap();
1181
1182        // Image URL should not be checked
1183        assert!(result.is_empty(), "Image URLs should not be checked for proper names");
1184    }
1185
1186    #[test]
1187    fn test_reference_link_text_flagged_but_definition_not() {
1188        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "TypeScript".to_string()], true);
1189
1190        let content = r#"Check the [javascript guide][js-ref] for details.
1191
1192Real javascript should be flagged.
1193
1194[js-ref]: https://javascript.info/typescript/guide"#;
1195
1196        let ctx = create_context(content);
1197        let result = rule.check(&ctx).unwrap();
1198
1199        // Link text should be checked, reference definitions should not
1200        // Line 1: [javascript guide] - should be flagged
1201        // Line 3: standalone javascript - should be flagged
1202        // Line 5: reference definition - should NOT be flagged
1203        assert_eq!(result.len(), 2, "Expected 2 warnings: 1 in link text + 1 standalone");
1204        assert!(result.iter().any(|w| w.line == 1 && w.message.contains("'javascript'")));
1205        assert!(result.iter().any(|w| w.line == 3 && w.message.contains("'javascript'")));
1206    }
1207
1208    #[test]
1209    fn test_reference_definitions_not_flagged() {
1210        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1211
1212        // Reference definition should NOT be flagged
1213        let content = r#"[js-ref]: https://javascript.info/guide"#;
1214
1215        let ctx = create_context(content);
1216        let result = rule.check(&ctx).unwrap();
1217
1218        // Reference definition URLs should not be checked
1219        assert!(result.is_empty(), "Reference definitions should not be checked");
1220    }
1221
1222    #[test]
1223    fn test_wikilinks_text_is_flagged() {
1224        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1225
1226        // WikiLinks [[destination]] should have their text checked
1227        let content = r#"[[javascript]]
1228
1229Regular javascript here.
1230
1231[[JavaScript|display text]]"#;
1232
1233        let ctx = create_context(content);
1234        let result = rule.check(&ctx).unwrap();
1235
1236        // Line 1: [[javascript]] - should be flagged (WikiLink text)
1237        // Line 3: standalone javascript - should be flagged
1238        // Line 5: [[JavaScript|display text]] - correct capitalization, no flag
1239        assert_eq!(result.len(), 2, "Expected 2 warnings: 1 in WikiLink + 1 standalone");
1240        assert!(
1241            result
1242                .iter()
1243                .any(|w| w.line == 1 && w.column == 3 && w.message.contains("'javascript'"))
1244        );
1245        assert!(result.iter().any(|w| w.line == 3 && w.message.contains("'javascript'")));
1246    }
1247}
rumdl_lib/rules/md044_proper_names.rs

rumdl_lib/rules/
md044_proper_names.rs