rumdl_lib/rules/
md044_proper_names.rs

1use crate::utils::fast_hash;
2use crate::utils::regex_cache::{escape_regex, get_cached_fancy_regex};
3
4use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, Severity};
5use std::collections::{HashMap, HashSet};
6use std::sync::{Arc, Mutex};
7
8mod md044_config;
9use md044_config::MD044Config;
10
/// A single recorded violation: `(line, column, found_name)`.
///
/// `line` is 1-based. `column` is the 1-based byte offset of the match within its line.
/// `found_name` is the matched text exactly as it appears in the document.
type WarningPosition = (usize, usize, String); // (line, column, found_name)
12
/// Rule MD044: Proper names should be capitalized
///
/// See [docs/md044.md](../../docs/md044.md) for full documentation, configuration, and examples.
///
/// This rule is triggered when proper names are not capitalized correctly in the document.
/// For example, if you have defined "JavaScript" as a proper name, the rule will flag any
/// occurrences of "javascript" or "Javascript" as violations.
///
/// ## Purpose
///
/// Ensuring consistent capitalization of proper names improves document quality and
/// professionalism. This is especially important for technical documentation where
/// product names, programming languages, and technologies often have specific
/// capitalization conventions.
///
/// ## Configuration Options
///
/// The rule supports the following configuration options:
///
/// ```yaml
/// MD044:
///   names: []                # List of proper names to check for correct capitalization
///   code-blocks: false       # Whether to check code blocks (default: false)
/// ```
///
/// Example configuration:
///
/// ```yaml
/// MD044:
///   names: ["JavaScript", "Node.js", "TypeScript"]
///   code-blocks: true
/// ```
///
/// The configuration struct also carries `html_elements` and `html_comments` flags that
/// control whether lines inside HTML blocks and HTML comments are checked;
/// [`MD044ProperNames::new`] enables both.
///
/// ## Performance Optimizations
///
/// This rule implements several performance optimizations:
///
/// 1. **Regex Caching**: Builds one combined pattern for all names at construction time and
///    fetches the compiled regex from the shared regex cache
/// 2. **Content Caching**: Caches results based on content hashing for repeated checks
/// 3. **Cheap Pre-checks**: Skips documents and lines that contain none of the lowercase
///    name variants before running the regex
/// 4. **Code Block Detection**: Uses the per-line flags of the lint context to optionally
///    exclude code blocks
///
/// ## Edge Cases Handled
///
/// - **Word Boundaries**: Only matches complete words, not substrings within other words;
///   a match must be preceded and followed by a non-alphanumeric character (or the line edge)
/// - **Case Sensitivity**: Matching is case-insensitive; only spellings that differ from the
///   configured name are reported
/// - **Dots and Accents**: Dot-less and ASCII-normalized spellings of a name (for example
///   `nodejs` for `Node.js`) are recognised as the same name
/// - **Code Blocks**: Optionally checks code blocks (controlled by code-blocks setting)
/// - **Links**: Link URLs and reference definitions are skipped; link text and image alt
///   text are checked
///
/// ## Fix Behavior
///
/// When fixing issues, this rule replaces incorrect capitalization with the correct form
/// as defined in the configuration.
///
#[derive(Clone)]
pub struct MD044ProperNames {
    // Rule configuration (names plus the code-block / HTML toggles).
    config: MD044Config,
    // Combined regex pattern string covering every name variant; `None` when no names are configured.
    combined_pattern: Option<String>,
    // Precomputed lowercase name variants for fast substring pre-checks.
    name_variants: Vec<String>,
    // Violations keyed by content hash. The `Arc` means clones of the rule share one cache.
    content_cache: Arc<Mutex<HashMap<u64, Vec<WarningPosition>>>>,
}
77
78impl MD044ProperNames {
79    pub fn new(names: Vec<String>, code_blocks: bool) -> Self {
80        let config = MD044Config {
81            names,
82            code_blocks,
83            html_elements: true, // Default to checking HTML elements
84            html_comments: true, // Default to checking HTML comments
85        };
86        let combined_pattern = Self::create_combined_pattern(&config);
87        let name_variants = Self::build_name_variants(&config);
88        Self {
89            config,
90            combined_pattern,
91            name_variants,
92            content_cache: Arc::new(Mutex::new(HashMap::new())),
93        }
94    }
95
96    // Helper function for consistent ASCII normalization
97    fn ascii_normalize(s: &str) -> String {
98        s.replace(['é', 'è', 'ê', 'ë'], "e")
99            .replace(['à', 'á', 'â', 'ä', 'ã', 'å'], "a")
100            .replace(['ï', 'î', 'í', 'ì'], "i")
101            .replace(['ü', 'ú', 'ù', 'û'], "u")
102            .replace(['ö', 'ó', 'ò', 'ô', 'õ'], "o")
103            .replace('ñ', "n")
104            .replace('ç', "c")
105    }
106
107    pub fn from_config_struct(config: MD044Config) -> Self {
108        let combined_pattern = Self::create_combined_pattern(&config);
109        let name_variants = Self::build_name_variants(&config);
110        Self {
111            config,
112            combined_pattern,
113            name_variants,
114            content_cache: Arc::new(Mutex::new(HashMap::new())),
115        }
116    }
117
118    // Create a combined regex pattern for all proper names
119    fn create_combined_pattern(config: &MD044Config) -> Option<String> {
120        if config.names.is_empty() {
121            return None;
122        }
123
124        // Create patterns for all names and their variations
125        let mut patterns: Vec<String> = config
126            .names
127            .iter()
128            .flat_map(|name| {
129                let mut variations = vec![];
130                let lower_name = name.to_lowercase();
131
132                // Add the lowercase version
133                variations.push(escape_regex(&lower_name));
134
135                // Add version without dots
136                let lower_name_no_dots = lower_name.replace('.', "");
137                if lower_name != lower_name_no_dots {
138                    variations.push(escape_regex(&lower_name_no_dots));
139                }
140
141                // Add ASCII-normalized versions for common accented characters
142                let ascii_normalized = Self::ascii_normalize(&lower_name);
143
144                if ascii_normalized != lower_name {
145                    variations.push(escape_regex(&ascii_normalized));
146
147                    // Also add version without dots
148                    let ascii_no_dots = ascii_normalized.replace('.', "");
149                    if ascii_normalized != ascii_no_dots {
150                        variations.push(escape_regex(&ascii_no_dots));
151                    }
152                }
153
154                variations
155            })
156            .collect();
157
158        // Sort patterns by length (longest first) to avoid shorter patterns matching within longer ones
159        patterns.sort_by_key(|b| std::cmp::Reverse(b.len()));
160
161        // Combine all patterns into a single regex with capture groups
162        // Don't use \b as it doesn't work with Unicode - we'll check boundaries manually
163        Some(format!(r"(?i)({})", patterns.join("|")))
164    }
165
166    fn build_name_variants(config: &MD044Config) -> Vec<String> {
167        let mut variants = HashSet::new();
168        for name in &config.names {
169            let lower_name = name.to_lowercase();
170            variants.insert(lower_name.clone());
171
172            let lower_no_dots = lower_name.replace('.', "");
173            if lower_name != lower_no_dots {
174                variants.insert(lower_no_dots);
175            }
176
177            let ascii_normalized = Self::ascii_normalize(&lower_name);
178            if ascii_normalized != lower_name {
179                variants.insert(ascii_normalized.clone());
180
181                let ascii_no_dots = ascii_normalized.replace('.', "");
182                if ascii_normalized != ascii_no_dots {
183                    variants.insert(ascii_no_dots);
184                }
185            }
186        }
187
188        variants.into_iter().collect()
189    }
190
191    // Find all name violations in the content and return positions.
192    // `content_lower` is the pre-computed lowercase version of `content` to avoid redundant allocations.
193    fn find_name_violations(
194        &self,
195        content: &str,
196        ctx: &crate::lint_context::LintContext,
197        content_lower: &str,
198    ) -> Vec<WarningPosition> {
199        // Early return: if no names configured or content is empty
200        if self.config.names.is_empty() || content.is_empty() || self.combined_pattern.is_none() {
201            return Vec::new();
202        }
203
204        // Early return: quick check if any of the configured names might be in content
205        let has_potential_matches = self.name_variants.iter().any(|name| content_lower.contains(name));
206
207        if !has_potential_matches {
208            return Vec::new();
209        }
210
211        // Check if we have cached results
212        let hash = fast_hash(content);
213        {
214            // Use a separate scope for borrowing to minimize lock time
215            if let Ok(cache) = self.content_cache.lock()
216                && let Some(cached) = cache.get(&hash)
217            {
218                return cached.clone();
219            }
220        }
221
222        let mut violations = Vec::new();
223
224        // Get the regex from global cache
225        let combined_regex = match &self.combined_pattern {
226            Some(pattern) => match get_cached_fancy_regex(pattern) {
227                Ok(regex) => regex,
228                Err(_) => return Vec::new(),
229            },
230            None => return Vec::new(),
231        };
232
233        // Use ctx.lines for better performance
234        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
235            let line_num = line_idx + 1;
236            let line = line_info.content(ctx.content);
237
238            // Skip code fence lines (```language or ~~~language)
239            let trimmed = line.trim_start();
240            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
241                continue;
242            }
243
244            // Skip if in code block (when code_blocks = false)
245            if !self.config.code_blocks && line_info.in_code_block {
246                continue;
247            }
248
249            // Skip if in HTML block (when html_elements = false)
250            if !self.config.html_elements && line_info.in_html_block {
251                continue;
252            }
253
254            // Skip HTML comments using pre-computed line flag
255            if !self.config.html_comments && line_info.in_html_comment {
256                continue;
257            }
258
259            // Skip JSX expressions and MDX comments (MDX flavor)
260            if line_info.in_jsx_expression || line_info.in_mdx_comment {
261                continue;
262            }
263
264            // Skip Obsidian comments (Obsidian flavor)
265            if line_info.in_obsidian_comment {
266                continue;
267            }
268
269            // Early return: skip lines that don't contain any potential matches
270            let line_lower = line.to_lowercase();
271            let has_line_matches = self.name_variants.iter().any(|name| line_lower.contains(name));
272
273            if !has_line_matches {
274                continue;
275            }
276
277            // Use the combined regex to find all matches in one pass
278            for cap_result in combined_regex.find_iter(line) {
279                match cap_result {
280                    Ok(cap) => {
281                        let found_name = &line[cap.start()..cap.end()];
282
283                        // Check word boundaries manually for Unicode support
284                        let start_pos = cap.start();
285                        let end_pos = cap.end();
286
287                        if !Self::is_at_word_boundary(line, start_pos, true)
288                            || !Self::is_at_word_boundary(line, end_pos, false)
289                        {
290                            continue; // Not at word boundary
291                        }
292
293                        // Skip if in inline code when code_blocks is false
294                        if !self.config.code_blocks {
295                            let byte_pos = line_info.byte_offset + cap.start();
296                            if ctx.is_in_code_block_or_span(byte_pos) {
297                                continue;
298                            }
299                        }
300
301                        // Skip if in link URL or reference definition
302                        let byte_pos = line_info.byte_offset + cap.start();
303                        if Self::is_in_link(ctx, byte_pos) {
304                            continue;
305                        }
306
307                        // Find which proper name this matches
308                        if let Some(proper_name) = self.get_proper_name_for(found_name) {
309                            // Only flag if it's not already correct
310                            if found_name != proper_name {
311                                violations.push((line_num, cap.start() + 1, found_name.to_string()));
312                            }
313                        }
314                    }
315                    Err(e) => {
316                        eprintln!("Regex execution error on line {line_num}: {e}");
317                    }
318                }
319            }
320        }
321
322        // Store in cache (ignore if mutex is poisoned)
323        if let Ok(mut cache) = self.content_cache.lock() {
324            cache.insert(hash, violations.clone());
325        }
326        violations
327    }
328
329    /// Check if a byte position is within a link URL (not link text)
330    ///
331    /// Link text should be checked for proper names, but URLs should be skipped.
332    /// For `[text](url)` - check text, skip url
333    /// For `[text][ref]` - check text, skip reference portion
334    /// For `[[text]]` (WikiLinks) - check text, skip brackets
335    fn is_in_link(ctx: &crate::lint_context::LintContext, byte_pos: usize) -> bool {
336        use pulldown_cmark::LinkType;
337
338        // Binary search links (sorted by byte_offset) to find candidate containing byte_pos
339        let link_idx = ctx.links.partition_point(|link| link.byte_offset <= byte_pos);
340        if link_idx > 0 {
341            let link = &ctx.links[link_idx - 1];
342            if byte_pos < link.byte_end {
343                // WikiLinks [[text]] start with '[[', regular links [text] start with '['
344                let text_start = if matches!(link.link_type, LinkType::WikiLink { .. }) {
345                    link.byte_offset + 2
346                } else {
347                    link.byte_offset + 1
348                };
349                let text_end = text_start + link.text.len();
350
351                // If position is within the text portion, skip only if text is a URL
352                if byte_pos >= text_start && byte_pos < text_end {
353                    return Self::link_text_is_url(&link.text);
354                }
355                // Position is in the URL/reference portion, skip it
356                return true;
357            }
358        }
359
360        // Binary search images (sorted by byte_offset) to find candidate containing byte_pos
361        let image_idx = ctx.images.partition_point(|img| img.byte_offset <= byte_pos);
362        if image_idx > 0 {
363            let image = &ctx.images[image_idx - 1];
364            if byte_pos < image.byte_end {
365                // Image starts with '![' so alt text starts at byte_offset + 2
366                let alt_start = image.byte_offset + 2;
367                let alt_end = alt_start + image.alt_text.len();
368
369                // If position is within the alt text portion, don't skip
370                if byte_pos >= alt_start && byte_pos < alt_end {
371                    return false;
372                }
373                // Position is in the URL/reference portion, skip it
374                return true;
375            }
376        }
377
378        // Check pre-computed reference definitions
379        ctx.is_in_reference_def(byte_pos)
380    }
381
382    /// Check if link text is a URL that should not have proper name corrections.
383    /// Matches markdownlint behavior: skip text starting with `http://`, `https://`, or `www.`.
384    fn link_text_is_url(text: &str) -> bool {
385        let lower = text.trim().to_ascii_lowercase();
386        lower.starts_with("http://") || lower.starts_with("https://") || lower.starts_with("www.")
387    }
388
389    // Check if a character is a word boundary (handles Unicode)
390    fn is_word_boundary_char(c: char) -> bool {
391        !c.is_alphanumeric()
392    }
393
394    // Check if position is at a word boundary using O(1) byte-level lookups
395    fn is_at_word_boundary(content: &str, pos: usize, is_start: bool) -> bool {
396        if is_start {
397            if pos == 0 {
398                return true;
399            }
400            // Get the character immediately before `pos`
401            match content[..pos].chars().next_back() {
402                None => true,
403                Some(c) => Self::is_word_boundary_char(c),
404            }
405        } else {
406            if pos >= content.len() {
407                return true;
408            }
409            // Get the character at `pos`
410            match content[pos..].chars().next() {
411                None => true,
412                Some(c) => Self::is_word_boundary_char(c),
413            }
414        }
415    }
416
417    // Get the proper name that should be used for a found name
418    fn get_proper_name_for(&self, found_name: &str) -> Option<String> {
419        let found_lower = found_name.to_lowercase();
420
421        // Iterate through the configured proper names
422        for name in &self.config.names {
423            let lower_name = name.to_lowercase();
424            let lower_name_no_dots = lower_name.replace('.', "");
425
426            // Direct match
427            if found_lower == lower_name || found_lower == lower_name_no_dots {
428                return Some(name.clone());
429            }
430
431            // Check ASCII-normalized version
432            let ascii_normalized = Self::ascii_normalize(&lower_name);
433
434            let ascii_no_dots = ascii_normalized.replace('.', "");
435
436            if found_lower == ascii_normalized || found_lower == ascii_no_dots {
437                return Some(name.clone());
438            }
439        }
440        None
441    }
442}
443
444impl Rule for MD044ProperNames {
    /// Identifier of this rule; also used as the rule name in emitted warnings and as
    /// the key of its default configuration section.
    fn name(&self) -> &'static str {
        "MD044"
    }
448
    /// One-line, human-readable summary of what the rule enforces.
    fn description(&self) -> &'static str {
        "Proper names should have the correct capitalization"
    }
452
453    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
454        if self.config.names.is_empty() {
455            return true;
456        }
457        // Quick check if any configured name variants exist (case-insensitive)
458        let content_lower = if ctx.content.is_ascii() {
459            ctx.content.to_ascii_lowercase()
460        } else {
461            ctx.content.to_lowercase()
462        };
463        !self.name_variants.iter().any(|name| content_lower.contains(name))
464    }
465
466    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
467        let content = ctx.content;
468        if content.is_empty() || self.config.names.is_empty() || self.combined_pattern.is_none() {
469            return Ok(Vec::new());
470        }
471
472        // Compute lowercase content once and reuse across all checks
473        let content_lower = if content.is_ascii() {
474            content.to_ascii_lowercase()
475        } else {
476            content.to_lowercase()
477        };
478
479        // Early return: use pre-computed name_variants for the quick check
480        let has_potential_matches = self.name_variants.iter().any(|name| content_lower.contains(name));
481
482        if !has_potential_matches {
483            return Ok(Vec::new());
484        }
485
486        let line_index = &ctx.line_index;
487        let violations = self.find_name_violations(content, ctx, &content_lower);
488
489        let warnings = violations
490            .into_iter()
491            .filter_map(|(line, column, found_name)| {
492                self.get_proper_name_for(&found_name).map(|proper_name| LintWarning {
493                    rule_name: Some(self.name().to_string()),
494                    line,
495                    column,
496                    end_line: line,
497                    end_column: column + found_name.len(),
498                    message: format!("Proper name '{found_name}' should be '{proper_name}'"),
499                    severity: Severity::Warning,
500                    fix: Some(Fix {
501                        range: line_index.line_col_to_byte_range(line, column),
502                        replacement: proper_name,
503                    }),
504                })
505            })
506            .collect();
507
508        Ok(warnings)
509    }
510
511    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
512        let content = ctx.content;
513        if content.is_empty() || self.config.names.is_empty() {
514            return Ok(content.to_string());
515        }
516
517        let content_lower = if content.is_ascii() {
518            content.to_ascii_lowercase()
519        } else {
520            content.to_lowercase()
521        };
522        let violations = self.find_name_violations(content, ctx, &content_lower);
523        if violations.is_empty() {
524            return Ok(content.to_string());
525        }
526
527        // Process lines and build the fixed content
528        let mut fixed_lines = Vec::new();
529
530        // Group violations by line
531        let mut violations_by_line: HashMap<usize, Vec<(usize, String)>> = HashMap::new();
532        for (line_num, col_num, found_name) in violations {
533            violations_by_line
534                .entry(line_num)
535                .or_default()
536                .push((col_num, found_name));
537        }
538
539        // Sort violations within each line in reverse order
540        for violations in violations_by_line.values_mut() {
541            violations.sort_by_key(|b| std::cmp::Reverse(b.0));
542        }
543
544        // Process each line
545        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
546            let line_num = line_idx + 1;
547
548            if let Some(line_violations) = violations_by_line.get(&line_num) {
549                // This line has violations, fix them
550                let mut fixed_line = line_info.content(ctx.content).to_string();
551
552                for (col_num, found_name) in line_violations {
553                    if let Some(proper_name) = self.get_proper_name_for(found_name) {
554                        let start_col = col_num - 1; // Convert to 0-based
555                        let end_col = start_col + found_name.len();
556
557                        if end_col <= fixed_line.len()
558                            && fixed_line.is_char_boundary(start_col)
559                            && fixed_line.is_char_boundary(end_col)
560                        {
561                            fixed_line.replace_range(start_col..end_col, &proper_name);
562                        }
563                    }
564                }
565
566                fixed_lines.push(fixed_line);
567            } else {
568                // No violations on this line, keep it as is
569                fixed_lines.push(line_info.content(ctx.content).to_string());
570            }
571        }
572
573        // Join lines with newlines, preserving the original ending
574        let mut result = fixed_lines.join("\n");
575        if content.ends_with('\n') && !result.ends_with('\n') {
576            result.push('\n');
577        }
578        Ok(result)
579    }
580
    /// Exposes the rule as `&dyn Any` so callers can downcast to the concrete type.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
584
585    fn default_config_section(&self) -> Option<(String, toml::Value)> {
586        let json_value = serde_json::to_value(&self.config).ok()?;
587        Some((
588            self.name().to_string(),
589            crate::rule_config_serde::json_to_toml_value(&json_value)?,
590        ))
591    }
592
593    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
594    where
595        Self: Sized,
596    {
597        let rule_config = crate::rule_config_serde::load_rule_config::<MD044Config>(config);
598        Box::new(Self::from_config_struct(rule_config))
599    }
600}
601
602#[cfg(test)]
603mod tests {
604    use super::*;
605    use crate::lint_context::LintContext;
606
    /// Test helper: wraps `content` in a `LintContext` using the standard Markdown
    /// flavor and `None` for the remaining constructor argument.
    fn create_context(content: &str) -> LintContext<'_> {
        LintContext::new(content, crate::config::MarkdownFlavor::Standard, None)
    }
610
611    #[test]
612    fn test_correctly_capitalized_names() {
613        let rule = MD044ProperNames::new(
614            vec![
615                "JavaScript".to_string(),
616                "TypeScript".to_string(),
617                "Node.js".to_string(),
618            ],
619            true,
620        );
621
622        let content = "This document uses JavaScript, TypeScript, and Node.js correctly.";
623        let ctx = create_context(content);
624        let result = rule.check(&ctx).unwrap();
625        assert!(result.is_empty(), "Should not flag correctly capitalized names");
626    }
627
628    #[test]
629    fn test_incorrectly_capitalized_names() {
630        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "TypeScript".to_string()], true);
631
632        let content = "This document uses javascript and typescript incorrectly.";
633        let ctx = create_context(content);
634        let result = rule.check(&ctx).unwrap();
635
636        assert_eq!(result.len(), 2, "Should flag two incorrect capitalizations");
637        assert_eq!(result[0].message, "Proper name 'javascript' should be 'JavaScript'");
638        assert_eq!(result[0].line, 1);
639        assert_eq!(result[0].column, 20);
640        assert_eq!(result[1].message, "Proper name 'typescript' should be 'TypeScript'");
641        assert_eq!(result[1].line, 1);
642        assert_eq!(result[1].column, 35);
643    }
644
645    #[test]
646    fn test_names_at_beginning_of_sentences() {
647        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "Python".to_string()], true);
648
649        let content = "javascript is a great language. python is also popular.";
650        let ctx = create_context(content);
651        let result = rule.check(&ctx).unwrap();
652
653        assert_eq!(result.len(), 2, "Should flag names at beginning of sentences");
654        assert_eq!(result[0].line, 1);
655        assert_eq!(result[0].column, 1);
656        assert_eq!(result[1].line, 1);
657        assert_eq!(result[1].column, 33);
658    }
659
660    #[test]
661    fn test_names_in_code_blocks_checked_by_default() {
662        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
663
664        let content = r#"Here is some text with JavaScript.
665
666```javascript
667// This javascript should be checked
668const lang = "javascript";
669```
670
671But this javascript should be flagged."#;
672
673        let ctx = create_context(content);
674        let result = rule.check(&ctx).unwrap();
675
676        assert_eq!(result.len(), 3, "Should flag javascript inside and outside code blocks");
677        assert_eq!(result[0].line, 4);
678        assert_eq!(result[1].line, 5);
679        assert_eq!(result[2].line, 8);
680    }
681
682    #[test]
683    fn test_names_in_code_blocks_ignored_when_disabled() {
684        let rule = MD044ProperNames::new(
685            vec!["JavaScript".to_string()],
686            false, // code_blocks = false means skip code blocks
687        );
688
689        let content = r#"```
690javascript in code block
691```"#;
692
693        let ctx = create_context(content);
694        let result = rule.check(&ctx).unwrap();
695
696        assert_eq!(
697            result.len(),
698            0,
699            "Should not flag javascript in code blocks when code_blocks is false"
700        );
701    }
702
703    #[test]
704    fn test_names_in_inline_code_checked_by_default() {
705        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
706
707        let content = "This is `javascript` in inline code and javascript outside.";
708        let ctx = create_context(content);
709        let result = rule.check(&ctx).unwrap();
710
711        // When code_blocks=true, inline code should be checked
712        assert_eq!(result.len(), 2, "Should flag javascript inside and outside inline code");
713        assert_eq!(result[0].column, 10); // javascript in inline code
714        assert_eq!(result[1].column, 41); // javascript outside
715    }
716
717    #[test]
718    fn test_multiple_names_in_same_line() {
719        let rule = MD044ProperNames::new(
720            vec!["JavaScript".to_string(), "TypeScript".to_string(), "React".to_string()],
721            true,
722        );
723
724        let content = "I use javascript, typescript, and react in my projects.";
725        let ctx = create_context(content);
726        let result = rule.check(&ctx).unwrap();
727
728        assert_eq!(result.len(), 3, "Should flag all three incorrect names");
729        assert_eq!(result[0].message, "Proper name 'javascript' should be 'JavaScript'");
730        assert_eq!(result[1].message, "Proper name 'typescript' should be 'TypeScript'");
731        assert_eq!(result[2].message, "Proper name 'react' should be 'React'");
732    }
733
734    #[test]
735    fn test_case_sensitivity() {
736        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
737
738        let content = "JAVASCRIPT, Javascript, javascript, and JavaScript variations.";
739        let ctx = create_context(content);
740        let result = rule.check(&ctx).unwrap();
741
742        assert_eq!(result.len(), 3, "Should flag all incorrect case variations");
743        // JavaScript (correct) should not be flagged
744        assert!(result.iter().all(|w| w.message.contains("should be 'JavaScript'")));
745    }
746
747    #[test]
748    fn test_configuration_with_custom_name_list() {
749        let config = MD044Config {
750            names: vec!["GitHub".to_string(), "GitLab".to_string(), "DevOps".to_string()],
751            code_blocks: true,
752            html_elements: true,
753            html_comments: true,
754        };
755        let rule = MD044ProperNames::from_config_struct(config);
756
757        let content = "We use github, gitlab, and devops for our workflow.";
758        let ctx = create_context(content);
759        let result = rule.check(&ctx).unwrap();
760
761        assert_eq!(result.len(), 3, "Should flag all custom names");
762        assert_eq!(result[0].message, "Proper name 'github' should be 'GitHub'");
763        assert_eq!(result[1].message, "Proper name 'gitlab' should be 'GitLab'");
764        assert_eq!(result[2].message, "Proper name 'devops' should be 'DevOps'");
765    }
766
767    #[test]
768    fn test_empty_configuration() {
769        let rule = MD044ProperNames::new(vec![], true);
770
771        let content = "This has javascript and typescript but no configured names.";
772        let ctx = create_context(content);
773        let result = rule.check(&ctx).unwrap();
774
775        assert!(result.is_empty(), "Should not flag anything with empty configuration");
776    }
777
778    #[test]
779    fn test_names_with_special_characters() {
780        let rule = MD044ProperNames::new(
781            vec!["Node.js".to_string(), "ASP.NET".to_string(), "C++".to_string()],
782            true,
783        );
784
785        let content = "We use nodejs, asp.net, ASP.NET, and c++ in our stack.";
786        let ctx = create_context(content);
787        let result = rule.check(&ctx).unwrap();
788
789        // nodejs should match Node.js (dotless variation)
790        // asp.net should be flagged (wrong case)
791        // ASP.NET should not be flagged (correct)
792        // c++ should be flagged
793        assert_eq!(result.len(), 3, "Should handle special characters correctly");
794
795        let messages: Vec<&str> = result.iter().map(|w| w.message.as_str()).collect();
796        assert!(messages.contains(&"Proper name 'nodejs' should be 'Node.js'"));
797        assert!(messages.contains(&"Proper name 'asp.net' should be 'ASP.NET'"));
798        assert!(messages.contains(&"Proper name 'c++' should be 'C++'"));
799    }
800
801    #[test]
802    fn test_word_boundaries() {
803        let rule = MD044ProperNames::new(vec!["Java".to_string(), "Script".to_string()], true);
804
805        let content = "JavaScript is not java or script, but Java and Script are separate.";
806        let ctx = create_context(content);
807        let result = rule.check(&ctx).unwrap();
808
809        // Should only flag lowercase "java" and "script" as separate words
810        assert_eq!(result.len(), 2, "Should respect word boundaries");
811        assert!(result.iter().any(|w| w.column == 19)); // "java" position
812        assert!(result.iter().any(|w| w.column == 27)); // "script" position
813    }
814
815    #[test]
816    fn test_fix_method() {
817        let rule = MD044ProperNames::new(
818            vec![
819                "JavaScript".to_string(),
820                "TypeScript".to_string(),
821                "Node.js".to_string(),
822            ],
823            true,
824        );
825
826        let content = "I love javascript, typescript, and nodejs!";
827        let ctx = create_context(content);
828        let fixed = rule.fix(&ctx).unwrap();
829
830        assert_eq!(fixed, "I love JavaScript, TypeScript, and Node.js!");
831    }
832
833    #[test]
834    fn test_fix_multiple_occurrences() {
835        let rule = MD044ProperNames::new(vec!["Python".to_string()], true);
836
837        let content = "python is great. I use python daily. PYTHON is powerful.";
838        let ctx = create_context(content);
839        let fixed = rule.fix(&ctx).unwrap();
840
841        assert_eq!(fixed, "Python is great. I use Python daily. Python is powerful.");
842    }
843
844    #[test]
845    fn test_fix_checks_code_blocks_by_default() {
846        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
847
848        let content = r#"I love javascript.
849
850```
851const lang = "javascript";
852```
853
854More javascript here."#;
855
856        let ctx = create_context(content);
857        let fixed = rule.fix(&ctx).unwrap();
858
859        let expected = r#"I love JavaScript.
860
861```
862const lang = "JavaScript";
863```
864
865More JavaScript here."#;
866
867        assert_eq!(fixed, expected);
868    }
869
870    #[test]
871    fn test_multiline_content() {
872        let rule = MD044ProperNames::new(vec!["Rust".to_string(), "Python".to_string()], true);
873
874        let content = r#"First line with rust.
875Second line with python.
876Third line with RUST and PYTHON."#;
877
878        let ctx = create_context(content);
879        let result = rule.check(&ctx).unwrap();
880
881        assert_eq!(result.len(), 4, "Should flag all incorrect occurrences");
882        assert_eq!(result[0].line, 1);
883        assert_eq!(result[1].line, 2);
884        assert_eq!(result[2].line, 3);
885        assert_eq!(result[3].line, 3);
886    }
887
888    #[test]
889    fn test_default_config() {
890        let config = MD044Config::default();
891        assert!(config.names.is_empty());
892        assert!(!config.code_blocks); // Default is false (skip code blocks)
893    }
894
895    #[test]
896    fn test_performance_with_many_names() {
897        let mut names = vec![];
898        for i in 0..50 {
899            names.push(format!("ProperName{i}"));
900        }
901
902        let rule = MD044ProperNames::new(names, true);
903
904        let content = "This has propername0, propername25, and propername49 incorrectly.";
905        let ctx = create_context(content);
906        let result = rule.check(&ctx).unwrap();
907
908        assert_eq!(result.len(), 3, "Should handle many configured names efficiently");
909    }
910
911    #[test]
912    fn test_large_name_count_performance() {
913        // Verify MD044 can handle large numbers of names without regex limitations
914        // This test confirms that fancy-regex handles large patterns well
915        let names = (0..1000).map(|i| format!("ProperName{i}")).collect::<Vec<_>>();
916
917        let rule = MD044ProperNames::new(names, true);
918
919        // The combined pattern should be created successfully
920        assert!(rule.combined_pattern.is_some());
921
922        // Should be able to check content without errors
923        let content = "This has propername0 and propername999 in it.";
924        let ctx = create_context(content);
925        let result = rule.check(&ctx).unwrap();
926
927        // Should detect both incorrect names
928        assert_eq!(result.len(), 2, "Should handle 1000 names without issues");
929    }
930
931    #[test]
932    fn test_cache_behavior() {
933        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
934
935        let content = "Using javascript here.";
936        let ctx = create_context(content);
937
938        // First check
939        let result1 = rule.check(&ctx).unwrap();
940        assert_eq!(result1.len(), 1);
941
942        // Second check should use cache
943        let result2 = rule.check(&ctx).unwrap();
944        assert_eq!(result2.len(), 1);
945
946        // Results should be identical
947        assert_eq!(result1[0].line, result2[0].line);
948        assert_eq!(result1[0].column, result2[0].column);
949    }
950
951    #[test]
952    fn test_html_comments_not_checked_when_disabled() {
953        let config = MD044Config {
954            names: vec!["JavaScript".to_string()],
955            code_blocks: true,    // Check code blocks
956            html_elements: true,  // Check HTML elements
957            html_comments: false, // Don't check HTML comments
958        };
959        let rule = MD044ProperNames::from_config_struct(config);
960
961        let content = r#"Regular javascript here.
962<!-- This javascript in HTML comment should be ignored -->
963More javascript outside."#;
964
965        let ctx = create_context(content);
966        let result = rule.check(&ctx).unwrap();
967
968        assert_eq!(result.len(), 2, "Should only flag javascript outside HTML comments");
969        assert_eq!(result[0].line, 1);
970        assert_eq!(result[1].line, 3);
971    }
972
973    #[test]
974    fn test_html_comments_checked_when_enabled() {
975        let config = MD044Config {
976            names: vec!["JavaScript".to_string()],
977            code_blocks: true,   // Check code blocks
978            html_elements: true, // Check HTML elements
979            html_comments: true, // Check HTML comments
980        };
981        let rule = MD044ProperNames::from_config_struct(config);
982
983        let content = r#"Regular javascript here.
984<!-- This javascript in HTML comment should be checked -->
985More javascript outside."#;
986
987        let ctx = create_context(content);
988        let result = rule.check(&ctx).unwrap();
989
990        assert_eq!(
991            result.len(),
992            3,
993            "Should flag all javascript occurrences including in HTML comments"
994        );
995    }
996
997    #[test]
998    fn test_multiline_html_comments() {
999        let config = MD044Config {
1000            names: vec!["Python".to_string(), "JavaScript".to_string()],
1001            code_blocks: true,    // Check code blocks
1002            html_elements: true,  // Check HTML elements
1003            html_comments: false, // Don't check HTML comments
1004        };
1005        let rule = MD044ProperNames::from_config_struct(config);
1006
1007        let content = r#"Regular python here.
1008<!--
1009This is a multiline comment
1010with javascript and python
1011that should be ignored
1012-->
1013More javascript outside."#;
1014
1015        let ctx = create_context(content);
1016        let result = rule.check(&ctx).unwrap();
1017
1018        assert_eq!(result.len(), 2, "Should only flag names outside HTML comments");
1019        assert_eq!(result[0].line, 1); // python
1020        assert_eq!(result[1].line, 7); // javascript
1021    }
1022
1023    #[test]
1024    fn test_fix_preserves_html_comments_when_disabled() {
1025        let config = MD044Config {
1026            names: vec!["JavaScript".to_string()],
1027            code_blocks: true,    // Check code blocks
1028            html_elements: true,  // Check HTML elements
1029            html_comments: false, // Don't check HTML comments
1030        };
1031        let rule = MD044ProperNames::from_config_struct(config);
1032
1033        let content = r#"javascript here.
1034<!-- javascript in comment -->
1035More javascript."#;
1036
1037        let ctx = create_context(content);
1038        let fixed = rule.fix(&ctx).unwrap();
1039
1040        let expected = r#"JavaScript here.
1041<!-- javascript in comment -->
1042More JavaScript."#;
1043
1044        assert_eq!(
1045            fixed, expected,
1046            "Should not fix names inside HTML comments when disabled"
1047        );
1048    }
1049
1050    #[test]
1051    fn test_proper_names_in_link_text_are_flagged() {
1052        let rule = MD044ProperNames::new(
1053            vec!["JavaScript".to_string(), "Node.js".to_string(), "Python".to_string()],
1054            true,
1055        );
1056
1057        let content = r#"Check this [javascript documentation](https://javascript.info) for info.
1058
1059Visit [node.js homepage](https://nodejs.org) and [python tutorial](https://python.org).
1060
1061Real javascript should be flagged.
1062
1063Also see the [typescript guide][ts-ref] for more.
1064
1065Real python should be flagged too.
1066
1067[ts-ref]: https://typescript.org/handbook"#;
1068
1069        let ctx = create_context(content);
1070        let result = rule.check(&ctx).unwrap();
1071
1072        // Link text should be checked, URLs should not be checked
1073        // Line 1: [javascript documentation] - "javascript" should be flagged
1074        // Line 3: [node.js homepage] - "node.js" should be flagged (matches "Node.js")
1075        // Line 3: [python tutorial] - "python" should be flagged
1076        // Line 5: standalone javascript
1077        // Line 9: standalone python
1078        assert_eq!(result.len(), 5, "Expected 5 warnings: 3 in link text + 2 standalone");
1079
1080        // Verify line numbers for link text warnings
1081        let line_1_warnings: Vec<_> = result.iter().filter(|w| w.line == 1).collect();
1082        assert_eq!(line_1_warnings.len(), 1);
1083        assert!(
1084            line_1_warnings[0]
1085                .message
1086                .contains("'javascript' should be 'JavaScript'")
1087        );
1088
1089        let line_3_warnings: Vec<_> = result.iter().filter(|w| w.line == 3).collect();
1090        assert_eq!(line_3_warnings.len(), 2); // node.js and python
1091
1092        // Standalone warnings
1093        assert!(result.iter().any(|w| w.line == 5 && w.message.contains("'javascript'")));
1094        assert!(result.iter().any(|w| w.line == 9 && w.message.contains("'python'")));
1095    }
1096
1097    #[test]
1098    fn test_link_urls_not_flagged() {
1099        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1100
1101        // URL contains "javascript" but should NOT be flagged
1102        let content = r#"[Link Text](https://javascript.info/guide)"#;
1103
1104        let ctx = create_context(content);
1105        let result = rule.check(&ctx).unwrap();
1106
1107        // URL should not be checked
1108        assert!(result.is_empty(), "URLs should not be checked for proper names");
1109    }
1110
1111    #[test]
1112    fn test_proper_names_in_image_alt_text_are_flagged() {
1113        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1114
1115        let content = r#"Here is a ![javascript logo](javascript.png "javascript icon") image.
1116
1117Real javascript should be flagged."#;
1118
1119        let ctx = create_context(content);
1120        let result = rule.check(&ctx).unwrap();
1121
1122        // Image alt text should be checked, URL and title should not be checked
1123        // Line 1: ![javascript logo] - "javascript" should be flagged
1124        // Line 3: standalone javascript
1125        assert_eq!(result.len(), 2, "Expected 2 warnings: 1 in alt text + 1 standalone");
1126        assert!(result[0].message.contains("'javascript' should be 'JavaScript'"));
1127        assert!(result[0].line == 1); // "![javascript logo]"
1128        assert!(result[1].message.contains("'javascript' should be 'JavaScript'"));
1129        assert!(result[1].line == 3); // "Real javascript should be flagged."
1130    }
1131
1132    #[test]
1133    fn test_image_urls_not_flagged() {
1134        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1135
1136        // URL contains "javascript" but should NOT be flagged
1137        let content = r#"![Logo](https://javascript.info/logo.png)"#;
1138
1139        let ctx = create_context(content);
1140        let result = rule.check(&ctx).unwrap();
1141
1142        // Image URL should not be checked
1143        assert!(result.is_empty(), "Image URLs should not be checked for proper names");
1144    }
1145
1146    #[test]
1147    fn test_reference_link_text_flagged_but_definition_not() {
1148        let rule = MD044ProperNames::new(vec!["JavaScript".to_string(), "TypeScript".to_string()], true);
1149
1150        let content = r#"Check the [javascript guide][js-ref] for details.
1151
1152Real javascript should be flagged.
1153
1154[js-ref]: https://javascript.info/typescript/guide"#;
1155
1156        let ctx = create_context(content);
1157        let result = rule.check(&ctx).unwrap();
1158
1159        // Link text should be checked, reference definitions should not
1160        // Line 1: [javascript guide] - should be flagged
1161        // Line 3: standalone javascript - should be flagged
1162        // Line 5: reference definition - should NOT be flagged
1163        assert_eq!(result.len(), 2, "Expected 2 warnings: 1 in link text + 1 standalone");
1164        assert!(result.iter().any(|w| w.line == 1 && w.message.contains("'javascript'")));
1165        assert!(result.iter().any(|w| w.line == 3 && w.message.contains("'javascript'")));
1166    }
1167
1168    #[test]
1169    fn test_reference_definitions_not_flagged() {
1170        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1171
1172        // Reference definition should NOT be flagged
1173        let content = r#"[js-ref]: https://javascript.info/guide"#;
1174
1175        let ctx = create_context(content);
1176        let result = rule.check(&ctx).unwrap();
1177
1178        // Reference definition URLs should not be checked
1179        assert!(result.is_empty(), "Reference definitions should not be checked");
1180    }
1181
1182    #[test]
1183    fn test_wikilinks_text_is_flagged() {
1184        let rule = MD044ProperNames::new(vec!["JavaScript".to_string()], true);
1185
1186        // WikiLinks [[destination]] should have their text checked
1187        let content = r#"[[javascript]]
1188
1189Regular javascript here.
1190
1191[[JavaScript|display text]]"#;
1192
1193        let ctx = create_context(content);
1194        let result = rule.check(&ctx).unwrap();
1195
1196        // Line 1: [[javascript]] - should be flagged (WikiLink text)
1197        // Line 3: standalone javascript - should be flagged
1198        // Line 5: [[JavaScript|display text]] - correct capitalization, no flag
1199        assert_eq!(result.len(), 2, "Expected 2 warnings: 1 in WikiLink + 1 standalone");
1200        assert!(
1201            result
1202                .iter()
1203                .any(|w| w.line == 1 && w.column == 3 && w.message.contains("'javascript'"))
1204        );
1205        assert!(result.iter().any(|w| w.line == 3 && w.message.contains("'javascript'")));
1206    }
1207
1208    #[test]
1209    fn test_url_link_text_not_flagged() {
1210        let rule = MD044ProperNames::new(vec!["GitHub".to_string()], true);
1211
1212        // Link text that is itself a URL should not be flagged
1213        let content = r#"[https://github.com/org/repo](https://github.com/org/repo)
1214
1215[http://github.com/org/repo](http://github.com/org/repo)
1216
1217[www.github.com/org/repo](https://www.github.com/org/repo)"#;
1218
1219        let ctx = create_context(content);
1220        let result = rule.check(&ctx).unwrap();
1221
1222        assert!(
1223            result.is_empty(),
1224            "URL-like link text should not be flagged, got: {result:?}"
1225        );
1226    }
1227
1228    #[test]
1229    fn test_url_link_text_with_leading_space_not_flagged() {
1230        let rule = MD044ProperNames::new(vec!["GitHub".to_string()], true);
1231
1232        // Leading/trailing whitespace in link text should be trimmed before URL check
1233        let content = r#"[ https://github.com/org/repo](https://github.com/org/repo)"#;
1234
1235        let ctx = create_context(content);
1236        let result = rule.check(&ctx).unwrap();
1237
1238        assert!(
1239            result.is_empty(),
1240            "URL-like link text with leading space should not be flagged, got: {result:?}"
1241        );
1242    }
1243
1244    #[test]
1245    fn test_url_link_text_uppercase_scheme_not_flagged() {
1246        let rule = MD044ProperNames::new(vec!["GitHub".to_string()], true);
1247
1248        let content = r#"[HTTPS://GITHUB.COM/org/repo](https://github.com/org/repo)"#;
1249
1250        let ctx = create_context(content);
1251        let result = rule.check(&ctx).unwrap();
1252
1253        assert!(
1254            result.is_empty(),
1255            "URL-like link text with uppercase scheme should not be flagged, got: {result:?}"
1256        );
1257    }
1258
1259    #[test]
1260    fn test_non_url_link_text_still_flagged() {
1261        let rule = MD044ProperNames::new(vec!["GitHub".to_string()], true);
1262
1263        // Link text that is NOT a URL should still be flagged
1264        let content = r#"[github.com/org/repo](https://github.com/org/repo)
1265
1266[Visit github](https://github.com/org/repo)
1267
1268[//github.com/org/repo](//github.com/org/repo)
1269
1270[ftp://github.com/org/repo](ftp://github.com/org/repo)"#;
1271
1272        let ctx = create_context(content);
1273        let result = rule.check(&ctx).unwrap();
1274
1275        assert_eq!(result.len(), 4, "Non-URL link text should be flagged, got: {result:?}");
1276        assert!(result.iter().any(|w| w.line == 1)); // github.com (no protocol)
1277        assert!(result.iter().any(|w| w.line == 3)); // Visit github
1278        assert!(result.iter().any(|w| w.line == 5)); // //github.com (protocol-relative)
1279        assert!(result.iter().any(|w| w.line == 7)); // ftp://github.com
1280    }
1281
1282    #[test]
1283    fn test_url_link_text_fix_not_applied() {
1284        let rule = MD044ProperNames::new(vec!["GitHub".to_string()], true);
1285
1286        let content = "[https://github.com/org/repo](https://github.com/org/repo)\n";
1287
1288        let ctx = create_context(content);
1289        let result = rule.fix(&ctx).unwrap();
1290
1291        assert_eq!(result, content, "Fix should not modify URL-like link text");
1292    }
1293
1294    #[test]
1295    fn test_mixed_url_and_regular_link_text() {
1296        let rule = MD044ProperNames::new(vec!["GitHub".to_string()], true);
1297
1298        // Mix of URL link text (should skip) and regular text (should flag)
1299        let content = r#"[https://github.com/org/repo](https://github.com/org/repo)
1300
1301Visit [github documentation](https://github.com/docs) for details.
1302
1303[www.github.com/pricing](https://www.github.com/pricing)"#;
1304
1305        let ctx = create_context(content);
1306        let result = rule.check(&ctx).unwrap();
1307
1308        // Only line 3 should be flagged ("github documentation" is not a URL)
1309        assert_eq!(
1310            result.len(),
1311            1,
1312            "Only non-URL link text should be flagged, got: {result:?}"
1313        );
1314        assert_eq!(result[0].line, 3);
1315    }
1316}
rumdl_lib/rules/md044_proper_names.rs

rumdl_lib/rules/
md044_proper_names.rs