quickmark_core/rules/
md044.rs

1use regex::Regex;
2use serde::Deserialize;
3use std::collections::HashSet;
4use std::rc::Rc;
5
6use crate::{
7    linter::{range_from_tree_sitter, Context, RuleLinter, RuleViolation},
8    rules::{Rule, RuleType},
9};
10
11// MD044-specific configuration types
12#[derive(Debug, PartialEq, Clone, Deserialize)]
13pub struct MD044ProperNamesTable {
14    #[serde(default)]
15    pub names: Vec<String>,
16    #[serde(default)]
17    pub code_blocks: bool,
18    #[serde(default)]
19    pub html_elements: bool,
20}
21
22impl Default for MD044ProperNamesTable {
23    fn default() -> Self {
24        Self {
25            names: Vec::new(),
26            code_blocks: true,
27            html_elements: true,
28        }
29    }
30}
31
32pub(crate) struct MD044Linter {
33    context: Rc<Context>,
34    violations: Vec<RuleViolation>,
35    name_regexes: Vec<(String, Regex)>, // (original_name, compiled_regex)
36    all_names: HashSet<String>,         // Added for performance
37}
38
39impl MD044Linter {
40    pub fn new(context: Rc<Context>) -> Self {
41        let config = &context.config.linters.settings.proper_names;
42        let mut name_regexes = Vec::new();
43
44        // Use a HashSet for efficient lookups of correct names
45        let all_names: HashSet<String> = config.names.iter().cloned().collect();
46
47        // Sort names by length (longest first) to handle overlapping matches
48        let mut names = config.names.clone();
49        names.sort_by(|a, b| b.len().cmp(&a.len()).then_with(|| a.cmp(b)));
50
51        for name in names {
52            if !name.is_empty() {
53                // The original name is the "expected" name in case of a violation
54                if let Ok(regex) = create_name_regex(&name) {
55                    name_regexes.push((name, regex));
56                }
57            }
58        }
59
60        Self {
61            context,
62            violations: Vec::new(),
63            name_regexes,
64            all_names,
65        }
66    }
67
68    fn should_check_node(&self, node_kind: &str) -> bool {
69        let config = &self.context.config.linters.settings.proper_names;
70
71        match node_kind {
72            // Code blocks and inline code
73            "fenced_code_block" | "indented_code_block" | "code_span" => config.code_blocks,
74            // HTML elements
75            "html_block" | "html_inline" => config.html_elements,
76            // Regular text content
77            "text" | "paragraph" => true,
78            _ => false,
79        }
80    }
81
82    // This function is now immutable with respect to self and returns violations.
83    // This improves performance by allowing borrows of self.context in the caller (`feed`).
84    fn check_text_content(
85        &self,
86        text: &str,
87        start_line: usize,
88        start_column: usize,
89    ) -> Vec<RuleViolation> {
90        if self.name_regexes.is_empty() {
91            return Vec::new();
92        }
93
94        let mut violations = Vec::new();
95        let mut exclusion_ranges: Vec<(usize, usize)> = Vec::new(); // (start, end) byte ranges
96
97        for (expected_name, regex) in &self.name_regexes {
98            for match_result in regex.find_iter(text) {
99                let matched_text = match_result.as_str();
100                let match_start = match_result.start();
101                let match_end = match_result.end();
102
103                // Check if this range overlaps with any exclusion range
104                let overlaps = exclusion_ranges
105                    .iter()
106                    .any(|(start, end)| !(match_end <= *start || match_start >= *end));
107
108                if overlaps {
109                    continue;
110                }
111
112                // Performance: Use HashSet for O(1) average lookup and avoid String allocation.
113                if self.all_names.contains(matched_text) {
114                    // Add to exclusions even if it's valid to prevent overlaps with shorter, incorrect names
115                    exclusion_ranges.push((match_start, match_end));
116                    continue;
117                }
118
119                // Create violation range
120                let range = tree_sitter::Range {
121                    start_byte: match_start,
122                    end_byte: match_end,
123                    start_point: tree_sitter::Point {
124                        row: start_line,
125                        column: start_column + match_start,
126                    },
127                    end_point: tree_sitter::Point {
128                        row: start_line,
129                        column: start_column + match_end,
130                    },
131                };
132
133                violations.push(RuleViolation::new(
134                    &MD044,
135                    format!("Expected: {expected_name}; Actual: {matched_text}"),
136                    self.context.file_path.clone(),
137                    range_from_tree_sitter(&range),
138                ));
139
140                // Add violation range to exclusions to prevent multiple reports on the same text
141                exclusion_ranges.push((match_start, match_end));
142            }
143        }
144        violations
145    }
146}
147
148impl RuleLinter for MD044Linter {
149    fn feed(&mut self, node: &tree_sitter::Node) {
150        if !self.should_check_node(node.kind()) {
151            return;
152        }
153
154        let source = self.context.get_document_content();
155        let start_byte = node.start_byte();
156        let end_byte = node.end_byte();
157
158        if end_byte <= source.len() {
159            // Performance: Avoid allocating a new String for each node.
160            // Pass a string slice directly. This is possible because check_text_content
161            // no longer needs a mutable borrow of `self`, resolving the borrow checker conflict.
162            let text_slice = &source[start_byte..end_byte];
163            let start_line = node.start_position().row;
164            let start_column = node.start_position().column;
165
166            let new_violations = self.check_text_content(text_slice, start_line, start_column);
167            self.violations.extend(new_violations);
168        }
169    }
170
171    fn finalize(&mut self) -> Vec<RuleViolation> {
172        std::mem::take(&mut self.violations)
173    }
174}
175
176// Helper function to create a case-insensitive regex for a proper name.
177fn create_name_regex(name: &str) -> Result<Regex, regex::Error> {
178    let escaped_name = regex::escape(name);
179
180    // Word boundaries for the pattern, following original markdownlint logic.
181    // This ensures we match whole words.
182    let starts_with_word_char = name.chars().next().is_some_and(is_word_char);
183    let ends_with_word_char = name.chars().last().is_some_and(is_word_char);
184
185    let start_boundary = if starts_with_word_char { "\\b_*" } else { "" };
186    let end_boundary = if ends_with_word_char { "_*\\b" } else { "" };
187
188    // Performance: Use non-capturing groups (?:...) as we only need the full match.
189    let pattern = format!("(?i){start_boundary}{escaped_name}{end_boundary}");
190    Regex::new(&pattern)
191}
192
/// Reports whether `c` is treated as a word character when choosing word-boundary anchors:
/// an underscore, or any character for which `char::is_alphanumeric` is true.
///
/// Intended to mirror the regex `\w` class.
fn is_word_char(c: char) -> bool {
    matches!(c, '_') || c.is_alphanumeric()
}
197
198pub const MD044: Rule = Rule {
199    id: "MD044",
200    alias: "proper-names",
201    tags: &["spelling"],
202    description: "Proper names should have the correct capitalization",
203    rule_type: RuleType::Token, // Changed from Special to Token as it processes specific node types
204    required_nodes: &[
205        "text",
206        "paragraph",
207        "fenced_code_block",
208        "indented_code_block",
209        "code_span",
210        "html_block",
211        "html_inline",
212    ],
213    new_linter: |context| Box::new(MD044Linter::new(context)),
214};
215
#[cfg(test)]
mod test {
    use crate::config::{LintersSettingsTable, MD044ProperNamesTable, RuleSeverity};
    use crate::linter::MultiRuleLinter;
    use crate::test_utils::test_helpers::test_config_with_settings;
    use std::path::PathBuf;

    /// Builds a configuration in which only `proper-names` is enabled (as an error) with the
    /// given settings.
    fn test_config(
        names: Vec<String>,
        code_blocks: bool,
        html_elements: bool,
    ) -> crate::config::QuickmarkConfig {
        let proper_names = MD044ProperNamesTable {
            names,
            code_blocks,
            html_elements,
        };
        let settings = LintersSettingsTable {
            proper_names,
            ..Default::default()
        };
        test_config_with_settings(vec![("proper-names", RuleSeverity::Error)], settings)
    }

    /// Lints `input` as `test.md` and returns the message of every violation, in report order.
    fn lint(names: &[&str], code_blocks: bool, html_elements: bool, input: &str) -> Vec<String> {
        let names = names.iter().map(|&name| name.to_owned()).collect();
        let config = test_config(names, code_blocks, html_elements);

        let mut linter = MultiRuleLinter::new_for_document(PathBuf::from("test.md"), config, input);
        let violations = linter.analyze();
        violations
            .iter()
            .map(|violation| violation.message().to_string())
            .collect()
    }

    #[test]
    fn test_no_names_configured() {
        let messages = lint(&[], true, true, "This contains javascript and GitHub text.");
        assert_eq!(messages.len(), 0);
    }

    #[test]
    fn test_exact_match_no_violations() {
        let messages = lint(
            &["JavaScript", "GitHub"],
            true,
            true,
            "This text contains JavaScript and GitHub properly capitalized.",
        );
        assert_eq!(messages.len(), 0);
    }

    #[test]
    fn test_incorrect_capitalization() {
        let messages = lint(
            &["JavaScript"],
            true,
            true,
            "This text contains javascript with incorrect capitalization.",
        );
        assert_eq!(messages.len(), 1);
        assert!(messages[0].contains("Expected: JavaScript"));
        assert!(messages[0].contains("Actual: javascript"));
    }

    #[test]
    fn test_multiple_violations() {
        let messages = lint(
            &["JavaScript", "GitHub"],
            true,
            true,
            "We use javascript and github for development.",
        );
        assert_eq!(messages.len(), 2);
    }

    #[test]
    fn test_code_blocks_enabled() {
        let messages = lint(&["JavaScript"], true, true, "```\nlet x = javascript;\n```");
        assert_eq!(messages.len(), 1);
    }

    #[test]
    fn test_code_blocks_disabled() {
        let messages = lint(&["JavaScript"], false, true, "```\nlet x = javascript;\n```");
        assert_eq!(messages.len(), 0);
    }

    #[test]
    fn test_html_elements_enabled() {
        let messages = lint(&["JavaScript"], true, true, "<p>We use javascript here</p>");
        assert_eq!(messages.len(), 1);
    }

    #[test]
    fn test_html_elements_disabled() {
        let messages = lint(&["JavaScript"], true, false, "<p>We use javascript here</p>");
        assert_eq!(messages.len(), 0);
    }

    #[test]
    fn test_word_boundaries() {
        let messages = lint(
            &["JavaScript"],
            true,
            true,
            "The javascriptish language is not javascript.",
        );
        // Only the whole word "javascript" counts; "javascriptish" must not match.
        assert_eq!(messages.len(), 1);
    }

    #[test]
    fn test_sorting_by_length() {
        // Longer names are tried first, so "github" is attributed to "GitHub" and not to "git".
        let messages = lint(&["GitHub", "git"], true, true, "We use github for version control.");
        assert_eq!(messages.len(), 1);
        assert!(messages[0].contains("Expected: GitHub"));
    }

    #[test]
    fn test_mixed_case_names() {
        let messages = lint(
            &["GitHub", "github.com"],
            true,
            true,
            "Visit github.com or use GITHUB for repos.",
        );
        // "github.com" is spelled as configured; "GITHUB" should be "GitHub".
        assert_eq!(messages.len(), 1);
        assert!(messages[0].contains("Expected: GitHub"));
        assert!(messages[0].contains("Actual: GITHUB"));
    }
}