quickmark_core/rules/
md033.rs

1use once_cell::sync::Lazy;
2use regex::Regex;
3use serde::Deserialize;
4use std::{collections::HashSet, rc::Rc};
5use tree_sitter::Node;
6
7use crate::{
8    linter::{range_from_tree_sitter, Context, RuleLinter, RuleViolation},
9    rules::{Rule, RuleType},
10};
11
12// MD033-specific configuration types
13#[derive(Debug, PartialEq, Clone, Deserialize, Default)]
14pub struct MD033InlineHtmlTable {
15    #[serde(default)]
16    pub allowed_elements: Vec<String>,
17}
18
19// Memoized regex patterns for HTML tag detection
20static HTML_TAG_REGEX: Lazy<Regex> = Lazy::new(|| {
21    Regex::new(r"<(/?)([a-zA-Z][a-zA-Z0-9]*)[^>]*/?>").expect("Invalid HTML tag regex")
22});
23
24static CODE_SPAN_REGEX: Lazy<Regex> =
25    Lazy::new(|| Regex::new(r"`[^`]*`").expect("Invalid code span regex"));
26
27pub(crate) struct MD033Linter {
28    context: Rc<Context>,
29    violations: Vec<RuleViolation>,
30    allowed_elements: HashSet<String>,
31    line_starts: Vec<usize>,
32}
33
34impl MD033Linter {
35    pub fn new(context: Rc<Context>) -> Self {
36        // Pre-process allowed elements into a HashSet for O(1) lookups
37        let allowed_elements: HashSet<String> = context
38            .config
39            .linters
40            .settings
41            .inline_html
42            .allowed_elements
43            .iter()
44            .map(|element| element.to_lowercase())
45            .collect();
46
47        // Pre-calculate line starts for efficient line/col lookup
48        let line_starts: Vec<usize> = std::iter::once(0)
49            .chain(
50                context
51                    .document_content
52                    .borrow()
53                    .match_indices('\n')
54                    .map(|(i, _)| i + 1),
55            )
56            .collect();
57
58        Self {
59            context,
60            violations: Vec::new(),
61            allowed_elements,
62            line_starts,
63        }
64    }
65
66    fn is_allowed_element(&self, element_name: &str) -> bool {
67        // O(1) lookup in pre-computed HashSet
68        self.allowed_elements.contains(&element_name.to_lowercase())
69    }
70
71    fn is_in_code_context(&self, node: &Node) -> bool {
72        // Check if this node is inside a code span or code block
73        let mut current = node.parent();
74        while let Some(parent) = current {
75            match parent.kind() {
76                "code_span" | "fenced_code_block" | "indented_code_block" => {
77                    return true;
78                }
79                _ => {
80                    current = parent.parent();
81                }
82            }
83        }
84        false
85    }
86
87    fn byte_to_line_col(&self, byte_pos: usize) -> (usize, usize) {
88        let line = match self.line_starts.binary_search(&byte_pos) {
89            Ok(line) => line,
90            Err(line) => line - 1,
91        };
92        let line_start = self.line_starts[line];
93        let col = byte_pos - line_start;
94        (line, col)
95    }
96
97    fn process_html_in_node(&mut self, node: &Node) {
98        let start_byte = node.start_byte();
99        let end_byte = node.end_byte();
100        let content = {
101            let document_content = self.context.document_content.borrow();
102            document_content[start_byte..end_byte].to_string()
103        };
104
105        if node.kind() == "inline" {
106            // Find all code span ranges using memoized regex pattern
107            let mut code_span_ranges = Vec::new();
108            for cap in CODE_SPAN_REGEX.captures_iter(&content) {
109                let span_start = cap.get(0).unwrap().start();
110                let span_end = cap.get(0).unwrap().end();
111                code_span_ranges.push((span_start, span_end));
112            }
113            self.process_html_with_regex(node, &content, start_byte, Some(&code_span_ranges));
114        } else {
115            // For html_block nodes, process directly
116            self.process_html_with_regex(node, &content, start_byte, None);
117        }
118    }
119
120    fn process_html_with_regex(
121        &mut self,
122        _node: &Node,
123        content: &str,
124        start_byte: usize,
125        exclude_ranges: Option<&[(usize, usize)]>,
126    ) {
127        // Use memoized HTML tag regex pattern
128        for cap in HTML_TAG_REGEX.captures_iter(content) {
129            if let Some(element_name_match) = cap.get(2) {
130                let tag_start = cap.get(0).unwrap().start();
131                let tag_end = cap.get(0).unwrap().end();
132
133                // If exclude_ranges are provided, check if the tag is inside one
134                if let Some(ranges) = exclude_ranges {
135                    let mut in_excluded_range = false;
136                    for &(exclude_start, exclude_end) in ranges {
137                        if tag_start >= exclude_start && tag_end <= exclude_end {
138                            in_excluded_range = true;
139                            break;
140                        }
141                    }
142                    if in_excluded_range {
143                        continue;
144                    }
145                }
146
147                let is_closing = cap.get(1).is_some_and(|m| m.as_str() == "/");
148
149                // Skip closing tags - we only want to report opening/self-closing tags
150                if is_closing {
151                    continue;
152                }
153
154                let element_name = element_name_match.as_str();
155
156                // Check if this element is allowed
157                if !self.is_allowed_element(element_name) {
158                    // Calculate precise position of the HTML tag
159                    let tag_start_byte = start_byte + tag_start;
160                    let tag_end_byte = start_byte + tag_end;
161                    let (start_line, start_col) = self.byte_to_line_col(tag_start_byte);
162                    let (end_line, end_col) = self.byte_to_line_col(tag_end_byte);
163
164                    // Create precise tree_sitter::Range for this violation
165                    let range = range_from_tree_sitter(&tree_sitter::Range {
166                        start_byte: tag_start_byte,
167                        end_byte: tag_end_byte,
168                        start_point: tree_sitter::Point {
169                            row: start_line,
170                            column: start_col,
171                        },
172                        end_point: tree_sitter::Point {
173                            row: end_line,
174                            column: end_col,
175                        },
176                    });
177
178                    let violation = RuleViolation::new(
179                        &MD033,
180                        format!("Inline HTML [Element: {element_name}]"),
181                        self.context.file_path.clone(),
182                        range,
183                    );
184                    self.violations.push(violation);
185                }
186            }
187        }
188    }
189}
190
191impl RuleLinter for MD033Linter {
192    fn feed(&mut self, node: &Node) {
193        // Process inline and html_block nodes that may contain HTML
194        match node.kind() {
195            "inline" => {
196                // Check if this inline node is inside a code span by looking at its parent
197                if !self.is_in_code_context(node) {
198                    self.process_html_in_node(node);
199                }
200            }
201            "html_block" => {
202                // HTML blocks should always be processed unless they are in code blocks
203                // But html_block nodes are typically not inside code blocks by tree-sitter design
204                self.process_html_in_node(node);
205            }
206            _ => (),
207        }
208    }
209
210    fn finalize(&mut self) -> Vec<RuleViolation> {
211        std::mem::take(&mut self.violations)
212    }
213}
214
215pub const MD033: Rule = Rule {
216    id: "MD033",
217    alias: "no-inline-html",
218    tags: &["html"],
219    description: "Inline HTML",
220    rule_type: RuleType::Token,
221    required_nodes: &["inline", "html_block"],
222    new_linter: |context| Box::new(MD033Linter::new(context)),
223};
224
#[cfg(test)]
mod test {
    use std::path::PathBuf;

    use crate::config::{LintersSettingsTable, MD033InlineHtmlTable, RuleSeverity};
    use crate::linter::MultiRuleLinter;
    use crate::test_utils::test_helpers::test_config_with_settings;

    /// Configuration with MD033 enabled as an error and an empty allow-list.
    fn test_config_default() -> crate::config::QuickmarkConfig {
        test_config_with_allowed_elements(Vec::new())
    }

    /// Configuration with MD033 enabled as an error and the given allow-list.
    fn test_config_with_allowed_elements(
        allowed_elements: Vec<&str>,
    ) -> crate::config::QuickmarkConfig {
        let inline_html = MD033InlineHtmlTable {
            allowed_elements: allowed_elements.into_iter().map(String::from).collect(),
        };
        test_config_with_settings(
            vec![("no-inline-html", RuleSeverity::Error)],
            LintersSettingsTable {
                inline_html,
                ..Default::default()
            },
        )
    }

    /// Lints `input` as `test.md` and returns the messages of the MD033
    /// violations in the order they were reported.
    fn md033_messages(config: crate::config::QuickmarkConfig, input: &str) -> Vec<String> {
        let mut linter = MultiRuleLinter::new_for_document(PathBuf::from("test.md"), config, input);
        let violations = linter.analyze();
        violations
            .iter()
            .filter(|violation| violation.rule().id == "MD033")
            .map(|violation| violation.message().to_string())
            .collect()
    }

    /// True when at least one message contains `needle`.
    fn any_mentions(messages: &[String], needle: &str) -> bool {
        messages.iter().any(|message| message.contains(needle))
    }

    #[test]
    fn test_no_inline_html_no_violations() {
        let input = "# Regular heading

This is regular markdown with no HTML.

- List item 1
- List item 2

```text
<p>This should not trigger as it's in a code block</p>
```

Text `<code>` text (this should not trigger as it's in a code span)";

        let messages = md033_messages(test_config_default(), input);
        assert_eq!(messages.len(), 0);
    }

    #[test]
    fn test_basic_inline_html_violations() {
        let input = "# Regular heading

<h1>Inline HTML Heading</h1>

<p>More inline HTML
but this time on multiple lines
</p>

Regular text";

        let messages = md033_messages(test_config_default(), input);

        // Two opening tags are expected: <h1> first, then <p>.
        assert_eq!(messages.len(), 2);
        assert!(messages[0].contains("h1"));
        assert!(messages[1].contains("p"));
    }

    #[test]
    fn test_self_closing_tags() {
        let input = "# Heading

<hr>

<hr/>

<br />

<img src=\"test.jpg\" alt=\"test\"/>";

        let messages = md033_messages(test_config_default(), input);

        // Expected: <hr>, <hr/>, <br />, <img/>.
        assert_eq!(messages.len(), 4);
        for element in ["hr", "br", "img"] {
            assert!(any_mentions(&messages, element));
        }
    }

    #[test]
    fn test_allowed_elements() {
        let config = test_config_with_allowed_elements(vec!["h1", "p", "hr"]);
        let input = "# Regular heading

<h1>This is allowed</h1>

<h2>This is not allowed</h2>

<p>This is allowed</p>

<div>This is not allowed</div>

<hr>

<hr/>

<br/>";

        let messages = md033_messages(config, input);

        // Expected: <h2>, <div>, <br/>.
        assert_eq!(messages.len(), 3);

        // Elements outside the allow-list are reported...
        for element in ["h2", "div", "br"] {
            assert!(any_mentions(&messages, element));
        }

        // ...while the allowed ones are not.
        for element in ["h1", "p", "hr"] {
            assert!(!any_mentions(&messages, element));
        }
    }

    #[test]
    fn test_case_insensitive_allowed_elements() {
        let config = test_config_with_allowed_elements(vec!["h1", "P"]);
        let input = "# Regular heading

<h1>Lower case tag, lower case config - allowed</h1>

<H1>Upper case tag, lower case config - allowed</H1>

<p>Lower case tag, upper case config - allowed</p>

<P>Upper case tag, upper case config - allowed</P>

<h2>Not allowed</h2>";

        let messages = md033_messages(config, input);

        // Only <h2> is expected.
        assert_eq!(messages.len(), 1);
        assert!(messages[0].contains("h2"));
    }

    #[test]
    fn test_nested_html_tags() {
        let config = test_config_with_allowed_elements(vec!["h1"]);
        let input = "<h1>This <h2>is not</h2> allowed</h1>";

        let messages = md033_messages(config, input);

        // Only <h2> is expected because h1 is allowed.
        assert_eq!(messages.len(), 1);
        assert!(messages[0].contains("h2"));
    }

    #[test]
    fn test_html_in_code_blocks_ignored() {
        let input = "# Heading

```html
<h1>This should not trigger</h1>
<p>Neither should this</p>
```

    <h1>This shouldn't trigger as it's inside an indented code block</h1>

But <p>this should trigger</p>";

        let messages = md033_messages(test_config_default(), input);

        // Only the <p> outside the code blocks is expected.
        assert_eq!(messages.len(), 1);
        assert!(messages[0].contains("p"));
    }

    #[test]
    fn test_html_in_code_spans_ignored() {
        let input = "# Heading

Text `<code>` text should not trigger.

Text `<p>some text</p>` should not trigger.

But <span>this should trigger</span>.";

        let messages = md033_messages(test_config_default(), input);

        // Only <span> is expected.
        assert_eq!(messages.len(), 1);
        assert!(messages[0].contains("span"));
    }

    #[test]
    fn test_only_opening_tags_reported() {
        let input = "# Heading

<p>Opening and closing tags</p>

<div>
Content
</div>";

        let messages = md033_messages(test_config_default(), input);

        // The <p> and <div> opening tags are expected; closing tags are not reported.
        assert_eq!(messages.len(), 2);
        assert!(any_mentions(&messages, "p"));
        assert!(any_mentions(&messages, "div"));
    }
}