1use once_cell::sync::Lazy;
2use regex::Regex;
3use serde::Deserialize;
4use std::{collections::HashSet, rc::Rc};
5use tree_sitter::Node;
6
7use crate::{
8 linter::{range_from_tree_sitter, Context, RuleLinter, RuleViolation},
9 rules::{Rule, RuleType},
10};
11
12#[derive(Debug, PartialEq, Clone, Deserialize, Default)]
14pub struct MD033InlineHtmlTable {
15 #[serde(default)]
16 pub allowed_elements: Vec<String>,
17}
18
19static HTML_TAG_REGEX: Lazy<Regex> = Lazy::new(|| {
21 Regex::new(r"<(/?)([a-zA-Z][a-zA-Z0-9]*)[^>]*/?>").expect("Invalid HTML tag regex")
22});
23
24static CODE_SPAN_REGEX: Lazy<Regex> =
25 Lazy::new(|| Regex::new(r"`[^`]*`").expect("Invalid code span regex"));
26
27pub(crate) struct MD033Linter {
28 context: Rc<Context>,
29 violations: Vec<RuleViolation>,
30 allowed_elements: HashSet<String>,
31 line_starts: Vec<usize>,
32}
33
34impl MD033Linter {
35 pub fn new(context: Rc<Context>) -> Self {
36 let allowed_elements: HashSet<String> = context
38 .config
39 .linters
40 .settings
41 .inline_html
42 .allowed_elements
43 .iter()
44 .map(|element| element.to_lowercase())
45 .collect();
46
47 let line_starts: Vec<usize> = std::iter::once(0)
49 .chain(
50 context
51 .document_content
52 .borrow()
53 .match_indices('\n')
54 .map(|(i, _)| i + 1),
55 )
56 .collect();
57
58 Self {
59 context,
60 violations: Vec::new(),
61 allowed_elements,
62 line_starts,
63 }
64 }
65
66 fn is_allowed_element(&self, element_name: &str) -> bool {
67 self.allowed_elements.contains(&element_name.to_lowercase())
69 }
70
71 fn is_in_code_context(&self, node: &Node) -> bool {
72 let mut current = node.parent();
74 while let Some(parent) = current {
75 match parent.kind() {
76 "code_span" | "fenced_code_block" | "indented_code_block" => {
77 return true;
78 }
79 _ => {
80 current = parent.parent();
81 }
82 }
83 }
84 false
85 }
86
87 fn byte_to_line_col(&self, byte_pos: usize) -> (usize, usize) {
88 let line = match self.line_starts.binary_search(&byte_pos) {
89 Ok(line) => line,
90 Err(line) => line - 1,
91 };
92 let line_start = self.line_starts[line];
93 let col = byte_pos - line_start;
94 (line, col)
95 }
96
97 fn process_html_in_node(&mut self, node: &Node) {
98 let start_byte = node.start_byte();
99 let end_byte = node.end_byte();
100 let content = {
101 let document_content = self.context.document_content.borrow();
102 document_content[start_byte..end_byte].to_string()
103 };
104
105 if node.kind() == "inline" {
106 let mut code_span_ranges = Vec::new();
108 for cap in CODE_SPAN_REGEX.captures_iter(&content) {
109 let span_start = cap.get(0).unwrap().start();
110 let span_end = cap.get(0).unwrap().end();
111 code_span_ranges.push((span_start, span_end));
112 }
113 self.process_html_with_regex(node, &content, start_byte, Some(&code_span_ranges));
114 } else {
115 self.process_html_with_regex(node, &content, start_byte, None);
117 }
118 }
119
120 fn process_html_with_regex(
121 &mut self,
122 _node: &Node,
123 content: &str,
124 start_byte: usize,
125 exclude_ranges: Option<&[(usize, usize)]>,
126 ) {
127 for cap in HTML_TAG_REGEX.captures_iter(content) {
129 if let Some(element_name_match) = cap.get(2) {
130 let tag_start = cap.get(0).unwrap().start();
131 let tag_end = cap.get(0).unwrap().end();
132
133 if let Some(ranges) = exclude_ranges {
135 let mut in_excluded_range = false;
136 for &(exclude_start, exclude_end) in ranges {
137 if tag_start >= exclude_start && tag_end <= exclude_end {
138 in_excluded_range = true;
139 break;
140 }
141 }
142 if in_excluded_range {
143 continue;
144 }
145 }
146
147 let is_closing = cap.get(1).is_some_and(|m| m.as_str() == "/");
148
149 if is_closing {
151 continue;
152 }
153
154 let element_name = element_name_match.as_str();
155
156 if !self.is_allowed_element(element_name) {
158 let tag_start_byte = start_byte + tag_start;
160 let tag_end_byte = start_byte + tag_end;
161 let (start_line, start_col) = self.byte_to_line_col(tag_start_byte);
162 let (end_line, end_col) = self.byte_to_line_col(tag_end_byte);
163
164 let range = range_from_tree_sitter(&tree_sitter::Range {
166 start_byte: tag_start_byte,
167 end_byte: tag_end_byte,
168 start_point: tree_sitter::Point {
169 row: start_line,
170 column: start_col,
171 },
172 end_point: tree_sitter::Point {
173 row: end_line,
174 column: end_col,
175 },
176 });
177
178 let violation = RuleViolation::new(
179 &MD033,
180 format!("Inline HTML [Element: {element_name}]"),
181 self.context.file_path.clone(),
182 range,
183 );
184 self.violations.push(violation);
185 }
186 }
187 }
188 }
189}
190
191impl RuleLinter for MD033Linter {
192 fn feed(&mut self, node: &Node) {
193 match node.kind() {
195 "inline" => {
196 if !self.is_in_code_context(node) {
198 self.process_html_in_node(node);
199 }
200 }
201 "html_block" => {
202 self.process_html_in_node(node);
205 }
206 _ => (),
207 }
208 }
209
210 fn finalize(&mut self) -> Vec<RuleViolation> {
211 std::mem::take(&mut self.violations)
212 }
213}
214
215pub const MD033: Rule = Rule {
216 id: "MD033",
217 alias: "no-inline-html",
218 tags: &["html"],
219 description: "Inline HTML",
220 rule_type: RuleType::Token,
221 required_nodes: &["inline", "html_block"],
222 new_linter: |context| Box::new(MD033Linter::new(context)),
223};
224
#[cfg(test)]
mod test {
    use std::path::PathBuf;

    use crate::config::{
        LintersSettingsTable, MD033InlineHtmlTable, QuickmarkConfig, RuleSeverity,
    };
    use crate::linter::MultiRuleLinter;
    use crate::test_utils::test_helpers::test_config_with_settings;

    /// Config that enables only `no-inline-html`, with an empty allow-list.
    fn test_config_default() -> QuickmarkConfig {
        test_config_with_allowed_elements(Vec::new())
    }

    /// Config that enables only `no-inline-html`, allowing the given elements.
    fn test_config_with_allowed_elements(allowed_elements: Vec<&str>) -> QuickmarkConfig {
        let allowed_elements: Vec<String> =
            allowed_elements.into_iter().map(String::from).collect();

        test_config_with_settings(
            vec![("no-inline-html", RuleSeverity::Error)],
            LintersSettingsTable {
                inline_html: MD033InlineHtmlTable { allowed_elements },
                ..Default::default()
            },
        )
    }

    /// Lints `input` as `test.md` and returns the messages of the MD033
    /// violations, in the order the linter produced them.
    fn md033_messages(config: QuickmarkConfig, input: &str) -> Vec<String> {
        let mut linter = MultiRuleLinter::new_for_document(PathBuf::from("test.md"), config, input);
        let violations = linter.analyze();
        violations
            .iter()
            .filter(|violation| violation.rule().id == "MD033")
            .map(|violation| violation.message().to_string())
            .collect()
    }

    /// True when at least one message mentions `needle`.
    fn any_mentions(messages: &[String], needle: &str) -> bool {
        messages.iter().any(|message| message.contains(needle))
    }

    #[test]
    fn test_no_inline_html_no_violations() {
        let input = "# Regular heading

This is regular markdown with no HTML.

- List item 1
- List item 2

```text
<p>This should not trigger as it's in a code block</p>
```

Text `<code>` text (this should not trigger as it's in a code span)";

        let messages = md033_messages(test_config_default(), input);

        assert_eq!(messages.len(), 0);
    }

    #[test]
    fn test_basic_inline_html_violations() {
        let input = "# Regular heading

<h1>Inline HTML Heading</h1>

<p>More inline HTML
but this time on multiple lines
</p>

Regular text";

        let messages = md033_messages(test_config_default(), input);

        assert_eq!(messages.len(), 2);
        assert!(messages[0].contains("h1"));
        assert!(messages[1].contains("p"));
    }

    #[test]
    fn test_self_closing_tags() {
        let input = "# Heading

<hr>

<hr/>

<br />

<img src=\"test.jpg\" alt=\"test\"/>";

        let messages = md033_messages(test_config_default(), input);

        assert_eq!(messages.len(), 4);
        for expected in ["hr", "br", "img"] {
            assert!(any_mentions(&messages, expected));
        }
    }

    #[test]
    fn test_allowed_elements() {
        let config = test_config_with_allowed_elements(vec!["h1", "p", "hr"]);
        let input = "# Regular heading

<h1>This is allowed</h1>

<h2>This is not allowed</h2>

<p>This is allowed</p>

<div>This is not allowed</div>

<hr>

<hr/>

<br/>";

        let messages = md033_messages(config, input);

        assert_eq!(messages.len(), 3);

        // Elements outside the allow-list are reported...
        for reported in ["h2", "div", "br"] {
            assert!(any_mentions(&messages, reported));
        }
        // ...while the allowed ones never show up.
        for allowed in ["h1", "p", "hr"] {
            assert!(!any_mentions(&messages, allowed));
        }
    }

    #[test]
    fn test_case_insensitive_allowed_elements() {
        let config = test_config_with_allowed_elements(vec!["h1", "P"]);
        let input = "# Regular heading

<h1>Lower case tag, lower case config - allowed</h1>

<H1>Upper case tag, lower case config - allowed</H1>

<p>Lower case tag, upper case config - allowed</p>

<P>Upper case tag, upper case config - allowed</P>

<h2>Not allowed</h2>";

        let messages = md033_messages(config, input);

        assert_eq!(messages.len(), 1);
        assert!(messages[0].contains("h2"));
    }

    #[test]
    fn test_nested_html_tags() {
        let config = test_config_with_allowed_elements(vec!["h1"]);
        let input = "<h1>This <h2>is not</h2> allowed</h1>";

        let messages = md033_messages(config, input);

        assert_eq!(messages.len(), 1);
        assert!(messages[0].contains("h2"));
    }

    #[test]
    fn test_html_in_code_blocks_ignored() {
        let input = "# Heading

```html
<h1>This should not trigger</h1>
<p>Neither should this</p>
```

    <h1>This shouldn't trigger as it's inside an indented code block</h1>

But <p>this should trigger</p>";

        let messages = md033_messages(test_config_default(), input);

        assert_eq!(messages.len(), 1);
        assert!(messages[0].contains("p"));
    }

    #[test]
    fn test_html_in_code_spans_ignored() {
        let input = "# Heading

Text `<code>` text should not trigger.

Text `<p>some text</p>` should not trigger.

But <span>this should trigger</span>.";

        let messages = md033_messages(test_config_default(), input);

        assert_eq!(messages.len(), 1);
        assert!(messages[0].contains("span"));
    }

    #[test]
    fn test_only_opening_tags_reported() {
        let input = "# Heading

<p>Opening and closing tags</p>

<div>
Content
</div>";

        let messages = md033_messages(test_config_default(), input);

        assert_eq!(messages.len(), 2);
        assert!(any_mentions(&messages, "p"));
        assert!(any_mentions(&messages, "div"));
    }
}