rumdl_lib/rules/
md033_no_inline_html.rs

1//!
2//! Rule MD033: No HTML tags
3//!
4//! See [docs/md033.md](../../docs/md033.md) for full documentation, configuration, and examples.
5
6use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
7use crate::utils::kramdown_utils::{is_kramdown_block_attribute, is_kramdown_extension};
8use crate::utils::regex_cache::*;
9use std::collections::HashSet;
10
11mod md033_config;
12use md033_config::MD033Config;
13
14#[derive(Clone)]
15pub struct MD033NoInlineHtml {
16    config: MD033Config,
17    allowed: HashSet<String>,
18}
19
20impl Default for MD033NoInlineHtml {
21    fn default() -> Self {
22        let config = MD033Config::default();
23        let allowed = config.allowed_set();
24        Self { config, allowed }
25    }
26}
27
28impl MD033NoInlineHtml {
29    pub fn new() -> Self {
30        Self::default()
31    }
32
33    pub fn with_allowed(allowed_vec: Vec<String>) -> Self {
34        let config = MD033Config {
35            allowed: allowed_vec.clone(),
36        };
37        let allowed = config.allowed_set();
38        Self { config, allowed }
39    }
40
41    pub fn from_config_struct(config: MD033Config) -> Self {
42        let allowed = config.allowed_set();
43        Self { config, allowed }
44    }
45
46    // Efficient check for allowed tags using HashSet (case-insensitive)
47    #[inline]
48    fn is_tag_allowed(&self, tag: &str) -> bool {
49        if self.allowed.is_empty() {
50            return false;
51        }
52        // Remove angle brackets and slashes, then split by whitespace or '>'
53        let tag = tag.trim_start_matches('<').trim_start_matches('/');
54        let tag_name = tag
55            .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
56            .next()
57            .unwrap_or("");
58        self.allowed.contains(&tag_name.to_lowercase())
59    }
60
61    // Check if a tag is an HTML comment
62    #[inline]
63    fn is_html_comment(&self, tag: &str) -> bool {
64        tag.starts_with("<!--") && tag.ends_with("-->")
65    }
66
67    // Check if a tag is likely a programming type annotation rather than HTML
68    #[inline]
69    fn is_likely_type_annotation(&self, tag: &str) -> bool {
70        // Common programming type names that are often used in generics
71        const COMMON_TYPES: &[&str] = &[
72            "string",
73            "number",
74            "any",
75            "void",
76            "null",
77            "undefined",
78            "array",
79            "promise",
80            "function",
81            "error",
82            "date",
83            "regexp",
84            "symbol",
85            "bigint",
86            "map",
87            "set",
88            "weakmap",
89            "weakset",
90            "iterator",
91            "generator",
92            "t",
93            "u",
94            "v",
95            "k",
96            "e", // Common single-letter type parameters
97            "userdata",
98            "apiresponse",
99            "config",
100            "options",
101            "params",
102            "result",
103            "response",
104            "request",
105            "data",
106            "item",
107            "element",
108            "node",
109        ];
110
111        let tag_content = tag
112            .trim_start_matches('<')
113            .trim_end_matches('>')
114            .trim_start_matches('/');
115        let tag_name = tag_content
116            .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
117            .next()
118            .unwrap_or("");
119
120        // Check if it's a simple tag (no attributes) with a common type name
121        if !tag_content.contains(' ') && !tag_content.contains('=') {
122            COMMON_TYPES.contains(&tag_name.to_ascii_lowercase().as_str())
123        } else {
124            false
125        }
126    }
127
128    // Check if a tag is actually an email address in angle brackets
129    #[inline]
130    fn is_email_address(&self, tag: &str) -> bool {
131        let content = tag.trim_start_matches('<').trim_end_matches('>');
132        // Simple email pattern: contains @ and has reasonable structure
133        content.contains('@')
134            && content.chars().all(|c| c.is_alphanumeric() || "@.-_+".contains(c))
135            && content.split('@').count() == 2
136            && content.split('@').all(|part| !part.is_empty())
137    }
138
139    // Check if a tag has the markdown attribute (MkDocs/Material for MkDocs)
140    #[inline]
141    fn has_markdown_attribute(&self, tag: &str) -> bool {
142        // Check for various forms of markdown attribute
143        // Examples: <div markdown>, <div markdown="1">, <div class="result" markdown>
144        tag.contains(" markdown>") || tag.contains(" markdown=") || tag.contains(" markdown ")
145    }
146
147    // Check if a tag is actually a URL in angle brackets
148    #[inline]
149    fn is_url_in_angle_brackets(&self, tag: &str) -> bool {
150        let content = tag.trim_start_matches('<').trim_end_matches('>');
151        // Check for common URL schemes
152        content.starts_with("http://")
153            || content.starts_with("https://")
154            || content.starts_with("ftp://")
155            || content.starts_with("ftps://")
156            || content.starts_with("mailto:")
157    }
158
159    /// Calculate fix to remove HTML tags while keeping content
160    ///
161    /// For self-closing tags like `<br/>`, returns a single fix to remove the tag.
162    /// For paired tags like `<span>text</span>`, returns the replacement text (just the content).
163    ///
164    /// Returns (range, replacement_text) where range is the bytes to replace
165    /// and replacement_text is what to put there (content without tags, or empty for self-closing).
166    fn calculate_fix(
167        &self,
168        content: &str,
169        opening_tag: &str,
170        tag_byte_start: usize,
171    ) -> Option<(std::ops::Range<usize>, String)> {
172        // Check if it's a self-closing tag (ends with />)
173        if opening_tag.ends_with("/>") {
174            return Some((tag_byte_start..tag_byte_start + opening_tag.len(), String::new()));
175        }
176
177        // Extract tag name from opening tag (e.g., "<div>" -> "div", "<span class='x'>" -> "span")
178        let tag_name = opening_tag
179            .trim_start_matches('<')
180            .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
181            .next()?
182            .to_lowercase();
183
184        // Build the closing tag pattern
185        let closing_tag = format!("</{tag_name}>");
186
187        // Search for the closing tag after the opening tag
188        let search_start = tag_byte_start + opening_tag.len();
189        if let Some(closing_pos) = content[search_start..].find(&closing_tag) {
190            let closing_byte_start = search_start + closing_pos;
191            let closing_byte_end = closing_byte_start + closing_tag.len();
192
193            // Extract the content between tags
194            let inner_content = &content[search_start..closing_byte_start];
195
196            return Some((tag_byte_start..closing_byte_end, inner_content.to_string()));
197        }
198
199        // If no closing tag found, just remove the opening tag
200        Some((tag_byte_start..tag_byte_start + opening_tag.len(), String::new()))
201    }
202}
203
204impl Rule for MD033NoInlineHtml {
205    fn name(&self) -> &'static str {
206        "MD033"
207    }
208
209    fn description(&self) -> &'static str {
210        "Inline HTML is not allowed"
211    }
212
213    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
214        let content = ctx.content;
215
216        // Early return: if no HTML tags at all, skip processing
217        if content.is_empty() || !ctx.likely_has_html() {
218            return Ok(Vec::new());
219        }
220
221        // Quick check for HTML tag pattern before expensive processing
222        if !HTML_TAG_QUICK_CHECK.is_match(content) {
223            return Ok(Vec::new());
224        }
225
226        let mut warnings = Vec::new();
227        let lines: Vec<&str> = content.lines().collect();
228
229        // Track nomarkdown and comment blocks (Kramdown extension)
230        let mut in_nomarkdown = false;
231        let mut in_comment = false;
232        let mut nomarkdown_ranges: Vec<(usize, usize)> = Vec::new();
233        let mut nomarkdown_start = 0;
234        let mut comment_start = 0;
235
236        for (i, line) in lines.iter().enumerate() {
237            let line_num = i + 1;
238
239            // Check for nomarkdown start
240            if line.trim() == "{::nomarkdown}" {
241                in_nomarkdown = true;
242                nomarkdown_start = line_num;
243            } else if line.trim() == "{:/nomarkdown}" && in_nomarkdown {
244                in_nomarkdown = false;
245                nomarkdown_ranges.push((nomarkdown_start, line_num));
246            }
247
248            // Check for comment blocks
249            if line.trim() == "{::comment}" {
250                in_comment = true;
251                comment_start = line_num;
252            } else if line.trim() == "{:/comment}" && in_comment {
253                in_comment = false;
254                nomarkdown_ranges.push((comment_start, line_num));
255            }
256        }
257
258        // Use centralized HTML parser to get all HTML tags (including multiline)
259        let html_tags = ctx.html_tags();
260
261        for html_tag in html_tags.iter() {
262            // Skip closing tags (only warn on opening tags)
263            if html_tag.is_closing {
264                continue;
265            }
266
267            let line_num = html_tag.line;
268            let tag_byte_start = html_tag.byte_offset;
269
270            // Reconstruct tag string from byte offsets
271            let tag = &content[html_tag.byte_offset..html_tag.byte_end];
272
273            // Skip tags in code blocks
274            if ctx.line_info(line_num).is_some_and(|info| info.in_code_block) {
275                continue;
276            }
277
278            // Skip lines that are indented code blocks (4+ spaces or tab)
279            if let Some(line) = lines.get(line_num.saturating_sub(1)) {
280                if line.starts_with("    ") || line.starts_with('\t') {
281                    continue;
282                }
283
284                // Skip Kramdown extensions and block attributes
285                if is_kramdown_extension(line) || is_kramdown_block_attribute(line) {
286                    continue;
287                }
288            }
289
290            // Skip lines inside nomarkdown blocks
291            if nomarkdown_ranges
292                .iter()
293                .any(|(start, end)| line_num >= *start && line_num <= *end)
294            {
295                continue;
296            }
297
298            // Skip HTML tags inside HTML comments
299            if ctx.is_in_html_comment(tag_byte_start) {
300                continue;
301            }
302
303            // Skip HTML comments themselves
304            if self.is_html_comment(tag) {
305                continue;
306            }
307
308            // Skip JSX components in MDX files (e.g., <Chart />, <MyComponent>)
309            if ctx.flavor.supports_jsx() && html_tag.tag_name.chars().next().is_some_and(|c| c.is_uppercase()) {
310                continue;
311            }
312
313            // Skip likely programming type annotations
314            if self.is_likely_type_annotation(tag) {
315                continue;
316            }
317
318            // Skip email addresses in angle brackets
319            if self.is_email_address(tag) {
320                continue;
321            }
322
323            // Skip URLs in angle brackets
324            if self.is_url_in_angle_brackets(tag) {
325                continue;
326            }
327
328            // Skip tags inside code spans
329            let tag_start_col = html_tag.start_col + 1; // Convert to 1-indexed
330            if ctx.is_in_code_span(line_num, tag_start_col) {
331                continue;
332            }
333
334            // Skip allowed tags
335            if self.is_tag_allowed(tag) {
336                continue;
337            }
338
339            // Skip tags with markdown attribute in MkDocs mode
340            if ctx.flavor == crate::config::MarkdownFlavor::MkDocs && self.has_markdown_attribute(tag) {
341                continue;
342            }
343
344            // Calculate fix to remove HTML tags but keep content
345            let fix = self
346                .calculate_fix(content, tag, tag_byte_start)
347                .map(|(range, replacement)| Fix { range, replacement });
348
349            // Report the HTML tag
350            warnings.push(LintWarning {
351                rule_name: Some(self.name().to_string()),
352                line: line_num,
353                column: html_tag.start_col + 1,   // Convert to 1-indexed
354                end_line: line_num,               // TODO: calculate actual end line for multiline tags
355                end_column: html_tag.end_col + 1, // Convert to 1-indexed
356                message: format!("Inline HTML found: {tag}"),
357                severity: Severity::Warning,
358                fix,
359            });
360        }
361
362        Ok(warnings)
363    }
364
365    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
366        // No fix for MD033: do not remove or alter HTML, just return the input unchanged
367        Ok(ctx.content.to_string())
368    }
369
370    fn fix_capability(&self) -> crate::rule::FixCapability {
371        crate::rule::FixCapability::Unfixable
372    }
373
374    /// Get the category of this rule for selective processing
375    fn category(&self) -> RuleCategory {
376        RuleCategory::Html
377    }
378
379    /// Check if this rule should be skipped
380    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
381        ctx.content.is_empty() || !ctx.likely_has_html()
382    }
383
384    fn as_any(&self) -> &dyn std::any::Any {
385        self
386    }
387
388    fn default_config_section(&self) -> Option<(String, toml::Value)> {
389        let json_value = serde_json::to_value(&self.config).ok()?;
390        Some((
391            self.name().to_string(),
392            crate::rule_config_serde::json_to_toml_value(&json_value)?,
393        ))
394    }
395
396    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
397    where
398        Self: Sized,
399    {
400        let rule_config = crate::rule_config_serde::load_rule_config::<MD033Config>(config);
401        Box::new(Self::from_config_struct(rule_config))
402    }
403}
404
405#[cfg(test)]
406mod tests {
407    use super::*;
408    use crate::lint_context::LintContext;
409    use crate::rule::Rule;
410
411    #[test]
412    fn test_md033_basic_html() {
413        let rule = MD033NoInlineHtml::default();
414        let content = "<div>Some content</div>";
415        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
416        let result = rule.check(&ctx).unwrap();
417        // Only reports opening tags, not closing tags
418        assert_eq!(result.len(), 1); // Only <div>, not </div>
419        assert!(result[0].message.starts_with("Inline HTML found: <div>"));
420    }
421
422    #[test]
423    fn test_md033_case_insensitive() {
424        let rule = MD033NoInlineHtml::default();
425        let content = "<DiV>Some <B>content</B></dIv>";
426        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
427        let result = rule.check(&ctx).unwrap();
428        // Only reports opening tags, not closing tags
429        assert_eq!(result.len(), 2); // <DiV>, <B> (not </B>, </dIv>)
430        assert_eq!(result[0].message, "Inline HTML found: <DiV>");
431        assert_eq!(result[1].message, "Inline HTML found: <B>");
432    }
433
434    #[test]
435    fn test_md033_allowed_tags() {
436        let rule = MD033NoInlineHtml::with_allowed(vec!["div".to_string(), "br".to_string()]);
437        let content = "<div>Allowed</div><p>Not allowed</p><br/>";
438        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
439        let result = rule.check(&ctx).unwrap();
440        // Only warnings for non-allowed opening tags (<p> only, div and br are allowed)
441        assert_eq!(result.len(), 1);
442        assert_eq!(result[0].message, "Inline HTML found: <p>");
443
444        // Test case-insensitivity of allowed tags
445        let content2 = "<DIV>Allowed</DIV><P>Not allowed</P><BR/>";
446        let ctx2 = LintContext::new(content2, crate::config::MarkdownFlavor::Standard);
447        let result2 = rule.check(&ctx2).unwrap();
448        assert_eq!(result2.len(), 1); // Only <P> flagged
449        assert_eq!(result2[0].message, "Inline HTML found: <P>");
450    }
451
452    #[test]
453    fn test_md033_html_comments() {
454        let rule = MD033NoInlineHtml::default();
455        let content = "<!-- This is a comment --> <p>Not a comment</p>";
456        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
457        let result = rule.check(&ctx).unwrap();
458        // Should detect warnings for HTML opening tags (comments are skipped, closing tags not reported)
459        assert_eq!(result.len(), 1); // Only <p>
460        assert_eq!(result[0].message, "Inline HTML found: <p>");
461    }
462
463    #[test]
464    fn test_md033_tags_in_links() {
465        let rule = MD033NoInlineHtml::default();
466        let content = "[Link](http://example.com/<div>)";
467        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
468        let result = rule.check(&ctx).unwrap();
469        // The <div> in the URL should be detected as HTML (not skipped)
470        assert_eq!(result.len(), 1);
471        assert_eq!(result[0].message, "Inline HTML found: <div>");
472
473        let content2 = "[Link <a>text</a>](url)";
474        let ctx2 = LintContext::new(content2, crate::config::MarkdownFlavor::Standard);
475        let result2 = rule.check(&ctx2).unwrap();
476        // Only reports opening tags
477        assert_eq!(result2.len(), 1); // Only <a>
478        assert_eq!(result2[0].message, "Inline HTML found: <a>");
479    }
480
481    #[test]
482    fn test_md033_fix_escaping() {
483        let rule = MD033NoInlineHtml::default();
484        let content = "Text with <div> and <br/> tags.";
485        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
486        let fixed_content = rule.fix(&ctx).unwrap();
487        // No fix for HTML tags; output should be unchanged
488        assert_eq!(fixed_content, content);
489    }
490
491    #[test]
492    fn test_md033_in_code_blocks() {
493        let rule = MD033NoInlineHtml::default();
494        let content = "```html\n<div>Code</div>\n```\n<div>Not code</div>";
495        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
496        let result = rule.check(&ctx).unwrap();
497        // Only reports opening tags outside code block
498        assert_eq!(result.len(), 1); // Only <div> outside code block
499        assert_eq!(result[0].message, "Inline HTML found: <div>");
500    }
501
502    #[test]
503    fn test_md033_in_code_spans() {
504        let rule = MD033NoInlineHtml::default();
505        let content = "Text with `<p>in code</p>` span. <br/> Not in span.";
506        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
507        let result = rule.check(&ctx).unwrap();
508        // Should detect <br/> outside code span, but not tags inside code span
509        assert_eq!(result.len(), 1);
510        assert_eq!(result[0].message, "Inline HTML found: <br/>");
511    }
512
513    #[test]
514    fn test_md033_issue_90_code_span_with_diff_block() {
515        // Test for issue #90: inline code span followed by diff code block
516        let rule = MD033NoInlineHtml::default();
517        let content = r#"# Heading
518
519`<env>`
520
521```diff
522- this
523+ that
524```"#;
525        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
526        let result = rule.check(&ctx).unwrap();
527        // Should NOT detect <env> as HTML since it's inside backticks
528        assert_eq!(result.len(), 0, "Should not report HTML tags inside code spans");
529    }
530
531    #[test]
532    fn test_md033_multiple_code_spans_with_angle_brackets() {
533        // Test multiple code spans on same line
534        let rule = MD033NoInlineHtml::default();
535        let content = "`<one>` and `<two>` and `<three>` are all code spans";
536        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
537        let result = rule.check(&ctx).unwrap();
538        assert_eq!(result.len(), 0, "Should not report HTML tags inside any code spans");
539    }
540
541    #[test]
542    fn test_md033_nested_angle_brackets_in_code_span() {
543        // Test nested angle brackets
544        let rule = MD033NoInlineHtml::default();
545        let content = "Text with `<<nested>>` brackets";
546        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
547        let result = rule.check(&ctx).unwrap();
548        assert_eq!(result.len(), 0, "Should handle nested angle brackets in code spans");
549    }
550
551    #[test]
552    fn test_md033_code_span_at_end_before_code_block() {
553        // Test code span at end of line before code block
554        let rule = MD033NoInlineHtml::default();
555        let content = "Testing `<test>`\n```\ncode here\n```";
556        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
557        let result = rule.check(&ctx).unwrap();
558        assert_eq!(result.len(), 0, "Should handle code span before code block");
559    }
560
561    #[test]
562    fn test_md033_quick_fix_inline_tag() {
563        // Test Quick Fix for inline HTML tags - keeps content, removes tags
564        let rule = MD033NoInlineHtml::default();
565        let content = "This has <span>inline text</span> that should keep content.";
566        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
567        let result = rule.check(&ctx).unwrap();
568
569        assert_eq!(result.len(), 1, "Should find one HTML tag");
570        assert!(result[0].fix.is_some(), "Should have a fix");
571
572        let fix = result[0].fix.as_ref().unwrap();
573        assert_eq!(&content[fix.range.clone()], "<span>inline text</span>");
574        assert_eq!(fix.replacement, "inline text");
575    }
576
577    #[test]
578    fn test_md033_quick_fix_multiline_tag() {
579        // Test Quick Fix for multiline HTML tags - keeps content
580        let rule = MD033NoInlineHtml::default();
581        let content = "<div>\nBlock content\n</div>";
582        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
583        let result = rule.check(&ctx).unwrap();
584
585        assert_eq!(result.len(), 1, "Should find one HTML tag");
586        assert!(result[0].fix.is_some(), "Should have a fix");
587
588        let fix = result[0].fix.as_ref().unwrap();
589        assert_eq!(&content[fix.range.clone()], "<div>\nBlock content\n</div>");
590        assert_eq!(fix.replacement, "\nBlock content\n");
591    }
592
593    #[test]
594    fn test_md033_quick_fix_self_closing_tag() {
595        // Test Quick Fix for self-closing tags - removes tag (no content)
596        let rule = MD033NoInlineHtml::default();
597        let content = "Self-closing: <br/>";
598        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
599        let result = rule.check(&ctx).unwrap();
600
601        assert_eq!(result.len(), 1, "Should find one HTML tag");
602        assert!(result[0].fix.is_some(), "Should have a fix");
603
604        let fix = result[0].fix.as_ref().unwrap();
605        assert_eq!(&content[fix.range.clone()], "<br/>");
606        assert_eq!(fix.replacement, "");
607    }
608
609    #[test]
610    fn test_md033_quick_fix_multiple_tags() {
611        // Test Quick Fix with multiple HTML tags - keeps content for both
612        let rule = MD033NoInlineHtml::default();
613        let content = "<span>first</span> and <strong>second</strong>";
614        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
615        let result = rule.check(&ctx).unwrap();
616
617        assert_eq!(result.len(), 2, "Should find two HTML tags");
618        assert!(result[0].fix.is_some(), "First tag should have a fix");
619        assert!(result[1].fix.is_some(), "Second tag should have a fix");
620
621        let fix1 = result[0].fix.as_ref().unwrap();
622        assert_eq!(&content[fix1.range.clone()], "<span>first</span>");
623        assert_eq!(fix1.replacement, "first");
624
625        let fix2 = result[1].fix.as_ref().unwrap();
626        assert_eq!(&content[fix2.range.clone()], "<strong>second</strong>");
627        assert_eq!(fix2.replacement, "second");
628    }
629}