rumdl_lib/rules/
md033_no_inline_html.rs

1//!
2//! Rule MD033: No HTML tags
3//!
4//! See [docs/md033.md](../../docs/md033.md) for full documentation, configuration, and examples.
5
6use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
7use crate::utils::kramdown_utils::{is_kramdown_block_attribute, is_kramdown_extension};
8use crate::utils::regex_cache::*;
9use std::collections::HashSet;
10
11mod md033_config;
12use md033_config::MD033Config;
13
14#[derive(Clone)]
15pub struct MD033NoInlineHtml {
16    config: MD033Config,
17    allowed: HashSet<String>,
18}
19
20impl Default for MD033NoInlineHtml {
21    fn default() -> Self {
22        let config = MD033Config::default();
23        let allowed = config.allowed_set();
24        Self { config, allowed }
25    }
26}
27
28impl MD033NoInlineHtml {
29    pub fn new() -> Self {
30        Self::default()
31    }
32
33    pub fn with_allowed(allowed_vec: Vec<String>) -> Self {
34        let config = MD033Config {
35            allowed: allowed_vec.clone(),
36        };
37        let allowed = config.allowed_set();
38        Self { config, allowed }
39    }
40
41    pub fn from_config_struct(config: MD033Config) -> Self {
42        let allowed = config.allowed_set();
43        Self { config, allowed }
44    }
45
46    // Efficient check for allowed tags using HashSet (case-insensitive)
47    #[inline]
48    fn is_tag_allowed(&self, tag: &str) -> bool {
49        if self.allowed.is_empty() {
50            return false;
51        }
52        // Remove angle brackets and slashes, then split by whitespace or '>'
53        let tag = tag.trim_start_matches('<').trim_start_matches('/');
54        let tag_name = tag
55            .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
56            .next()
57            .unwrap_or("");
58        self.allowed.contains(&tag_name.to_lowercase())
59    }
60
61    // Check if a tag is an HTML comment
62    #[inline]
63    fn is_html_comment(&self, tag: &str) -> bool {
64        tag.starts_with("<!--") && tag.ends_with("-->")
65    }
66
67    // Check if a tag is likely a programming type annotation rather than HTML
68    #[inline]
69    fn is_likely_type_annotation(&self, tag: &str) -> bool {
70        // Common programming type names that are often used in generics
71        const COMMON_TYPES: &[&str] = &[
72            "string",
73            "number",
74            "any",
75            "void",
76            "null",
77            "undefined",
78            "array",
79            "promise",
80            "function",
81            "error",
82            "date",
83            "regexp",
84            "symbol",
85            "bigint",
86            "map",
87            "set",
88            "weakmap",
89            "weakset",
90            "iterator",
91            "generator",
92            "t",
93            "u",
94            "v",
95            "k",
96            "e", // Common single-letter type parameters
97            "userdata",
98            "apiresponse",
99            "config",
100            "options",
101            "params",
102            "result",
103            "response",
104            "request",
105            "data",
106            "item",
107            "element",
108            "node",
109        ];
110
111        let tag_content = tag
112            .trim_start_matches('<')
113            .trim_end_matches('>')
114            .trim_start_matches('/');
115        let tag_name = tag_content
116            .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
117            .next()
118            .unwrap_or("");
119
120        // Check if it's a simple tag (no attributes) with a common type name
121        if !tag_content.contains(' ') && !tag_content.contains('=') {
122            COMMON_TYPES.contains(&tag_name.to_ascii_lowercase().as_str())
123        } else {
124            false
125        }
126    }
127
128    // Check if a tag is actually an email address in angle brackets
129    #[inline]
130    fn is_email_address(&self, tag: &str) -> bool {
131        let content = tag.trim_start_matches('<').trim_end_matches('>');
132        // Simple email pattern: contains @ and has reasonable structure
133        content.contains('@')
134            && content.chars().all(|c| c.is_alphanumeric() || "@.-_+".contains(c))
135            && content.split('@').count() == 2
136            && content.split('@').all(|part| !part.is_empty())
137    }
138
139    // Check if a tag has the markdown attribute (MkDocs/Material for MkDocs)
140    #[inline]
141    fn has_markdown_attribute(&self, tag: &str) -> bool {
142        // Check for various forms of markdown attribute
143        // Examples: <div markdown>, <div markdown="1">, <div class="result" markdown>
144        tag.contains(" markdown>") || tag.contains(" markdown=") || tag.contains(" markdown ")
145    }
146
147    // Check if a tag is actually a URL in angle brackets
148    #[inline]
149    fn is_url_in_angle_brackets(&self, tag: &str) -> bool {
150        let content = tag.trim_start_matches('<').trim_end_matches('>');
151        // Check for common URL schemes
152        content.starts_with("http://")
153            || content.starts_with("https://")
154            || content.starts_with("ftp://")
155            || content.starts_with("ftps://")
156            || content.starts_with("mailto:")
157    }
158
159    /// Calculate fix to remove HTML tags while keeping content
160    ///
161    /// For self-closing tags like `<br/>`, returns a single fix to remove the tag.
162    /// For paired tags like `<span>text</span>`, returns the replacement text (just the content).
163    ///
164    /// Returns (range, replacement_text) where range is the bytes to replace
165    /// and replacement_text is what to put there (content without tags, or empty for self-closing).
166    fn calculate_fix(
167        &self,
168        content: &str,
169        opening_tag: &str,
170        tag_byte_start: usize,
171    ) -> Option<(std::ops::Range<usize>, String)> {
172        // Check if it's a self-closing tag (ends with />)
173        if opening_tag.ends_with("/>") {
174            return Some((tag_byte_start..tag_byte_start + opening_tag.len(), String::new()));
175        }
176
177        // Extract tag name from opening tag (e.g., "<div>" -> "div", "<span class='x'>" -> "span")
178        let tag_name = opening_tag
179            .trim_start_matches('<')
180            .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
181            .next()?
182            .to_lowercase();
183
184        // Build the closing tag pattern
185        let closing_tag = format!("</{tag_name}>");
186
187        // Search for the closing tag after the opening tag
188        let search_start = tag_byte_start + opening_tag.len();
189        if let Some(closing_pos) = content[search_start..].find(&closing_tag) {
190            let closing_byte_start = search_start + closing_pos;
191            let closing_byte_end = closing_byte_start + closing_tag.len();
192
193            // Extract the content between tags
194            let inner_content = &content[search_start..closing_byte_start];
195
196            return Some((tag_byte_start..closing_byte_end, inner_content.to_string()));
197        }
198
199        // If no closing tag found, just remove the opening tag
200        Some((tag_byte_start..tag_byte_start + opening_tag.len(), String::new()))
201    }
202}
203
204impl Rule for MD033NoInlineHtml {
205    fn name(&self) -> &'static str {
206        "MD033"
207    }
208
209    fn description(&self) -> &'static str {
210        "Inline HTML is not allowed"
211    }
212
213    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
214        let content = ctx.content;
215
216        // Early return: if no HTML tags at all, skip processing
217        if content.is_empty() || !ctx.likely_has_html() {
218            return Ok(Vec::new());
219        }
220
221        // Quick check for HTML tag pattern before expensive processing
222        if !HTML_TAG_QUICK_CHECK.is_match(content) {
223            return Ok(Vec::new());
224        }
225
226        let mut warnings = Vec::new();
227        let lines: Vec<&str> = content.lines().collect();
228
229        // Track nomarkdown and comment blocks (Kramdown extension)
230        let mut in_nomarkdown = false;
231        let mut in_comment = false;
232        let mut nomarkdown_ranges: Vec<(usize, usize)> = Vec::new();
233        let mut nomarkdown_start = 0;
234        let mut comment_start = 0;
235
236        for (i, line) in lines.iter().enumerate() {
237            let line_num = i + 1;
238
239            // Check for nomarkdown start
240            if line.trim() == "{::nomarkdown}" {
241                in_nomarkdown = true;
242                nomarkdown_start = line_num;
243            } else if line.trim() == "{:/nomarkdown}" && in_nomarkdown {
244                in_nomarkdown = false;
245                nomarkdown_ranges.push((nomarkdown_start, line_num));
246            }
247
248            // Check for comment blocks
249            if line.trim() == "{::comment}" {
250                in_comment = true;
251                comment_start = line_num;
252            } else if line.trim() == "{:/comment}" && in_comment {
253                in_comment = false;
254                nomarkdown_ranges.push((comment_start, line_num));
255            }
256        }
257
258        // Use centralized HTML parser to get all HTML tags (including multiline)
259        let html_tags = ctx.html_tags();
260
261        for html_tag in html_tags.iter() {
262            // Skip closing tags (only warn on opening tags)
263            if html_tag.is_closing {
264                continue;
265            }
266
267            let line_num = html_tag.line;
268            let tag_byte_start = html_tag.byte_offset;
269
270            // Reconstruct tag string from byte offsets
271            let tag = &content[html_tag.byte_offset..html_tag.byte_end];
272
273            // Skip tags in code blocks
274            if ctx.line_info(line_num).is_some_and(|info| info.in_code_block) {
275                continue;
276            }
277
278            // Skip lines that are indented code blocks (4+ spaces or tab)
279            if let Some(line) = lines.get(line_num.saturating_sub(1)) {
280                if line.starts_with("    ") || line.starts_with('\t') {
281                    continue;
282                }
283
284                // Skip Kramdown extensions and block attributes
285                if is_kramdown_extension(line) || is_kramdown_block_attribute(line) {
286                    continue;
287                }
288            }
289
290            // Skip lines inside nomarkdown blocks
291            if nomarkdown_ranges
292                .iter()
293                .any(|(start, end)| line_num >= *start && line_num <= *end)
294            {
295                continue;
296            }
297
298            // Skip HTML tags inside HTML comments
299            if ctx.is_in_html_comment(tag_byte_start) {
300                continue;
301            }
302
303            // Skip HTML comments themselves
304            if self.is_html_comment(tag) {
305                continue;
306            }
307
308            // Skip JSX components in MDX files (e.g., <Chart />, <MyComponent>)
309            if ctx.flavor.supports_jsx() && html_tag.tag_name.chars().next().is_some_and(|c| c.is_uppercase()) {
310                continue;
311            }
312
313            // Skip likely programming type annotations
314            if self.is_likely_type_annotation(tag) {
315                continue;
316            }
317
318            // Skip email addresses in angle brackets
319            if self.is_email_address(tag) {
320                continue;
321            }
322
323            // Skip URLs in angle brackets
324            if self.is_url_in_angle_brackets(tag) {
325                continue;
326            }
327
328            // Skip tags inside code spans (use byte offset for reliable multi-line span detection)
329            if ctx.is_byte_offset_in_code_span(tag_byte_start) {
330                continue;
331            }
332
333            // Skip allowed tags
334            if self.is_tag_allowed(tag) {
335                continue;
336            }
337
338            // Skip tags with markdown attribute in MkDocs mode
339            if ctx.flavor == crate::config::MarkdownFlavor::MkDocs && self.has_markdown_attribute(tag) {
340                continue;
341            }
342
343            // Calculate fix to remove HTML tags but keep content
344            let fix = self
345                .calculate_fix(content, tag, tag_byte_start)
346                .map(|(range, replacement)| Fix { range, replacement });
347
348            // Report the HTML tag
349            warnings.push(LintWarning {
350                rule_name: Some(self.name().to_string()),
351                line: line_num,
352                column: html_tag.start_col + 1,   // Convert to 1-indexed
353                end_line: line_num,               // TODO: calculate actual end line for multiline tags
354                end_column: html_tag.end_col + 1, // Convert to 1-indexed
355                message: format!("Inline HTML found: {tag}"),
356                severity: Severity::Warning,
357                fix,
358            });
359        }
360
361        Ok(warnings)
362    }
363
364    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
365        // No fix for MD033: do not remove or alter HTML, just return the input unchanged
366        Ok(ctx.content.to_string())
367    }
368
369    fn fix_capability(&self) -> crate::rule::FixCapability {
370        crate::rule::FixCapability::Unfixable
371    }
372
373    /// Get the category of this rule for selective processing
374    fn category(&self) -> RuleCategory {
375        RuleCategory::Html
376    }
377
378    /// Check if this rule should be skipped
379    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
380        ctx.content.is_empty() || !ctx.likely_has_html()
381    }
382
383    fn as_any(&self) -> &dyn std::any::Any {
384        self
385    }
386
387    fn default_config_section(&self) -> Option<(String, toml::Value)> {
388        let json_value = serde_json::to_value(&self.config).ok()?;
389        Some((
390            self.name().to_string(),
391            crate::rule_config_serde::json_to_toml_value(&json_value)?,
392        ))
393    }
394
395    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
396    where
397        Self: Sized,
398    {
399        let rule_config = crate::rule_config_serde::load_rule_config::<MD033Config>(config);
400        Box::new(Self::from_config_struct(rule_config))
401    }
402}
403
404#[cfg(test)]
405mod tests {
406    use super::*;
407    use crate::lint_context::LintContext;
408    use crate::rule::Rule;
409
410    #[test]
411    fn test_md033_basic_html() {
412        let rule = MD033NoInlineHtml::default();
413        let content = "<div>Some content</div>";
414        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
415        let result = rule.check(&ctx).unwrap();
416        // Only reports opening tags, not closing tags
417        assert_eq!(result.len(), 1); // Only <div>, not </div>
418        assert!(result[0].message.starts_with("Inline HTML found: <div>"));
419    }
420
421    #[test]
422    fn test_md033_case_insensitive() {
423        let rule = MD033NoInlineHtml::default();
424        let content = "<DiV>Some <B>content</B></dIv>";
425        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
426        let result = rule.check(&ctx).unwrap();
427        // Only reports opening tags, not closing tags
428        assert_eq!(result.len(), 2); // <DiV>, <B> (not </B>, </dIv>)
429        assert_eq!(result[0].message, "Inline HTML found: <DiV>");
430        assert_eq!(result[1].message, "Inline HTML found: <B>");
431    }
432
433    #[test]
434    fn test_md033_allowed_tags() {
435        let rule = MD033NoInlineHtml::with_allowed(vec!["div".to_string(), "br".to_string()]);
436        let content = "<div>Allowed</div><p>Not allowed</p><br/>";
437        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
438        let result = rule.check(&ctx).unwrap();
439        // Only warnings for non-allowed opening tags (<p> only, div and br are allowed)
440        assert_eq!(result.len(), 1);
441        assert_eq!(result[0].message, "Inline HTML found: <p>");
442
443        // Test case-insensitivity of allowed tags
444        let content2 = "<DIV>Allowed</DIV><P>Not allowed</P><BR/>";
445        let ctx2 = LintContext::new(content2, crate::config::MarkdownFlavor::Standard, None);
446        let result2 = rule.check(&ctx2).unwrap();
447        assert_eq!(result2.len(), 1); // Only <P> flagged
448        assert_eq!(result2[0].message, "Inline HTML found: <P>");
449    }
450
451    #[test]
452    fn test_md033_html_comments() {
453        let rule = MD033NoInlineHtml::default();
454        let content = "<!-- This is a comment --> <p>Not a comment</p>";
455        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
456        let result = rule.check(&ctx).unwrap();
457        // Should detect warnings for HTML opening tags (comments are skipped, closing tags not reported)
458        assert_eq!(result.len(), 1); // Only <p>
459        assert_eq!(result[0].message, "Inline HTML found: <p>");
460    }
461
462    #[test]
463    fn test_md033_tags_in_links() {
464        let rule = MD033NoInlineHtml::default();
465        let content = "[Link](http://example.com/<div>)";
466        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
467        let result = rule.check(&ctx).unwrap();
468        // The <div> in the URL should be detected as HTML (not skipped)
469        assert_eq!(result.len(), 1);
470        assert_eq!(result[0].message, "Inline HTML found: <div>");
471
472        let content2 = "[Link <a>text</a>](url)";
473        let ctx2 = LintContext::new(content2, crate::config::MarkdownFlavor::Standard, None);
474        let result2 = rule.check(&ctx2).unwrap();
475        // Only reports opening tags
476        assert_eq!(result2.len(), 1); // Only <a>
477        assert_eq!(result2[0].message, "Inline HTML found: <a>");
478    }
479
480    #[test]
481    fn test_md033_fix_escaping() {
482        let rule = MD033NoInlineHtml::default();
483        let content = "Text with <div> and <br/> tags.";
484        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
485        let fixed_content = rule.fix(&ctx).unwrap();
486        // No fix for HTML tags; output should be unchanged
487        assert_eq!(fixed_content, content);
488    }
489
490    #[test]
491    fn test_md033_in_code_blocks() {
492        let rule = MD033NoInlineHtml::default();
493        let content = "```html\n<div>Code</div>\n```\n<div>Not code</div>";
494        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
495        let result = rule.check(&ctx).unwrap();
496        // Only reports opening tags outside code block
497        assert_eq!(result.len(), 1); // Only <div> outside code block
498        assert_eq!(result[0].message, "Inline HTML found: <div>");
499    }
500
501    #[test]
502    fn test_md033_in_code_spans() {
503        let rule = MD033NoInlineHtml::default();
504        let content = "Text with `<p>in code</p>` span. <br/> Not in span.";
505        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
506        let result = rule.check(&ctx).unwrap();
507        // Should detect <br/> outside code span, but not tags inside code span
508        assert_eq!(result.len(), 1);
509        assert_eq!(result[0].message, "Inline HTML found: <br/>");
510    }
511
512    #[test]
513    fn test_md033_issue_90_code_span_with_diff_block() {
514        // Test for issue #90: inline code span followed by diff code block
515        let rule = MD033NoInlineHtml::default();
516        let content = r#"# Heading
517
518`<env>`
519
520```diff
521- this
522+ that
523```"#;
524        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
525        let result = rule.check(&ctx).unwrap();
526        // Should NOT detect <env> as HTML since it's inside backticks
527        assert_eq!(result.len(), 0, "Should not report HTML tags inside code spans");
528    }
529
530    #[test]
531    fn test_md033_multiple_code_spans_with_angle_brackets() {
532        // Test multiple code spans on same line
533        let rule = MD033NoInlineHtml::default();
534        let content = "`<one>` and `<two>` and `<three>` are all code spans";
535        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
536        let result = rule.check(&ctx).unwrap();
537        assert_eq!(result.len(), 0, "Should not report HTML tags inside any code spans");
538    }
539
540    #[test]
541    fn test_md033_nested_angle_brackets_in_code_span() {
542        // Test nested angle brackets
543        let rule = MD033NoInlineHtml::default();
544        let content = "Text with `<<nested>>` brackets";
545        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
546        let result = rule.check(&ctx).unwrap();
547        assert_eq!(result.len(), 0, "Should handle nested angle brackets in code spans");
548    }
549
550    #[test]
551    fn test_md033_code_span_at_end_before_code_block() {
552        // Test code span at end of line before code block
553        let rule = MD033NoInlineHtml::default();
554        let content = "Testing `<test>`\n```\ncode here\n```";
555        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
556        let result = rule.check(&ctx).unwrap();
557        assert_eq!(result.len(), 0, "Should handle code span before code block");
558    }
559
560    #[test]
561    fn test_md033_quick_fix_inline_tag() {
562        // Test Quick Fix for inline HTML tags - keeps content, removes tags
563        let rule = MD033NoInlineHtml::default();
564        let content = "This has <span>inline text</span> that should keep content.";
565        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
566        let result = rule.check(&ctx).unwrap();
567
568        assert_eq!(result.len(), 1, "Should find one HTML tag");
569        assert!(result[0].fix.is_some(), "Should have a fix");
570
571        let fix = result[0].fix.as_ref().unwrap();
572        assert_eq!(&content[fix.range.clone()], "<span>inline text</span>");
573        assert_eq!(fix.replacement, "inline text");
574    }
575
576    #[test]
577    fn test_md033_quick_fix_multiline_tag() {
578        // Test Quick Fix for multiline HTML tags - keeps content
579        let rule = MD033NoInlineHtml::default();
580        let content = "<div>\nBlock content\n</div>";
581        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
582        let result = rule.check(&ctx).unwrap();
583
584        assert_eq!(result.len(), 1, "Should find one HTML tag");
585        assert!(result[0].fix.is_some(), "Should have a fix");
586
587        let fix = result[0].fix.as_ref().unwrap();
588        assert_eq!(&content[fix.range.clone()], "<div>\nBlock content\n</div>");
589        assert_eq!(fix.replacement, "\nBlock content\n");
590    }
591
592    #[test]
593    fn test_md033_quick_fix_self_closing_tag() {
594        // Test Quick Fix for self-closing tags - removes tag (no content)
595        let rule = MD033NoInlineHtml::default();
596        let content = "Self-closing: <br/>";
597        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
598        let result = rule.check(&ctx).unwrap();
599
600        assert_eq!(result.len(), 1, "Should find one HTML tag");
601        assert!(result[0].fix.is_some(), "Should have a fix");
602
603        let fix = result[0].fix.as_ref().unwrap();
604        assert_eq!(&content[fix.range.clone()], "<br/>");
605        assert_eq!(fix.replacement, "");
606    }
607
608    #[test]
609    fn test_md033_quick_fix_multiple_tags() {
610        // Test Quick Fix with multiple HTML tags - keeps content for both
611        let rule = MD033NoInlineHtml::default();
612        let content = "<span>first</span> and <strong>second</strong>";
613        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard, None);
614        let result = rule.check(&ctx).unwrap();
615
616        assert_eq!(result.len(), 2, "Should find two HTML tags");
617        assert!(result[0].fix.is_some(), "First tag should have a fix");
618        assert!(result[1].fix.is_some(), "Second tag should have a fix");
619
620        let fix1 = result[0].fix.as_ref().unwrap();
621        assert_eq!(&content[fix1.range.clone()], "<span>first</span>");
622        assert_eq!(fix1.replacement, "first");
623
624        let fix2 = result[1].fix.as_ref().unwrap();
625        assert_eq!(&content[fix2.range.clone()], "<strong>second</strong>");
626        assert_eq!(fix2.replacement, "second");
627    }
628}