rumdl_lib/rules/
md033_no_inline_html.rs

1//!
2//! Rule MD033: No HTML tags
3//!
4//! See [docs/md033.md](../../docs/md033.md) for full documentation, configuration, and examples.
5
6use crate::rule::{LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
7use crate::utils::kramdown_utils::{is_kramdown_block_attribute, is_kramdown_extension};
8use crate::utils::range_utils::calculate_html_tag_range;
9use crate::utils::regex_cache::*;
10use std::collections::HashSet;
11
12mod md033_config;
13use md033_config::MD033Config;
14
15#[derive(Clone)]
16pub struct MD033NoInlineHtml {
17    config: MD033Config,
18    allowed: HashSet<String>,
19}
20
21impl Default for MD033NoInlineHtml {
22    fn default() -> Self {
23        let config = MD033Config::default();
24        let allowed = config.allowed_set();
25        Self { config, allowed }
26    }
27}
28
29impl MD033NoInlineHtml {
30    pub fn new() -> Self {
31        Self::default()
32    }
33
34    pub fn with_allowed(allowed_vec: Vec<String>) -> Self {
35        let config = MD033Config {
36            allowed: allowed_vec.clone(),
37        };
38        let allowed = config.allowed_set();
39        Self { config, allowed }
40    }
41
42    pub fn from_config_struct(config: MD033Config) -> Self {
43        let allowed = config.allowed_set();
44        Self { config, allowed }
45    }
46
47    // Efficient check for allowed tags using HashSet (case-insensitive)
48    #[inline]
49    fn is_tag_allowed(&self, tag: &str) -> bool {
50        if self.allowed.is_empty() {
51            return false;
52        }
53        // Remove angle brackets and slashes, then split by whitespace or '>'
54        let tag = tag.trim_start_matches('<').trim_start_matches('/');
55        let tag_name = tag
56            .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
57            .next()
58            .unwrap_or("");
59        self.allowed.contains(&tag_name.to_lowercase())
60    }
61
62    // Check if a tag is an HTML comment
63    #[inline]
64    fn is_html_comment(&self, tag: &str) -> bool {
65        tag.starts_with("<!--") && tag.ends_with("-->")
66    }
67
68    // Check if a tag is likely a programming type annotation rather than HTML
69    #[inline]
70    fn is_likely_type_annotation(&self, tag: &str) -> bool {
71        // Common programming type names that are often used in generics
72        const COMMON_TYPES: &[&str] = &[
73            "string",
74            "number",
75            "any",
76            "void",
77            "null",
78            "undefined",
79            "array",
80            "promise",
81            "function",
82            "error",
83            "date",
84            "regexp",
85            "symbol",
86            "bigint",
87            "map",
88            "set",
89            "weakmap",
90            "weakset",
91            "iterator",
92            "generator",
93            "t",
94            "u",
95            "v",
96            "k",
97            "e", // Common single-letter type parameters
98            "userdata",
99            "apiresponse",
100            "config",
101            "options",
102            "params",
103            "result",
104            "response",
105            "request",
106            "data",
107            "item",
108            "element",
109            "node",
110        ];
111
112        let tag_content = tag
113            .trim_start_matches('<')
114            .trim_end_matches('>')
115            .trim_start_matches('/');
116        let tag_name = tag_content
117            .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
118            .next()
119            .unwrap_or("");
120
121        // Check if it's a simple tag (no attributes) with a common type name
122        if !tag_content.contains(' ') && !tag_content.contains('=') {
123            COMMON_TYPES.contains(&tag_name.to_ascii_lowercase().as_str())
124        } else {
125            false
126        }
127    }
128
129    // Check if a tag is actually an email address in angle brackets
130    #[inline]
131    fn is_email_address(&self, tag: &str) -> bool {
132        let content = tag.trim_start_matches('<').trim_end_matches('>');
133        // Simple email pattern: contains @ and has reasonable structure
134        content.contains('@')
135            && content.chars().all(|c| c.is_alphanumeric() || "@.-_+".contains(c))
136            && content.split('@').count() == 2
137            && content.split('@').all(|part| !part.is_empty())
138    }
139
140    // Check if a tag has the markdown attribute (MkDocs/Material for MkDocs)
141    #[inline]
142    fn has_markdown_attribute(&self, tag: &str) -> bool {
143        // Check for various forms of markdown attribute
144        // Examples: <div markdown>, <div markdown="1">, <div class="result" markdown>
145        tag.contains(" markdown>") || tag.contains(" markdown=") || tag.contains(" markdown ")
146    }
147
148    // Check if a tag is actually a URL in angle brackets
149    #[inline]
150    fn is_url_in_angle_brackets(&self, tag: &str) -> bool {
151        let content = tag.trim_start_matches('<').trim_end_matches('>');
152        // Check for common URL schemes
153        content.starts_with("http://")
154            || content.starts_with("https://")
155            || content.starts_with("ftp://")
156            || content.starts_with("ftps://")
157            || content.starts_with("mailto:")
158    }
159
160    /// Find HTML tags that span multiple lines
161    fn find_multiline_html_tags(
162        &self,
163        ctx: &crate::lint_context::LintContext,
164        content: &str,
165        nomarkdown_ranges: &[(usize, usize)],
166        warnings: &mut Vec<LintWarning>,
167    ) {
168        // Early return: if content has no incomplete tags at line ends, skip processing
169        if !content.contains('<') || !content.lines().any(|line| line.trim_end().ends_with('<')) {
170            return;
171        }
172
173        // Simple approach: use regex to find patterns like <tagname and then look for closing >
174        lazy_static::lazy_static! {
175            static ref INCOMPLETE_TAG_START: regex::Regex = regex::Regex::new(r"(?i)<[a-zA-Z][^>]*$").unwrap();
176        }
177
178        let lines: Vec<&str> = content.lines().collect();
179
180        for (i, line) in lines.iter().enumerate() {
181            let line_num = i + 1;
182
183            // Skip code blocks and empty lines
184            if line.trim().is_empty() || ctx.is_in_code_block(line_num) {
185                continue;
186            }
187
188            // Skip lines inside nomarkdown blocks
189            if nomarkdown_ranges
190                .iter()
191                .any(|(start, end)| line_num >= *start && line_num <= *end)
192            {
193                continue;
194            }
195
196            // Early return: skip lines that don't end with incomplete tags
197            if !line.contains('<') {
198                continue;
199            }
200
201            // Look for incomplete HTML tags at the end of the line
202            if let Some(incomplete_match) = INCOMPLETE_TAG_START.find(line) {
203                let start_column = incomplete_match.start() + 1; // 1-indexed
204
205                // Build the complete tag by looking at subsequent lines
206                let mut complete_tag = incomplete_match.as_str().to_string();
207                let mut found_end = false;
208
209                // Look for the closing > in subsequent lines (limit search to 10 lines)
210                for (j, next_line) in lines.iter().enumerate().skip(i + 1).take(10) {
211                    let next_line_num = j + 1;
212
213                    // Stop if we hit a code block
214                    if ctx.is_in_code_block(next_line_num) {
215                        break;
216                    }
217
218                    complete_tag.push(' '); // Add space to normalize whitespace
219                    complete_tag.push_str(next_line.trim());
220
221                    if next_line.contains('>') {
222                        found_end = true;
223                        break;
224                    }
225                }
226
227                if found_end {
228                    // Extract just the tag part (up to the first >)
229                    if let Some(end_pos) = complete_tag.find('>') {
230                        let final_tag = &complete_tag[0..=end_pos];
231
232                        // Apply the same filters as single-line tags
233                        let skip_mkdocs_markdown = ctx.flavor == crate::config::MarkdownFlavor::MkDocs
234                            && self.has_markdown_attribute(final_tag);
235
236                        if !self.is_html_comment(final_tag)
237                            && !self.is_likely_type_annotation(final_tag)
238                            && !self.is_email_address(final_tag)
239                            && !self.is_url_in_angle_brackets(final_tag)
240                            && !self.is_tag_allowed(final_tag)
241                            && !skip_mkdocs_markdown
242                            && HTML_TAG_FINDER.is_match(final_tag)
243                        {
244                            // Check for duplicates (avoid flagging the same position twice)
245                            let already_warned =
246                                warnings.iter().any(|w| w.line == line_num && w.column == start_column);
247
248                            if !already_warned {
249                                let (start_line, start_col, end_line, end_col) = calculate_html_tag_range(
250                                    line_num,
251                                    line,
252                                    incomplete_match.start(),
253                                    incomplete_match.len(),
254                                );
255                                warnings.push(LintWarning {
256                                    rule_name: Some(self.name()),
257                                    line: start_line,
258                                    column: start_col,
259                                    end_line,
260                                    end_column: end_col,
261                                    message: format!("HTML tag found: {final_tag} (use Markdown syntax instead)"),
262                                    severity: Severity::Warning,
263                                    fix: None,
264                                });
265                            }
266                        }
267                    }
268                }
269            }
270        }
271    }
272}
273
274impl Rule for MD033NoInlineHtml {
275    fn name(&self) -> &'static str {
276        "MD033"
277    }
278
279    fn description(&self) -> &'static str {
280        "Inline HTML is not allowed"
281    }
282
283    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
284        let content = ctx.content;
285
286        // Early return: if no HTML tags at all, skip processing
287        if content.is_empty() || !ctx.likely_has_html() {
288            return Ok(Vec::new());
289        }
290
291        // Quick check for HTML tag pattern before expensive processing
292        if !HTML_TAG_QUICK_CHECK.is_match(content) {
293            return Ok(Vec::new());
294        }
295
296        let mut warnings = Vec::new();
297        let lines: Vec<&str> = content.lines().collect();
298
299        // Track nomarkdown and comment blocks
300        let mut in_nomarkdown = false;
301        let mut in_comment = false;
302        let mut nomarkdown_ranges: Vec<(usize, usize)> = Vec::new();
303        let mut nomarkdown_start = 0;
304        let mut comment_start = 0;
305
306        // First pass: identify nomarkdown and comment blocks
307        for (i, line) in lines.iter().enumerate() {
308            let line_num = i + 1;
309
310            // Check for nomarkdown start
311            if line.trim() == "{::nomarkdown}" {
312                in_nomarkdown = true;
313                nomarkdown_start = line_num;
314            } else if line.trim() == "{:/nomarkdown}" && in_nomarkdown {
315                in_nomarkdown = false;
316                nomarkdown_ranges.push((nomarkdown_start, line_num));
317            }
318
319            // Check for comment blocks
320            if line.trim() == "{::comment}" {
321                in_comment = true;
322                comment_start = line_num;
323            } else if line.trim() == "{:/comment}" && in_comment {
324                in_comment = false;
325                nomarkdown_ranges.push((comment_start, line_num));
326            }
327        }
328
329        // Second pass: find single-line HTML tags
330        // To match markdownlint behavior, report one warning per HTML tag
331        for (i, line) in lines.iter().enumerate() {
332            let line_num = i + 1;
333
334            if line.trim().is_empty() {
335                continue;
336            }
337            if ctx.is_in_code_block(line_num) {
338                continue;
339            }
340            // Skip lines that are indented code blocks (4+ spaces or tab) per CommonMark spec
341            // Even if they're not in the structure's code blocks (e.g., HTML blocks)
342            if line.starts_with("    ") || line.starts_with('\t') {
343                continue;
344            }
345
346            // Skip lines inside nomarkdown blocks
347            if nomarkdown_ranges
348                .iter()
349                .any(|(start, end)| line_num >= *start && line_num <= *end)
350            {
351                continue;
352            }
353
354            // Skip Kramdown extensions and block attributes
355            if is_kramdown_extension(line) || is_kramdown_block_attribute(line) {
356                continue;
357            }
358
359            // Find all HTML tags in the line using regex
360            for tag_match in HTML_TAG_FINDER.find_iter(line) {
361                let tag = tag_match.as_str();
362
363                // Skip HTML comments
364                if self.is_html_comment(tag) {
365                    continue;
366                }
367
368                // Skip likely programming type annotations
369                if self.is_likely_type_annotation(tag) {
370                    continue;
371                }
372
373                // Skip email addresses in angle brackets
374                if self.is_email_address(tag) {
375                    continue;
376                }
377
378                // Skip URLs in angle brackets
379                if self.is_url_in_angle_brackets(tag) {
380                    continue;
381                }
382
383                // Skip tags inside code spans
384                let tag_start_col = tag_match.start() + 1; // 1-indexed
385                if ctx.is_in_code_span(line_num, tag_start_col) {
386                    continue;
387                }
388
389                // Skip allowed tags
390                if self.is_tag_allowed(tag) {
391                    continue;
392                }
393
394                // Skip tags with markdown attribute in MkDocs mode
395                if ctx.flavor == crate::config::MarkdownFlavor::MkDocs && self.has_markdown_attribute(tag) {
396                    continue;
397                }
398
399                // Report each HTML tag individually (true markdownlint compatibility)
400                let (start_line, start_col, end_line, end_col) =
401                    calculate_html_tag_range(line_num, line, tag_match.start(), tag_match.len());
402                warnings.push(LintWarning {
403                    rule_name: Some(self.name()),
404                    line: start_line,
405                    column: start_col,
406                    end_line,
407                    end_column: end_col,
408                    message: format!("Inline HTML found: {tag} (use Markdown syntax instead)"),
409                    severity: Severity::Warning,
410                    fix: None,
411                });
412            }
413        }
414
415        // Third pass: find multi-line HTML tags
416        self.find_multiline_html_tags(ctx, ctx.content, &nomarkdown_ranges, &mut warnings);
417
418        Ok(warnings)
419    }
420
421    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
422        // No fix for MD033: do not remove or alter HTML, just return the input unchanged
423        Ok(ctx.content.to_string())
424    }
425
426    fn fix_capability(&self) -> crate::rule::FixCapability {
427        crate::rule::FixCapability::Unfixable
428    }
429
430    /// Get the category of this rule for selective processing
431    fn category(&self) -> RuleCategory {
432        RuleCategory::Html
433    }
434
435    /// Check if this rule should be skipped
436    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
437        ctx.content.is_empty() || !ctx.likely_has_html()
438    }
439
440    fn as_any(&self) -> &dyn std::any::Any {
441        self
442    }
443
444    fn default_config_section(&self) -> Option<(String, toml::Value)> {
445        let json_value = serde_json::to_value(&self.config).ok()?;
446        Some((
447            self.name().to_string(),
448            crate::rule_config_serde::json_to_toml_value(&json_value)?,
449        ))
450    }
451
452    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
453    where
454        Self: Sized,
455    {
456        let rule_config = crate::rule_config_serde::load_rule_config::<MD033Config>(config);
457        Box::new(Self::from_config_struct(rule_config))
458    }
459}
460
461#[cfg(test)]
462mod tests {
463    use super::*;
464    use crate::lint_context::LintContext;
465    use crate::rule::Rule;
466
467    #[test]
468    fn test_md033_basic_html() {
469        let rule = MD033NoInlineHtml::default();
470        let content = "<div>Some content</div>";
471        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
472        let result = rule.check(&ctx).unwrap();
473        // Reports one warning per HTML tag (true markdownlint compatibility)
474        assert_eq!(result.len(), 2); // <div> and </div>
475        assert!(result[0].message.starts_with("Inline HTML found: <div>"));
476        assert!(result[1].message.starts_with("Inline HTML found: </div>"));
477    }
478
479    #[test]
480    fn test_md033_case_insensitive() {
481        let rule = MD033NoInlineHtml::default();
482        let content = "<DiV>Some <B>content</B></dIv>";
483        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
484        let result = rule.check(&ctx).unwrap();
485        // Reports one warning per HTML tag (true markdownlint compatibility)
486        assert_eq!(result.len(), 4); // <DiV>, <B>, </B>, </dIv>
487        assert_eq!(
488            result[0].message,
489            "Inline HTML found: <DiV> (use Markdown syntax instead)"
490        );
491        assert_eq!(
492            result[1].message,
493            "Inline HTML found: <B> (use Markdown syntax instead)"
494        );
495        assert_eq!(
496            result[2].message,
497            "Inline HTML found: </B> (use Markdown syntax instead)"
498        );
499        assert_eq!(
500            result[3].message,
501            "Inline HTML found: </dIv> (use Markdown syntax instead)"
502        );
503    }
504
505    #[test]
506    fn test_md033_allowed_tags() {
507        let rule = MD033NoInlineHtml::with_allowed(vec!["div".to_string(), "br".to_string()]);
508        let content = "<div>Allowed</div><p>Not allowed</p><br/>";
509        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
510        let result = rule.check(&ctx).unwrap();
511        // Only warnings for non-allowed tags (<p> and </p>, div and br are allowed)
512        assert_eq!(result.len(), 2);
513        assert_eq!(
514            result[0].message,
515            "Inline HTML found: <p> (use Markdown syntax instead)"
516        );
517        assert_eq!(
518            result[1].message,
519            "Inline HTML found: </p> (use Markdown syntax instead)"
520        );
521
522        // Test case-insensitivity of allowed tags
523        let content2 = "<DIV>Allowed</DIV><P>Not allowed</P><BR/>";
524        let ctx2 = LintContext::new(content2, crate::config::MarkdownFlavor::Standard);
525        let result2 = rule.check(&ctx2).unwrap();
526        assert_eq!(result2.len(), 2); // <P> and </P> flagged
527        assert_eq!(
528            result2[0].message,
529            "Inline HTML found: <P> (use Markdown syntax instead)"
530        );
531        assert_eq!(
532            result2[1].message,
533            "Inline HTML found: </P> (use Markdown syntax instead)"
534        );
535    }
536
537    #[test]
538    fn test_md033_html_comments() {
539        let rule = MD033NoInlineHtml::default();
540        let content = "<!-- This is a comment --> <p>Not a comment</p>";
541        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
542        let result = rule.check(&ctx).unwrap();
543        // Should detect warnings for HTML tags (comments are skipped)
544        assert_eq!(result.len(), 2); // <p> and </p>
545        assert_eq!(
546            result[0].message,
547            "Inline HTML found: <p> (use Markdown syntax instead)"
548        );
549        assert_eq!(
550            result[1].message,
551            "Inline HTML found: </p> (use Markdown syntax instead)"
552        );
553    }
554
555    #[test]
556    fn test_md033_tags_in_links() {
557        let rule = MD033NoInlineHtml::default();
558        let content = "[Link](http://example.com/<div>)";
559        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
560        let result = rule.check(&ctx).unwrap();
561        // The <div> in the URL should be detected as HTML (not skipped)
562        assert_eq!(result.len(), 1);
563        assert_eq!(
564            result[0].message,
565            "Inline HTML found: <div> (use Markdown syntax instead)"
566        );
567
568        let content2 = "[Link <a>text</a>](url)";
569        let ctx2 = LintContext::new(content2, crate::config::MarkdownFlavor::Standard);
570        let result2 = rule.check(&ctx2).unwrap();
571        // Reports one warning per HTML tag (true markdownlint compatibility)
572        assert_eq!(result2.len(), 2); // <a> and </a>
573        assert_eq!(
574            result2[0].message,
575            "Inline HTML found: <a> (use Markdown syntax instead)"
576        );
577        assert_eq!(
578            result2[1].message,
579            "Inline HTML found: </a> (use Markdown syntax instead)"
580        );
581    }
582
583    #[test]
584    fn test_md033_fix_escaping() {
585        let rule = MD033NoInlineHtml::default();
586        let content = "Text with <div> and <br/> tags.";
587        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
588        let fixed_content = rule.fix(&ctx).unwrap();
589        // No fix for HTML tags; output should be unchanged
590        assert_eq!(fixed_content, content);
591    }
592
593    #[test]
594    fn test_md033_in_code_blocks() {
595        let rule = MD033NoInlineHtml::default();
596        let content = "```html\n<div>Code</div>\n```\n<div>Not code</div>";
597        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
598        let result = rule.check(&ctx).unwrap();
599        // Reports one warning per HTML tag (true markdownlint compatibility)
600        assert_eq!(result.len(), 2); // <div> and </div> outside code block
601        assert_eq!(
602            result[0].message,
603            "Inline HTML found: <div> (use Markdown syntax instead)"
604        );
605        assert_eq!(
606            result[1].message,
607            "Inline HTML found: </div> (use Markdown syntax instead)"
608        );
609    }
610
611    #[test]
612    fn test_md033_in_code_spans() {
613        let rule = MD033NoInlineHtml::default();
614        let content = "Text with `<p>in code</p>` span. <br/> Not in span.";
615        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
616        let result = rule.check(&ctx).unwrap();
617        // Should detect <br/> outside code span, but not tags inside code span
618        assert_eq!(result.len(), 1);
619        assert_eq!(
620            result[0].message,
621            "Inline HTML found: <br/> (use Markdown syntax instead)"
622        );
623    }
624
625    #[test]
626    fn test_md033_issue_90_code_span_with_diff_block() {
627        // Test for issue #90: inline code span followed by diff code block
628        let rule = MD033NoInlineHtml::default();
629        let content = r#"# Heading
630
631`<env>`
632
633```diff
634- this
635+ that
636```"#;
637        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
638        let result = rule.check(&ctx).unwrap();
639        // Should NOT detect <env> as HTML since it's inside backticks
640        assert_eq!(result.len(), 0, "Should not report HTML tags inside code spans");
641    }
642
643    #[test]
644    fn test_md033_multiple_code_spans_with_angle_brackets() {
645        // Test multiple code spans on same line
646        let rule = MD033NoInlineHtml::default();
647        let content = "`<one>` and `<two>` and `<three>` are all code spans";
648        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
649        let result = rule.check(&ctx).unwrap();
650        assert_eq!(result.len(), 0, "Should not report HTML tags inside any code spans");
651    }
652
653    #[test]
654    fn test_md033_nested_angle_brackets_in_code_span() {
655        // Test nested angle brackets
656        let rule = MD033NoInlineHtml::default();
657        let content = "Text with `<<nested>>` brackets";
658        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
659        let result = rule.check(&ctx).unwrap();
660        assert_eq!(result.len(), 0, "Should handle nested angle brackets in code spans");
661    }
662
663    #[test]
664    fn test_md033_code_span_at_end_before_code_block() {
665        // Test code span at end of line before code block
666        let rule = MD033NoInlineHtml::default();
667        let content = "Testing `<test>`\n```\ncode here\n```";
668        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
669        let result = rule.check(&ctx).unwrap();
670        assert_eq!(result.len(), 0, "Should handle code span before code block");
671    }
672}