rumdl_lib/rules/
md033_no_inline_html.rs

1//!
2//! Rule MD033: No HTML tags
3//!
4//! See [docs/md033.md](../../docs/md033.md) for full documentation, configuration, and examples.
5
6use crate::rule::{LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
7use crate::utils::kramdown_utils::{is_kramdown_block_attribute, is_kramdown_extension};
8use crate::utils::range_utils::calculate_html_tag_range;
9use crate::utils::regex_cache::*;
10use std::collections::HashSet;
11
12mod md033_config;
13use md033_config::MD033Config;
14
15#[derive(Clone)]
16pub struct MD033NoInlineHtml {
17    config: MD033Config,
18    allowed: HashSet<String>,
19}
20
21impl Default for MD033NoInlineHtml {
22    fn default() -> Self {
23        let config = MD033Config::default();
24        let allowed = config.allowed_set();
25        Self { config, allowed }
26    }
27}
28
29impl MD033NoInlineHtml {
30    pub fn new() -> Self {
31        Self::default()
32    }
33
34    pub fn with_allowed(allowed_vec: Vec<String>) -> Self {
35        let config = MD033Config {
36            allowed: allowed_vec.clone(),
37        };
38        let allowed = config.allowed_set();
39        Self { config, allowed }
40    }
41
42    pub fn from_config_struct(config: MD033Config) -> Self {
43        let allowed = config.allowed_set();
44        Self { config, allowed }
45    }
46
47    // Efficient check for allowed tags using HashSet (case-insensitive)
48    #[inline]
49    fn is_tag_allowed(&self, tag: &str) -> bool {
50        if self.allowed.is_empty() {
51            return false;
52        }
53        // Remove angle brackets and slashes, then split by whitespace or '>'
54        let tag = tag.trim_start_matches('<').trim_start_matches('/');
55        let tag_name = tag
56            .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
57            .next()
58            .unwrap_or("");
59        self.allowed.contains(&tag_name.to_lowercase())
60    }
61
62    // Check if a tag is an HTML comment
63    #[inline]
64    fn is_html_comment(&self, tag: &str) -> bool {
65        tag.starts_with("<!--") && tag.ends_with("-->")
66    }
67
68    // Check if a tag is likely a programming type annotation rather than HTML
69    #[inline]
70    fn is_likely_type_annotation(&self, tag: &str) -> bool {
71        // Common programming type names that are often used in generics
72        const COMMON_TYPES: &[&str] = &[
73            "string",
74            "number",
75            "any",
76            "void",
77            "null",
78            "undefined",
79            "array",
80            "promise",
81            "function",
82            "error",
83            "date",
84            "regexp",
85            "symbol",
86            "bigint",
87            "map",
88            "set",
89            "weakmap",
90            "weakset",
91            "iterator",
92            "generator",
93            "t",
94            "u",
95            "v",
96            "k",
97            "e", // Common single-letter type parameters
98            "userdata",
99            "apiresponse",
100            "config",
101            "options",
102            "params",
103            "result",
104            "response",
105            "request",
106            "data",
107            "item",
108            "element",
109            "node",
110        ];
111
112        let tag_content = tag
113            .trim_start_matches('<')
114            .trim_end_matches('>')
115            .trim_start_matches('/');
116        let tag_name = tag_content
117            .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
118            .next()
119            .unwrap_or("");
120
121        // Check if it's a simple tag (no attributes) with a common type name
122        if !tag_content.contains(' ') && !tag_content.contains('=') {
123            COMMON_TYPES.contains(&tag_name.to_ascii_lowercase().as_str())
124        } else {
125            false
126        }
127    }
128
129    // Check if a tag is actually an email address in angle brackets
130    #[inline]
131    fn is_email_address(&self, tag: &str) -> bool {
132        let content = tag.trim_start_matches('<').trim_end_matches('>');
133        // Simple email pattern: contains @ and has reasonable structure
134        content.contains('@')
135            && content.chars().all(|c| c.is_alphanumeric() || "@.-_+".contains(c))
136            && content.split('@').count() == 2
137            && content.split('@').all(|part| !part.is_empty())
138    }
139
140    // Check if a tag has the markdown attribute (MkDocs/Material for MkDocs)
141    #[inline]
142    fn has_markdown_attribute(&self, tag: &str) -> bool {
143        // Check for various forms of markdown attribute
144        // Examples: <div markdown>, <div markdown="1">, <div class="result" markdown>
145        tag.contains(" markdown>") || tag.contains(" markdown=") || tag.contains(" markdown ")
146    }
147
148    // Check if a tag is actually a URL in angle brackets
149    #[inline]
150    fn is_url_in_angle_brackets(&self, tag: &str) -> bool {
151        let content = tag.trim_start_matches('<').trim_end_matches('>');
152        // Check for common URL schemes
153        content.starts_with("http://")
154            || content.starts_with("https://")
155            || content.starts_with("ftp://")
156            || content.starts_with("ftps://")
157            || content.starts_with("mailto:")
158    }
159
160    /// Find HTML tags that span multiple lines
161    fn find_multiline_html_tags(
162        &self,
163        ctx: &crate::lint_context::LintContext,
164        content: &str,
165        nomarkdown_ranges: &[(usize, usize)],
166        warnings: &mut Vec<LintWarning>,
167    ) {
168        // Early return: if content has no incomplete tags at line ends, skip processing
169        if !content.contains('<') || !content.lines().any(|line| line.trim_end().ends_with('<')) {
170            return;
171        }
172
173        // Simple approach: use regex to find patterns like <tagname and then look for closing >
174        lazy_static::lazy_static! {
175            static ref INCOMPLETE_TAG_START: regex::Regex = regex::Regex::new(r"(?i)<[a-zA-Z][^>]*$").unwrap();
176        }
177
178        let lines: Vec<&str> = content.lines().collect();
179
180        for (i, line) in lines.iter().enumerate() {
181            let line_num = i + 1;
182
183            // Skip code blocks and empty lines
184            if line.trim().is_empty() || ctx.is_in_code_block(line_num) {
185                continue;
186            }
187
188            // Skip lines inside nomarkdown blocks
189            if nomarkdown_ranges
190                .iter()
191                .any(|(start, end)| line_num >= *start && line_num <= *end)
192            {
193                continue;
194            }
195
196            // Early return: skip lines that don't end with incomplete tags
197            if !line.contains('<') {
198                continue;
199            }
200
201            // Look for incomplete HTML tags at the end of the line
202            if let Some(incomplete_match) = INCOMPLETE_TAG_START.find(line) {
203                let start_column = incomplete_match.start() + 1; // 1-indexed
204
205                // Build the complete tag by looking at subsequent lines
206                let mut complete_tag = incomplete_match.as_str().to_string();
207                let mut found_end = false;
208
209                // Look for the closing > in subsequent lines (limit search to 10 lines)
210                for (j, next_line) in lines.iter().enumerate().skip(i + 1).take(10) {
211                    let next_line_num = j + 1;
212
213                    // Stop if we hit a code block
214                    if ctx.is_in_code_block(next_line_num) {
215                        break;
216                    }
217
218                    complete_tag.push(' '); // Add space to normalize whitespace
219                    complete_tag.push_str(next_line.trim());
220
221                    if next_line.contains('>') {
222                        found_end = true;
223                        break;
224                    }
225                }
226
227                if found_end {
228                    // Extract just the tag part (up to the first >)
229                    if let Some(end_pos) = complete_tag.find('>') {
230                        let final_tag = &complete_tag[0..=end_pos];
231
232                        // Apply the same filters as single-line tags
233                        let skip_mkdocs_markdown = ctx.flavor == crate::config::MarkdownFlavor::MkDocs
234                            && self.has_markdown_attribute(final_tag);
235
236                        if !self.is_html_comment(final_tag)
237                            && !self.is_likely_type_annotation(final_tag)
238                            && !self.is_email_address(final_tag)
239                            && !self.is_url_in_angle_brackets(final_tag)
240                            && !self.is_tag_allowed(final_tag)
241                            && !skip_mkdocs_markdown
242                            && HTML_TAG_FINDER.is_match(final_tag)
243                        {
244                            // Check for duplicates (avoid flagging the same position twice)
245                            let already_warned =
246                                warnings.iter().any(|w| w.line == line_num && w.column == start_column);
247
248                            if !already_warned {
249                                let (start_line, start_col, end_line, end_col) = calculate_html_tag_range(
250                                    line_num,
251                                    line,
252                                    incomplete_match.start(),
253                                    incomplete_match.len(),
254                                );
255                                warnings.push(LintWarning {
256                                    rule_name: Some(self.name()),
257                                    line: start_line,
258                                    column: start_col,
259                                    end_line,
260                                    end_column: end_col,
261                                    message: format!("HTML tag found: {final_tag} (use Markdown syntax instead)"),
262                                    severity: Severity::Warning,
263                                    fix: None,
264                                });
265                            }
266                        }
267                    }
268                }
269            }
270        }
271    }
272}
273
274impl Rule for MD033NoInlineHtml {
275    fn name(&self) -> &'static str {
276        "MD033"
277    }
278
279    fn description(&self) -> &'static str {
280        "Inline HTML is not allowed"
281    }
282
283    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
284        let content = ctx.content;
285
286        // Early return: if no HTML tags at all, skip processing
287        if content.is_empty() || !has_html_tags(content) {
288            return Ok(Vec::new());
289        }
290
291        // Quick check for HTML tag pattern before expensive processing
292        if !HTML_TAG_QUICK_CHECK.is_match(content) {
293            return Ok(Vec::new());
294        }
295
296        let mut warnings = Vec::new();
297        let lines: Vec<&str> = content.lines().collect();
298
299        // Track nomarkdown and comment blocks
300        let mut in_nomarkdown = false;
301        let mut in_comment = false;
302        let mut nomarkdown_ranges: Vec<(usize, usize)> = Vec::new();
303        let mut nomarkdown_start = 0;
304        let mut comment_start = 0;
305
306        // First pass: identify nomarkdown and comment blocks
307        for (i, line) in lines.iter().enumerate() {
308            let line_num = i + 1;
309
310            // Check for nomarkdown start
311            if line.trim() == "{::nomarkdown}" {
312                in_nomarkdown = true;
313                nomarkdown_start = line_num;
314            } else if line.trim() == "{:/nomarkdown}" && in_nomarkdown {
315                in_nomarkdown = false;
316                nomarkdown_ranges.push((nomarkdown_start, line_num));
317            }
318
319            // Check for comment blocks
320            if line.trim() == "{::comment}" {
321                in_comment = true;
322                comment_start = line_num;
323            } else if line.trim() == "{:/comment}" && in_comment {
324                in_comment = false;
325                nomarkdown_ranges.push((comment_start, line_num));
326            }
327        }
328
329        // Second pass: find single-line HTML tags
330        // To match markdownlint behavior, report one warning per HTML tag
331        for (i, line) in lines.iter().enumerate() {
332            let line_num = i + 1;
333
334            if line.trim().is_empty() {
335                continue;
336            }
337            if ctx.is_in_code_block(line_num) {
338                continue;
339            }
340            // Skip lines that are indented code blocks (4+ spaces or tab) per CommonMark spec
341            // Even if they're not in the structure's code blocks (e.g., HTML blocks)
342            if line.starts_with("    ") || line.starts_with('\t') {
343                continue;
344            }
345
346            // Skip lines inside nomarkdown blocks
347            if nomarkdown_ranges
348                .iter()
349                .any(|(start, end)| line_num >= *start && line_num <= *end)
350            {
351                continue;
352            }
353
354            // Skip Kramdown extensions and block attributes
355            if is_kramdown_extension(line) || is_kramdown_block_attribute(line) {
356                continue;
357            }
358
359            // Find all HTML tags in the line using regex
360            for tag_match in HTML_TAG_FINDER.find_iter(line) {
361                let tag = tag_match.as_str();
362
363                // Skip HTML comments
364                if self.is_html_comment(tag) {
365                    continue;
366                }
367
368                // Skip likely programming type annotations
369                if self.is_likely_type_annotation(tag) {
370                    continue;
371                }
372
373                // Skip email addresses in angle brackets
374                if self.is_email_address(tag) {
375                    continue;
376                }
377
378                // Skip URLs in angle brackets
379                if self.is_url_in_angle_brackets(tag) {
380                    continue;
381                }
382
383                // Skip tags inside code spans
384                let tag_start_col = tag_match.start() + 1; // 1-indexed
385                if ctx.is_in_code_span(line_num, tag_start_col) {
386                    continue;
387                }
388
389                // Skip allowed tags
390                if self.is_tag_allowed(tag) {
391                    continue;
392                }
393
394                // Skip tags with markdown attribute in MkDocs mode
395                if ctx.flavor == crate::config::MarkdownFlavor::MkDocs && self.has_markdown_attribute(tag) {
396                    continue;
397                }
398
399                // Report each HTML tag individually (true markdownlint compatibility)
400                let (start_line, start_col, end_line, end_col) =
401                    calculate_html_tag_range(line_num, line, tag_match.start(), tag_match.len());
402                warnings.push(LintWarning {
403                    rule_name: Some(self.name()),
404                    line: start_line,
405                    column: start_col,
406                    end_line,
407                    end_column: end_col,
408                    message: format!("Inline HTML found: {tag} (use Markdown syntax instead)"),
409                    severity: Severity::Warning,
410                    fix: None,
411                });
412            }
413        }
414
415        // Third pass: find multi-line HTML tags
416        self.find_multiline_html_tags(ctx, ctx.content, &nomarkdown_ranges, &mut warnings);
417
418        Ok(warnings)
419    }
420
421    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
422        // No fix for MD033: do not remove or alter HTML, just return the input unchanged
423        Ok(ctx.content.to_string())
424    }
425
426    fn fix_capability(&self) -> crate::rule::FixCapability {
427        crate::rule::FixCapability::Unfixable
428    }
429
430    /// Get the category of this rule for selective processing
431    fn category(&self) -> RuleCategory {
432        RuleCategory::Html
433    }
434
435    /// Check if this rule should be skipped
436    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
437        let content = ctx.content;
438        content.is_empty() || !has_html_tags(content)
439    }
440
441    fn as_any(&self) -> &dyn std::any::Any {
442        self
443    }
444
445    fn default_config_section(&self) -> Option<(String, toml::Value)> {
446        let json_value = serde_json::to_value(&self.config).ok()?;
447        Some((
448            self.name().to_string(),
449            crate::rule_config_serde::json_to_toml_value(&json_value)?,
450        ))
451    }
452
453    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
454    where
455        Self: Sized,
456    {
457        let rule_config = crate::rule_config_serde::load_rule_config::<MD033Config>(config);
458        Box::new(Self::from_config_struct(rule_config))
459    }
460}
461
462#[cfg(test)]
463mod tests {
464    use super::*;
465    use crate::lint_context::LintContext;
466    use crate::rule::Rule;
467
468    #[test]
469    fn test_md033_basic_html() {
470        let rule = MD033NoInlineHtml::default();
471        let content = "<div>Some content</div>";
472        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
473        let result = rule.check(&ctx).unwrap();
474        // Reports one warning per HTML tag (true markdownlint compatibility)
475        assert_eq!(result.len(), 2); // <div> and </div>
476        assert!(result[0].message.starts_with("Inline HTML found: <div>"));
477        assert!(result[1].message.starts_with("Inline HTML found: </div>"));
478    }
479
480    #[test]
481    fn test_md033_case_insensitive() {
482        let rule = MD033NoInlineHtml::default();
483        let content = "<DiV>Some <B>content</B></dIv>";
484        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
485        let result = rule.check(&ctx).unwrap();
486        // Reports one warning per HTML tag (true markdownlint compatibility)
487        assert_eq!(result.len(), 4); // <DiV>, <B>, </B>, </dIv>
488        assert_eq!(
489            result[0].message,
490            "Inline HTML found: <DiV> (use Markdown syntax instead)"
491        );
492        assert_eq!(
493            result[1].message,
494            "Inline HTML found: <B> (use Markdown syntax instead)"
495        );
496        assert_eq!(
497            result[2].message,
498            "Inline HTML found: </B> (use Markdown syntax instead)"
499        );
500        assert_eq!(
501            result[3].message,
502            "Inline HTML found: </dIv> (use Markdown syntax instead)"
503        );
504    }
505
506    #[test]
507    fn test_md033_allowed_tags() {
508        let rule = MD033NoInlineHtml::with_allowed(vec!["div".to_string(), "br".to_string()]);
509        let content = "<div>Allowed</div><p>Not allowed</p><br/>";
510        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
511        let result = rule.check(&ctx).unwrap();
512        // Only warnings for non-allowed tags (<p> and </p>, div and br are allowed)
513        assert_eq!(result.len(), 2);
514        assert_eq!(
515            result[0].message,
516            "Inline HTML found: <p> (use Markdown syntax instead)"
517        );
518        assert_eq!(
519            result[1].message,
520            "Inline HTML found: </p> (use Markdown syntax instead)"
521        );
522
523        // Test case-insensitivity of allowed tags
524        let content2 = "<DIV>Allowed</DIV><P>Not allowed</P><BR/>";
525        let ctx2 = LintContext::new(content2, crate::config::MarkdownFlavor::Standard);
526        let result2 = rule.check(&ctx2).unwrap();
527        assert_eq!(result2.len(), 2); // <P> and </P> flagged
528        assert_eq!(
529            result2[0].message,
530            "Inline HTML found: <P> (use Markdown syntax instead)"
531        );
532        assert_eq!(
533            result2[1].message,
534            "Inline HTML found: </P> (use Markdown syntax instead)"
535        );
536    }
537
538    #[test]
539    fn test_md033_html_comments() {
540        let rule = MD033NoInlineHtml::default();
541        let content = "<!-- This is a comment --> <p>Not a comment</p>";
542        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
543        let result = rule.check(&ctx).unwrap();
544        // Should detect warnings for HTML tags (comments are skipped)
545        assert_eq!(result.len(), 2); // <p> and </p>
546        assert_eq!(
547            result[0].message,
548            "Inline HTML found: <p> (use Markdown syntax instead)"
549        );
550        assert_eq!(
551            result[1].message,
552            "Inline HTML found: </p> (use Markdown syntax instead)"
553        );
554    }
555
556    #[test]
557    fn test_md033_tags_in_links() {
558        let rule = MD033NoInlineHtml::default();
559        let content = "[Link](http://example.com/<div>)";
560        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
561        let result = rule.check(&ctx).unwrap();
562        // The <div> in the URL should be detected as HTML (not skipped)
563        assert_eq!(result.len(), 1);
564        assert_eq!(
565            result[0].message,
566            "Inline HTML found: <div> (use Markdown syntax instead)"
567        );
568
569        let content2 = "[Link <a>text</a>](url)";
570        let ctx2 = LintContext::new(content2, crate::config::MarkdownFlavor::Standard);
571        let result2 = rule.check(&ctx2).unwrap();
572        // Reports one warning per HTML tag (true markdownlint compatibility)
573        assert_eq!(result2.len(), 2); // <a> and </a>
574        assert_eq!(
575            result2[0].message,
576            "Inline HTML found: <a> (use Markdown syntax instead)"
577        );
578        assert_eq!(
579            result2[1].message,
580            "Inline HTML found: </a> (use Markdown syntax instead)"
581        );
582    }
583
584    #[test]
585    fn test_md033_fix_escaping() {
586        let rule = MD033NoInlineHtml::default();
587        let content = "Text with <div> and <br/> tags.";
588        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
589        let fixed_content = rule.fix(&ctx).unwrap();
590        // No fix for HTML tags; output should be unchanged
591        assert_eq!(fixed_content, content);
592    }
593
594    #[test]
595    fn test_md033_in_code_blocks() {
596        let rule = MD033NoInlineHtml::default();
597        let content = "```html\n<div>Code</div>\n```\n<div>Not code</div>";
598        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
599        let result = rule.check(&ctx).unwrap();
600        // Reports one warning per HTML tag (true markdownlint compatibility)
601        assert_eq!(result.len(), 2); // <div> and </div> outside code block
602        assert_eq!(
603            result[0].message,
604            "Inline HTML found: <div> (use Markdown syntax instead)"
605        );
606        assert_eq!(
607            result[1].message,
608            "Inline HTML found: </div> (use Markdown syntax instead)"
609        );
610    }
611
612    #[test]
613    fn test_md033_in_code_spans() {
614        let rule = MD033NoInlineHtml::default();
615        let content = "Text with `<p>in code</p>` span. <br/> Not in span.";
616        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
617        let result = rule.check(&ctx).unwrap();
618        // Should detect <br/> outside code span, but not tags inside code span
619        assert_eq!(result.len(), 1);
620        assert_eq!(
621            result[0].message,
622            "Inline HTML found: <br/> (use Markdown syntax instead)"
623        );
624    }
625
626    #[test]
627    fn test_md033_issue_90_code_span_with_diff_block() {
628        // Test for issue #90: inline code span followed by diff code block
629        let rule = MD033NoInlineHtml::default();
630        let content = r#"# Heading
631
632`<env>`
633
634```diff
635- this
636+ that
637```"#;
638        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
639        let result = rule.check(&ctx).unwrap();
640        // Should NOT detect <env> as HTML since it's inside backticks
641        assert_eq!(result.len(), 0, "Should not report HTML tags inside code spans");
642    }
643
644    #[test]
645    fn test_md033_multiple_code_spans_with_angle_brackets() {
646        // Test multiple code spans on same line
647        let rule = MD033NoInlineHtml::default();
648        let content = "`<one>` and `<two>` and `<three>` are all code spans";
649        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
650        let result = rule.check(&ctx).unwrap();
651        assert_eq!(result.len(), 0, "Should not report HTML tags inside any code spans");
652    }
653
654    #[test]
655    fn test_md033_nested_angle_brackets_in_code_span() {
656        // Test nested angle brackets
657        let rule = MD033NoInlineHtml::default();
658        let content = "Text with `<<nested>>` brackets";
659        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
660        let result = rule.check(&ctx).unwrap();
661        assert_eq!(result.len(), 0, "Should handle nested angle brackets in code spans");
662    }
663
664    #[test]
665    fn test_md033_code_span_at_end_before_code_block() {
666        // Test code span at end of line before code block
667        let rule = MD033NoInlineHtml::default();
668        let content = "Testing `<test>`\n```\ncode here\n```";
669        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
670        let result = rule.check(&ctx).unwrap();
671        assert_eq!(result.len(), 0, "Should handle code span before code block");
672    }
673}