rumdl_lib/rules/
md033_no_inline_html.rs

1//!
2//! Rule MD033: No HTML tags
3//!
4//! See [docs/md033.md](../../docs/md033.md) for full documentation, configuration, and examples.
5
6use crate::rule::{LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
7use crate::utils::kramdown_utils::{is_kramdown_block_attribute, is_kramdown_extension};
8use crate::utils::range_utils::calculate_html_tag_range;
9use crate::utils::regex_cache::*;
10use lazy_static::lazy_static;
11use regex::Regex;
12use std::collections::HashSet;
13
14mod md033_config;
15use md033_config::MD033Config;
16
17lazy_static! {
18    // HTML/Markdown comment pattern (specific to MD033).
    // Matches from `<!--` to the nearest `-->` (lazy quantifier). `.` does not match a
    // newline by default in the `regex` crate, so a match never crosses a line break.
    // NOTE(review): not referenced anywhere in the visible part of this file — confirm it
    // is still needed.
19    static ref HTML_COMMENT_PATTERN: Regex = Regex::new(r"<!--.*?-->").unwrap();
20}
21
/// Rule MD033: reports HTML tags found in Markdown content.
22#[derive(Clone)]
23pub struct MD033NoInlineHtml {
    // Rule configuration; serialized by `default_config_section`.
24    config: MD033Config,
    // Lookup set produced by `MD033Config::allowed_set()`; consulted by `is_tag_allowed`.
25    allowed: HashSet<String>,
26}
27
28impl Default for MD033NoInlineHtml {
29    fn default() -> Self {
30        let config = MD033Config::default();
31        let allowed = config.allowed_set();
32        Self { config, allowed }
33    }
34}
35
36impl MD033NoInlineHtml {
37    pub fn new() -> Self {
38        Self::default()
39    }
40
41    pub fn with_allowed(allowed_vec: Vec<String>) -> Self {
42        let config = MD033Config {
43            allowed: allowed_vec.clone(),
44        };
45        let allowed = config.allowed_set();
46        Self { config, allowed }
47    }
48
49    pub fn from_config_struct(config: MD033Config) -> Self {
50        let allowed = config.allowed_set();
51        Self { config, allowed }
52    }
53
54    // Efficient check for allowed tags using HashSet (case-insensitive)
55    #[inline]
56    fn is_tag_allowed(&self, tag: &str) -> bool {
57        if self.allowed.is_empty() {
58            return false;
59        }
60        // Remove angle brackets and slashes, then split by whitespace or '>'
61        let tag = tag.trim_start_matches('<').trim_start_matches('/');
62        let tag_name = tag
63            .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
64            .next()
65            .unwrap_or("");
66        self.allowed.contains(&tag_name.to_lowercase())
67    }
68
69    // Check if a tag is an HTML comment
70    #[inline]
71    fn is_html_comment(&self, tag: &str) -> bool {
72        tag.starts_with("<!--") && tag.ends_with("-->")
73    }
74
75    // Check if a tag is likely a programming type annotation rather than HTML
76    #[inline]
77    fn is_likely_type_annotation(&self, tag: &str) -> bool {
78        // Common programming type names that are often used in generics
79        const COMMON_TYPES: &[&str] = &[
80            "string",
81            "number",
82            "any",
83            "void",
84            "null",
85            "undefined",
86            "array",
87            "promise",
88            "function",
89            "error",
90            "date",
91            "regexp",
92            "symbol",
93            "bigint",
94            "map",
95            "set",
96            "weakmap",
97            "weakset",
98            "iterator",
99            "generator",
100            "t",
101            "u",
102            "v",
103            "k",
104            "e", // Common single-letter type parameters
105            "userdata",
106            "apiresponse",
107            "config",
108            "options",
109            "params",
110            "result",
111            "response",
112            "request",
113            "data",
114            "item",
115            "element",
116            "node",
117        ];
118
119        let tag_content = tag
120            .trim_start_matches('<')
121            .trim_end_matches('>')
122            .trim_start_matches('/');
123        let tag_name = tag_content
124            .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
125            .next()
126            .unwrap_or("");
127
128        // Check if it's a simple tag (no attributes) with a common type name
129        if !tag_content.contains(' ') && !tag_content.contains('=') {
130            COMMON_TYPES.contains(&tag_name.to_ascii_lowercase().as_str())
131        } else {
132            false
133        }
134    }
135
136    // Check if a tag is actually an email address in angle brackets
137    #[inline]
138    fn is_email_address(&self, tag: &str) -> bool {
139        let content = tag.trim_start_matches('<').trim_end_matches('>');
140        // Simple email pattern: contains @ and has reasonable structure
141        content.contains('@')
142            && content.chars().all(|c| c.is_alphanumeric() || "@.-_+".contains(c))
143            && content.split('@').count() == 2
144            && content.split('@').all(|part| !part.is_empty())
145    }
146
147    // Check if a tag has the markdown attribute (MkDocs/Material for MkDocs)
148    #[inline]
149    fn has_markdown_attribute(&self, tag: &str) -> bool {
150        // Check for various forms of markdown attribute
151        // Examples: <div markdown>, <div markdown="1">, <div class="result" markdown>
152        tag.contains(" markdown>") || tag.contains(" markdown=") || tag.contains(" markdown ")
153    }
154
155    // Check if a tag is actually a URL in angle brackets
156    #[inline]
157    fn is_url_in_angle_brackets(&self, tag: &str) -> bool {
158        let content = tag.trim_start_matches('<').trim_end_matches('>');
159        // Check for common URL schemes
160        content.starts_with("http://")
161            || content.starts_with("https://")
162            || content.starts_with("ftp://")
163            || content.starts_with("ftps://")
164            || content.starts_with("mailto:")
165    }
166
167    /// Find HTML tags that span multiple lines
168    fn find_multiline_html_tags(
169        &self,
170        ctx: &crate::lint_context::LintContext,
171        content: &str,
172        nomarkdown_ranges: &[(usize, usize)],
173        warnings: &mut Vec<LintWarning>,
174    ) {
175        // Early return: if content has no incomplete tags at line ends, skip processing
176        if !content.contains('<') || !content.lines().any(|line| line.trim_end().ends_with('<')) {
177            return;
178        }
179
180        // Simple approach: use regex to find patterns like <tagname and then look for closing >
181        lazy_static::lazy_static! {
182            static ref INCOMPLETE_TAG_START: regex::Regex = regex::Regex::new(r"(?i)<[a-zA-Z][^>]*$").unwrap();
183        }
184
185        let lines: Vec<&str> = content.lines().collect();
186
187        for (i, line) in lines.iter().enumerate() {
188            let line_num = i + 1;
189
190            // Skip code blocks and empty lines
191            if line.trim().is_empty() || ctx.is_in_code_block(line_num) {
192                continue;
193            }
194
195            // Skip lines inside nomarkdown blocks
196            if nomarkdown_ranges
197                .iter()
198                .any(|(start, end)| line_num >= *start && line_num <= *end)
199            {
200                continue;
201            }
202
203            // Early return: skip lines that don't end with incomplete tags
204            if !line.contains('<') {
205                continue;
206            }
207
208            // Look for incomplete HTML tags at the end of the line
209            if let Some(incomplete_match) = INCOMPLETE_TAG_START.find(line) {
210                let start_column = incomplete_match.start() + 1; // 1-indexed
211
212                // Build the complete tag by looking at subsequent lines
213                let mut complete_tag = incomplete_match.as_str().to_string();
214                let mut found_end = false;
215
216                // Look for the closing > in subsequent lines (limit search to 10 lines)
217                for (j, next_line) in lines.iter().enumerate().skip(i + 1).take(10) {
218                    let next_line_num = j + 1;
219
220                    // Stop if we hit a code block
221                    if ctx.is_in_code_block(next_line_num) {
222                        break;
223                    }
224
225                    complete_tag.push(' '); // Add space to normalize whitespace
226                    complete_tag.push_str(next_line.trim());
227
228                    if next_line.contains('>') {
229                        found_end = true;
230                        break;
231                    }
232                }
233
234                if found_end {
235                    // Extract just the tag part (up to the first >)
236                    if let Some(end_pos) = complete_tag.find('>') {
237                        let final_tag = &complete_tag[0..=end_pos];
238
239                        // Apply the same filters as single-line tags
240                        let skip_mkdocs_markdown = ctx.flavor == crate::config::MarkdownFlavor::MkDocs
241                            && self.has_markdown_attribute(final_tag);
242
243                        if !self.is_html_comment(final_tag)
244                            && !self.is_likely_type_annotation(final_tag)
245                            && !self.is_email_address(final_tag)
246                            && !self.is_url_in_angle_brackets(final_tag)
247                            && !self.is_tag_allowed(final_tag)
248                            && !skip_mkdocs_markdown
249                            && HTML_TAG_FINDER.is_match(final_tag)
250                        {
251                            // Check for duplicates (avoid flagging the same position twice)
252                            let already_warned =
253                                warnings.iter().any(|w| w.line == line_num && w.column == start_column);
254
255                            if !already_warned {
256                                let (start_line, start_col, end_line, end_col) = calculate_html_tag_range(
257                                    line_num,
258                                    line,
259                                    incomplete_match.start(),
260                                    incomplete_match.len(),
261                                );
262                                warnings.push(LintWarning {
263                                    rule_name: Some(self.name()),
264                                    line: start_line,
265                                    column: start_col,
266                                    end_line,
267                                    end_column: end_col,
268                                    message: format!("HTML tag found: {final_tag} (use Markdown syntax instead)"),
269                                    severity: Severity::Warning,
270                                    fix: None,
271                                });
272                            }
273                        }
274                    }
275                }
276            }
277        }
278    }
279}
280
281impl Rule for MD033NoInlineHtml {
282    fn name(&self) -> &'static str {
283        "MD033"
284    }
285
286    fn description(&self) -> &'static str {
287        "Inline HTML is not allowed"
288    }
289
290    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
291        let content = ctx.content;
292
293        // Early return: if no HTML tags at all, skip processing
294        if content.is_empty() || !has_html_tags(content) {
295            return Ok(Vec::new());
296        }
297
298        // Quick check for HTML tag pattern before expensive processing
299        if !HTML_TAG_QUICK_CHECK.is_match(content) {
300            return Ok(Vec::new());
301        }
302
303        let mut warnings = Vec::new();
304        let lines: Vec<&str> = content.lines().collect();
305
306        // Track nomarkdown and comment blocks
307        let mut in_nomarkdown = false;
308        let mut in_comment = false;
309        let mut nomarkdown_ranges: Vec<(usize, usize)> = Vec::new();
310        let mut nomarkdown_start = 0;
311        let mut comment_start = 0;
312
313        // First pass: identify nomarkdown and comment blocks
314        for (i, line) in lines.iter().enumerate() {
315            let line_num = i + 1;
316
317            // Check for nomarkdown start
318            if line.trim() == "{::nomarkdown}" {
319                in_nomarkdown = true;
320                nomarkdown_start = line_num;
321            } else if line.trim() == "{:/nomarkdown}" && in_nomarkdown {
322                in_nomarkdown = false;
323                nomarkdown_ranges.push((nomarkdown_start, line_num));
324            }
325
326            // Check for comment blocks
327            if line.trim() == "{::comment}" {
328                in_comment = true;
329                comment_start = line_num;
330            } else if line.trim() == "{:/comment}" && in_comment {
331                in_comment = false;
332                nomarkdown_ranges.push((comment_start, line_num));
333            }
334        }
335
336        // Second pass: find single-line HTML tags
337        // To match markdownlint behavior, report one warning per HTML tag
338        for (i, line) in lines.iter().enumerate() {
339            let line_num = i + 1;
340
341            if line.trim().is_empty() {
342                continue;
343            }
344            if ctx.is_in_code_block(line_num) {
345                continue;
346            }
347            // Skip lines that are indented code blocks (4+ spaces or tab) per CommonMark spec
348            // Even if they're not in the structure's code blocks (e.g., HTML blocks)
349            if line.starts_with("    ") || line.starts_with('\t') {
350                continue;
351            }
352
353            // Skip lines inside nomarkdown blocks
354            if nomarkdown_ranges
355                .iter()
356                .any(|(start, end)| line_num >= *start && line_num <= *end)
357            {
358                continue;
359            }
360
361            // Skip Kramdown extensions and block attributes
362            if is_kramdown_extension(line) || is_kramdown_block_attribute(line) {
363                continue;
364            }
365
366            // Find all HTML tags in the line using regex
367            for tag_match in HTML_TAG_FINDER.find_iter(line) {
368                let tag = tag_match.as_str();
369
370                // Skip HTML comments
371                if self.is_html_comment(tag) {
372                    continue;
373                }
374
375                // Skip likely programming type annotations
376                if self.is_likely_type_annotation(tag) {
377                    continue;
378                }
379
380                // Skip email addresses in angle brackets
381                if self.is_email_address(tag) {
382                    continue;
383                }
384
385                // Skip URLs in angle brackets
386                if self.is_url_in_angle_brackets(tag) {
387                    continue;
388                }
389
390                // Skip tags inside code spans
391                let tag_start_col = tag_match.start() + 1; // 1-indexed
392                if ctx.is_in_code_span(line_num, tag_start_col) {
393                    continue;
394                }
395
396                // Skip allowed tags
397                if self.is_tag_allowed(tag) {
398                    continue;
399                }
400
401                // Skip tags with markdown attribute in MkDocs mode
402                if ctx.flavor == crate::config::MarkdownFlavor::MkDocs && self.has_markdown_attribute(tag) {
403                    continue;
404                }
405
406                // Report each HTML tag individually (true markdownlint compatibility)
407                let (start_line, start_col, end_line, end_col) =
408                    calculate_html_tag_range(line_num, line, tag_match.start(), tag_match.len());
409                warnings.push(LintWarning {
410                    rule_name: Some(self.name()),
411                    line: start_line,
412                    column: start_col,
413                    end_line,
414                    end_column: end_col,
415                    message: format!("Inline HTML found: {tag} (use Markdown syntax instead)"),
416                    severity: Severity::Warning,
417                    fix: None,
418                });
419            }
420        }
421
422        // Third pass: find multi-line HTML tags
423        self.find_multiline_html_tags(ctx, ctx.content, &nomarkdown_ranges, &mut warnings);
424
425        Ok(warnings)
426    }
427
428    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
429        // No fix for MD033: do not remove or alter HTML, just return the input unchanged
430        Ok(ctx.content.to_string())
431    }
432
433    fn fix_capability(&self) -> crate::rule::FixCapability {
434        crate::rule::FixCapability::Unfixable
435    }
436
437    /// Get the category of this rule for selective processing
438    fn category(&self) -> RuleCategory {
439        RuleCategory::Html
440    }
441
442    /// Check if this rule should be skipped
443    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
444        let content = ctx.content;
445        content.is_empty() || !has_html_tags(content)
446    }
447
448    fn as_any(&self) -> &dyn std::any::Any {
449        self
450    }
451
452    fn default_config_section(&self) -> Option<(String, toml::Value)> {
453        let json_value = serde_json::to_value(&self.config).ok()?;
454        Some((
455            self.name().to_string(),
456            crate::rule_config_serde::json_to_toml_value(&json_value)?,
457        ))
458    }
459
460    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
461    where
462        Self: Sized,
463    {
464        let rule_config = crate::rule_config_serde::load_rule_config::<MD033Config>(config);
465        Box::new(Self::from_config_struct(rule_config))
466    }
467}
468
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lint_context::LintContext;
    use crate::rule::Rule;

    /// Runs `rule` over `content` (standard flavor) and returns the warning messages in order.
    fn messages_for(rule: &MD033NoInlineHtml, content: &str) -> Vec<String> {
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        rule.check(&ctx)
            .unwrap()
            .into_iter()
            .map(|warning| warning.message)
            .collect()
    }

    /// Expected messages for the given tags, one warning per tag.
    fn inline_html_messages(tags: &[&str]) -> Vec<String> {
        tags.iter()
            .map(|tag| format!("Inline HTML found: {tag} (use Markdown syntax instead)"))
            .collect()
    }

    #[test]
    fn test_md033_basic_html() {
        let messages = messages_for(&MD033NoInlineHtml::default(), "<div>Some content</div>");
        // Reports one warning per HTML tag (true markdownlint compatibility)
        assert_eq!(messages.len(), 2); // <div> and </div>
        assert!(messages[0].starts_with("Inline HTML found: <div>"));
        assert!(messages[1].starts_with("Inline HTML found: </div>"));
    }

    #[test]
    fn test_md033_case_insensitive() {
        let messages = messages_for(&MD033NoInlineHtml::default(), "<DiV>Some <B>content</B></dIv>");
        // One warning per tag, with the original casing kept in the message
        assert_eq!(messages, inline_html_messages(&["<DiV>", "<B>", "</B>", "</dIv>"]));
    }

    #[test]
    fn test_md033_allowed_tags() {
        let rule = MD033NoInlineHtml::with_allowed(vec!["div".to_string(), "br".to_string()]);

        // Only the non-allowed tags are reported (div and br are allowed)
        let lowercase = messages_for(&rule, "<div>Allowed</div><p>Not allowed</p><br/>");
        assert_eq!(lowercase, inline_html_messages(&["<p>", "</p>"]));

        // Allowed tags are matched regardless of case
        let uppercase = messages_for(&rule, "<DIV>Allowed</DIV><P>Not allowed</P><BR/>");
        assert_eq!(uppercase, inline_html_messages(&["<P>", "</P>"]));
    }

    #[test]
    fn test_md033_html_comments() {
        let messages = messages_for(
            &MD033NoInlineHtml::default(),
            "<!-- This is a comment --> <p>Not a comment</p>",
        );
        // The comment is skipped; the paragraph tags are reported
        assert_eq!(messages, inline_html_messages(&["<p>", "</p>"]));
    }

    #[test]
    fn test_md033_tags_in_links() {
        let rule = MD033NoInlineHtml::default();

        // The <div> in the URL should be detected as HTML (not skipped)
        let in_url = messages_for(&rule, "[Link](http://example.com/<div>)");
        assert_eq!(in_url, inline_html_messages(&["<div>"]));

        // Tags inside the link text are reported one by one
        let in_text = messages_for(&rule, "[Link <a>text</a>](url)");
        assert_eq!(in_text, inline_html_messages(&["<a>", "</a>"]));
    }

    #[test]
    fn test_md033_fix_escaping() {
        let rule = MD033NoInlineHtml::default();
        let content = "Text with <div> and <br/> tags.";
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        // No fix for HTML tags; output should be unchanged
        assert_eq!(rule.fix(&ctx).unwrap(), content);
    }

    #[test]
    fn test_md033_in_code_blocks() {
        let messages = messages_for(
            &MD033NoInlineHtml::default(),
            "```html\n<div>Code</div>\n```\n<div>Not code</div>",
        );
        // Only the tags outside the fenced block are reported
        assert_eq!(messages, inline_html_messages(&["<div>", "</div>"]));
    }

    #[test]
    fn test_md033_in_code_spans() {
        let messages = messages_for(
            &MD033NoInlineHtml::default(),
            "Text with `<p>in code</p>` span. <br/> Not in span.",
        );
        // Should detect <br/> outside code span, but not tags inside code span
        assert_eq!(messages, inline_html_messages(&["<br/>"]));
    }
}
630        );
631    }
632}