rumdl_lib/rules/
md033_no_inline_html.rs

1//!
2//! Rule MD033: No HTML tags
3//!
4//! See [docs/md033.md](../../docs/md033.md) for full documentation, configuration, and examples.
5
6use crate::rule::{LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
7use crate::utils::document_structure::{DocumentStructure, DocumentStructureExtensions};
8use crate::utils::kramdown_utils::{is_kramdown_block_attribute, is_kramdown_extension};
9use crate::utils::range_utils::calculate_html_tag_range;
10use crate::utils::regex_cache::*;
11use lazy_static::lazy_static;
12use regex::Regex;
13use std::collections::HashSet;
14
15mod md033_config;
16use md033_config::MD033Config;
17
lazy_static! {
    // HTML/Markdown comment pattern (specific to MD033).
    // Matches `<!--` up to the first following `-->` (non-greedy). `.` does not
    // match a newline in this pattern, so only comments that open and close
    // within the same line of the searched text are matched.
    static ref HTML_COMMENT_PATTERN: Regex = Regex::new(r"<!--.*?-->").unwrap();
}
22
/// Rule MD033: reports HTML tags found in Markdown content.
///
/// Tags whose name is present in the allow-list are not reported.
#[derive(Clone)]
pub struct MD033NoInlineHtml {
    /// The rule configuration this instance was built from.
    config: MD033Config,
    /// Allow-list lookup set, derived from `config` via `MD033Config::allowed_set()`.
    allowed: HashSet<String>,
}
28
29impl Default for MD033NoInlineHtml {
30    fn default() -> Self {
31        let config = MD033Config::default();
32        let allowed = config.allowed_set();
33        Self { config, allowed }
34    }
35}
36
37impl MD033NoInlineHtml {
    /// Creates the rule with its default configuration (delegates to `Default`).
    pub fn new() -> Self {
        Self::default()
    }
41
42    pub fn with_allowed(allowed_vec: Vec<String>) -> Self {
43        let config = MD033Config {
44            allowed: allowed_vec.clone(),
45        };
46        let allowed = config.allowed_set();
47        Self { config, allowed }
48    }
49
50    pub fn from_config_struct(config: MD033Config) -> Self {
51        let allowed = config.allowed_set();
52        Self { config, allowed }
53    }
54
55    // Efficient check for allowed tags using HashSet (case-insensitive)
56    #[inline]
57    fn is_tag_allowed(&self, tag: &str) -> bool {
58        if self.allowed.is_empty() {
59            return false;
60        }
61        // Remove angle brackets and slashes, then split by whitespace or '>'
62        let tag = tag.trim_start_matches('<').trim_start_matches('/');
63        let tag_name = tag
64            .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
65            .next()
66            .unwrap_or("");
67        self.allowed.contains(&tag_name.to_lowercase())
68    }
69
70    // Check if a tag is an HTML comment
71    #[inline]
72    fn is_html_comment(&self, tag: &str) -> bool {
73        tag.starts_with("<!--") && tag.ends_with("-->")
74    }
75
76    // Check if a tag is likely a programming type annotation rather than HTML
77    #[inline]
78    fn is_likely_type_annotation(&self, tag: &str) -> bool {
79        // Common programming type names that are often used in generics
80        const COMMON_TYPES: &[&str] = &[
81            "string",
82            "number",
83            "any",
84            "void",
85            "null",
86            "undefined",
87            "array",
88            "promise",
89            "function",
90            "error",
91            "date",
92            "regexp",
93            "symbol",
94            "bigint",
95            "map",
96            "set",
97            "weakmap",
98            "weakset",
99            "iterator",
100            "generator",
101            "t",
102            "u",
103            "v",
104            "k",
105            "e", // Common single-letter type parameters
106            "userdata",
107            "apiresponse",
108            "config",
109            "options",
110            "params",
111            "result",
112            "response",
113            "request",
114            "data",
115            "item",
116            "element",
117            "node",
118        ];
119
120        let tag_content = tag
121            .trim_start_matches('<')
122            .trim_end_matches('>')
123            .trim_start_matches('/');
124        let tag_name = tag_content
125            .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
126            .next()
127            .unwrap_or("");
128
129        // Check if it's a simple tag (no attributes) with a common type name
130        if !tag_content.contains(' ') && !tag_content.contains('=') {
131            COMMON_TYPES.contains(&tag_name.to_ascii_lowercase().as_str())
132        } else {
133            false
134        }
135    }
136
137    // Check if a tag is actually an email address in angle brackets
138    #[inline]
139    fn is_email_address(&self, tag: &str) -> bool {
140        let content = tag.trim_start_matches('<').trim_end_matches('>');
141        // Simple email pattern: contains @ and has reasonable structure
142        content.contains('@')
143            && content.chars().all(|c| c.is_alphanumeric() || "@.-_+".contains(c))
144            && content.split('@').count() == 2
145            && content.split('@').all(|part| !part.is_empty())
146    }
147
148    // Check if a tag has the markdown attribute (MkDocs/Material for MkDocs)
149    #[inline]
150    fn has_markdown_attribute(&self, tag: &str) -> bool {
151        // Check for various forms of markdown attribute
152        // Examples: <div markdown>, <div markdown="1">, <div class="result" markdown>
153        tag.contains(" markdown>") || tag.contains(" markdown=") || tag.contains(" markdown ")
154    }
155
156    // Check if a tag is actually a URL in angle brackets
157    #[inline]
158    fn is_url_in_angle_brackets(&self, tag: &str) -> bool {
159        let content = tag.trim_start_matches('<').trim_end_matches('>');
160        // Check for common URL schemes
161        content.starts_with("http://")
162            || content.starts_with("https://")
163            || content.starts_with("ftp://")
164            || content.starts_with("ftps://")
165            || content.starts_with("mailto:")
166    }
167
168    /// Find HTML tags that span multiple lines
169    fn find_multiline_html_tags(
170        &self,
171        ctx: &crate::lint_context::LintContext,
172        content: &str,
173        structure: &DocumentStructure,
174        nomarkdown_ranges: &[(usize, usize)],
175        warnings: &mut Vec<LintWarning>,
176    ) {
177        // Early return: if content has no incomplete tags at line ends, skip processing
178        if !content.contains('<') || !content.lines().any(|line| line.trim_end().ends_with('<')) {
179            return;
180        }
181
182        // Simple approach: use regex to find patterns like <tagname and then look for closing >
183        lazy_static::lazy_static! {
184            static ref INCOMPLETE_TAG_START: regex::Regex = regex::Regex::new(r"(?i)<[a-zA-Z][^>]*$").unwrap();
185        }
186
187        let lines: Vec<&str> = content.lines().collect();
188
189        for (i, line) in lines.iter().enumerate() {
190            let line_num = i + 1;
191
192            // Skip code blocks and empty lines
193            if line.trim().is_empty() || structure.is_in_code_block(line_num) {
194                continue;
195            }
196
197            // Skip lines inside nomarkdown blocks
198            if nomarkdown_ranges
199                .iter()
200                .any(|(start, end)| line_num >= *start && line_num <= *end)
201            {
202                continue;
203            }
204
205            // Early return: skip lines that don't end with incomplete tags
206            if !line.contains('<') {
207                continue;
208            }
209
210            // Look for incomplete HTML tags at the end of the line
211            if let Some(incomplete_match) = INCOMPLETE_TAG_START.find(line) {
212                let start_column = incomplete_match.start() + 1; // 1-indexed
213
214                // Build the complete tag by looking at subsequent lines
215                let mut complete_tag = incomplete_match.as_str().to_string();
216                let mut found_end = false;
217
218                // Look for the closing > in subsequent lines (limit search to 10 lines)
219                for (j, next_line) in lines.iter().enumerate().skip(i + 1).take(10) {
220                    let next_line_num = j + 1;
221
222                    // Stop if we hit a code block
223                    if structure.is_in_code_block(next_line_num) {
224                        break;
225                    }
226
227                    complete_tag.push(' '); // Add space to normalize whitespace
228                    complete_tag.push_str(next_line.trim());
229
230                    if next_line.contains('>') {
231                        found_end = true;
232                        break;
233                    }
234                }
235
236                if found_end {
237                    // Extract just the tag part (up to the first >)
238                    if let Some(end_pos) = complete_tag.find('>') {
239                        let final_tag = &complete_tag[0..=end_pos];
240
241                        // Apply the same filters as single-line tags
242                        let skip_mkdocs_markdown = ctx.flavor == crate::config::MarkdownFlavor::MkDocs
243                            && self.has_markdown_attribute(final_tag);
244
245                        if !self.is_html_comment(final_tag)
246                            && !self.is_likely_type_annotation(final_tag)
247                            && !self.is_email_address(final_tag)
248                            && !self.is_url_in_angle_brackets(final_tag)
249                            && !self.is_tag_allowed(final_tag)
250                            && !skip_mkdocs_markdown
251                            && HTML_TAG_FINDER.is_match(final_tag)
252                        {
253                            // Check for duplicates (avoid flagging the same position twice)
254                            let already_warned =
255                                warnings.iter().any(|w| w.line == line_num && w.column == start_column);
256
257                            if !already_warned {
258                                let (start_line, start_col, end_line, end_col) = calculate_html_tag_range(
259                                    line_num,
260                                    line,
261                                    incomplete_match.start(),
262                                    incomplete_match.len(),
263                                );
264                                warnings.push(LintWarning {
265                                    rule_name: Some(self.name()),
266                                    line: start_line,
267                                    column: start_col,
268                                    end_line,
269                                    end_column: end_col,
270                                    message: format!("HTML tag found: {final_tag} (use Markdown syntax instead)"),
271                                    severity: Severity::Warning,
272                                    fix: None,
273                                });
274                            }
275                        }
276                    }
277                }
278            }
279        }
280    }
281}
282
283impl Rule for MD033NoInlineHtml {
    /// Returns the rule identifier, `"MD033"`.
    fn name(&self) -> &'static str {
        "MD033"
    }
287
    /// Returns the short human-readable description of the rule.
    fn description(&self) -> &'static str {
        "Inline HTML is not allowed"
    }
291
292    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
293        let content = ctx.content;
294        let structure = DocumentStructure::new(content);
295        self.check_with_structure(ctx, &structure)
296    }
297
298    /// Optimized check using document structure
299    fn check_with_structure(
300        &self,
301        ctx: &crate::lint_context::LintContext,
302        structure: &DocumentStructure,
303    ) -> LintResult {
304        let content = ctx.content;
305
306        // Early return: if no HTML tags at all, skip processing
307        if content.is_empty() || !has_html_tags(content) {
308            return Ok(Vec::new());
309        }
310
311        // Quick check for HTML tag pattern before expensive processing
312        if !HTML_TAG_QUICK_CHECK.is_match(content) {
313            return Ok(Vec::new());
314        }
315
316        let mut warnings = Vec::new();
317        let lines: Vec<&str> = content.lines().collect();
318
319        // Track nomarkdown and comment blocks
320        let mut in_nomarkdown = false;
321        let mut in_comment = false;
322        let mut nomarkdown_ranges: Vec<(usize, usize)> = Vec::new();
323        let mut nomarkdown_start = 0;
324        let mut comment_start = 0;
325
326        // First pass: identify nomarkdown and comment blocks
327        for (i, line) in lines.iter().enumerate() {
328            let line_num = i + 1;
329
330            // Check for nomarkdown start
331            if line.trim() == "{::nomarkdown}" {
332                in_nomarkdown = true;
333                nomarkdown_start = line_num;
334            } else if line.trim() == "{:/nomarkdown}" && in_nomarkdown {
335                in_nomarkdown = false;
336                nomarkdown_ranges.push((nomarkdown_start, line_num));
337            }
338
339            // Check for comment blocks
340            if line.trim() == "{::comment}" {
341                in_comment = true;
342                comment_start = line_num;
343            } else if line.trim() == "{:/comment}" && in_comment {
344                in_comment = false;
345                nomarkdown_ranges.push((comment_start, line_num));
346            }
347        }
348
349        // Second pass: find single-line HTML tags
350        // To match markdownlint behavior, report one warning per HTML tag
351        for (i, line) in lines.iter().enumerate() {
352            let line_num = i + 1;
353
354            if line.trim().is_empty() {
355                continue;
356            }
357            if structure.is_in_code_block(line_num) {
358                continue;
359            }
360            // Skip lines that are indented code blocks (4+ spaces or tab) per CommonMark spec
361            // Even if they're not in the structure's code blocks (e.g., HTML blocks)
362            if line.starts_with("    ") || line.starts_with('\t') {
363                continue;
364            }
365
366            // Skip lines inside nomarkdown blocks
367            if nomarkdown_ranges
368                .iter()
369                .any(|(start, end)| line_num >= *start && line_num <= *end)
370            {
371                continue;
372            }
373
374            // Skip Kramdown extensions and block attributes
375            if is_kramdown_extension(line) || is_kramdown_block_attribute(line) {
376                continue;
377            }
378
379            // Find all HTML tags in the line using regex
380            for tag_match in HTML_TAG_FINDER.find_iter(line) {
381                let tag = tag_match.as_str();
382
383                // Skip HTML comments
384                if self.is_html_comment(tag) {
385                    continue;
386                }
387
388                // Skip likely programming type annotations
389                if self.is_likely_type_annotation(tag) {
390                    continue;
391                }
392
393                // Skip email addresses in angle brackets
394                if self.is_email_address(tag) {
395                    continue;
396                }
397
398                // Skip URLs in angle brackets
399                if self.is_url_in_angle_brackets(tag) {
400                    continue;
401                }
402
403                // Skip tags inside code spans
404                let tag_start_col = tag_match.start() + 1; // 1-indexed
405                if structure.is_in_code_span(line_num, tag_start_col) {
406                    continue;
407                }
408
409                // Skip allowed tags
410                if self.is_tag_allowed(tag) {
411                    continue;
412                }
413
414                // Skip tags with markdown attribute in MkDocs mode
415                if ctx.flavor == crate::config::MarkdownFlavor::MkDocs && self.has_markdown_attribute(tag) {
416                    continue;
417                }
418
419                // Report each HTML tag individually (true markdownlint compatibility)
420                let (start_line, start_col, end_line, end_col) =
421                    calculate_html_tag_range(line_num, line, tag_match.start(), tag_match.len());
422                warnings.push(LintWarning {
423                    rule_name: Some(self.name()),
424                    line: start_line,
425                    column: start_col,
426                    end_line,
427                    end_column: end_col,
428                    message: format!("Inline HTML found: {tag} (use Markdown syntax instead)"),
429                    severity: Severity::Warning,
430                    fix: None,
431                });
432            }
433        }
434
435        // Third pass: find multi-line HTML tags
436        self.find_multiline_html_tags(ctx, ctx.content, structure, &nomarkdown_ranges, &mut warnings);
437
438        Ok(warnings)
439    }
440
441    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
442        // No fix for MD033: do not remove or alter HTML, just return the input unchanged
443        Ok(ctx.content.to_string())
444    }
445
    /// Declares the rule as unfixable.
    fn fix_capability(&self) -> crate::rule::FixCapability {
        crate::rule::FixCapability::Unfixable
    }
449
    /// Get the category of this rule for selective processing.
    /// MD033 belongs to the `Html` category.
    fn category(&self) -> RuleCategory {
        RuleCategory::Html
    }
454
455    /// Check if this rule should be skipped
456    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
457        let content = ctx.content;
458        content.is_empty() || !has_html_tags(content)
459    }
460
    /// Exposes the rule as `&dyn Any`.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
464
    /// Returns `Some(self)` as a `MaybeDocumentStructure` trait object.
    fn as_maybe_document_structure(&self) -> Option<&dyn crate::rule::MaybeDocumentStructure> {
        Some(self)
    }
468
469    fn default_config_section(&self) -> Option<(String, toml::Value)> {
470        let json_value = serde_json::to_value(&self.config).ok()?;
471        Some((
472            self.name().to_string(),
473            crate::rule_config_serde::json_to_toml_value(&json_value)?,
474        ))
475    }
476
477    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
478    where
479        Self: Sized,
480    {
481        let rule_config = crate::rule_config_serde::load_rule_config::<MD033Config>(config);
482        Box::new(Self::from_config_struct(rule_config))
483    }
484}
485
486impl DocumentStructureExtensions for MD033NoInlineHtml {
487    fn has_relevant_elements(
488        &self,
489        ctx: &crate::lint_context::LintContext,
490        _doc_structure: &DocumentStructure,
491    ) -> bool {
492        // Rule is only relevant if content contains potential HTML tags
493        ctx.content.contains('<') && ctx.content.contains('>')
494    }
495}
496
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lint_context::LintContext;
    use crate::rule::Rule;

    /// Lints `content` with the Standard flavor and returns the warning
    /// messages in the order they were reported.
    fn messages(rule: &MD033NoInlineHtml, content: &str) -> Vec<String> {
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        rule.check(&ctx)
            .unwrap()
            .iter()
            .map(|warning| warning.message.clone())
            .collect()
    }

    /// The exact message expected for a single-line `tag`.
    fn inline_html(tag: &str) -> String {
        format!("Inline HTML found: {tag} (use Markdown syntax instead)")
    }

    /// Asserts that linting `content` reports exactly `tags`, one warning per
    /// tag and in this order (true markdownlint compatibility).
    fn assert_flags(rule: &MD033NoInlineHtml, content: &str, tags: &[&str]) {
        let expected: Vec<String> = tags.iter().map(|tag| inline_html(tag)).collect();
        assert_eq!(messages(rule, content), expected);
    }

    #[test]
    fn test_md033_basic_html() {
        let rule = MD033NoInlineHtml::default();
        let reported = messages(&rule, "<div>Some content</div>");
        // Reports one warning per HTML tag: <div> and </div>
        assert_eq!(reported.len(), 2);
        assert!(reported[0].starts_with("Inline HTML found: <div>"));
        assert!(reported[1].starts_with("Inline HTML found: </div>"));
    }

    #[test]
    fn test_md033_case_insensitive() {
        let rule = MD033NoInlineHtml::default();
        assert_flags(
            &rule,
            "<DiV>Some <B>content</B></dIv>",
            &["<DiV>", "<B>", "</B>", "</dIv>"],
        );
    }

    #[test]
    fn test_md033_allowed_tags() {
        let rule = MD033NoInlineHtml::with_allowed(vec!["div".to_string(), "br".to_string()]);

        // Only the non-allowed tags are reported (div and br are allowed)
        assert_flags(&rule, "<div>Allowed</div><p>Not allowed</p><br/>", &["<p>", "</p>"]);

        // The allow-list is matched case-insensitively
        assert_flags(&rule, "<DIV>Allowed</DIV><P>Not allowed</P><BR/>", &["<P>", "</P>"]);
    }

    #[test]
    fn test_md033_html_comments() {
        let rule = MD033NoInlineHtml::default();
        // The comment is skipped; the surrounding tags are still reported
        assert_flags(
            &rule,
            "<!-- This is a comment --> <p>Not a comment</p>",
            &["<p>", "</p>"],
        );
    }

    #[test]
    fn test_md033_tags_in_links() {
        let rule = MD033NoInlineHtml::default();

        // The <div> in the URL should be detected as HTML (not skipped)
        assert_flags(&rule, "[Link](http://example.com/<div>)", &["<div>"]);

        // Tags inside the link text are reported one by one
        assert_flags(&rule, "[Link <a>text</a>](url)", &["<a>", "</a>"]);
    }

    #[test]
    fn test_md033_fix_escaping() {
        let rule = MD033NoInlineHtml::default();
        let content = "Text with <div> and <br/> tags.";
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        // No fix for HTML tags; output should be unchanged
        assert_eq!(rule.fix(&ctx).unwrap(), content);
    }

    #[test]
    fn test_md033_in_code_blocks() {
        let rule = MD033NoInlineHtml::default();
        // Only the tags outside the fenced code block are reported
        assert_flags(
            &rule,
            "```html\n<div>Code</div>\n```\n<div>Not code</div>",
            &["<div>", "</div>"],
        );
    }

    #[test]
    fn test_md033_in_code_spans() {
        let rule = MD033NoInlineHtml::default();
        // Should detect <br/> outside code span, but not tags inside code span
        assert_flags(
            &rule,
            "Text with `<p>in code</p>` span. <br/> Not in span.",
            &["<br/>"],
        );
    }
}