rumdl_lib/rules/
md033_no_inline_html.rs

1//!
2//! Rule MD033: No HTML tags
3//!
4//! See [docs/md033.md](../../docs/md033.md) for full documentation, configuration, and examples.
5
6use crate::rule::{LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
7use crate::utils::document_structure::{DocumentStructure, DocumentStructureExtensions};
8use crate::utils::kramdown_utils::{is_kramdown_block_attribute, is_kramdown_extension};
9use crate::utils::range_utils::calculate_html_tag_range;
10use crate::utils::regex_cache::*;
11use lazy_static::lazy_static;
12use regex::Regex;
13use std::collections::HashSet;
14
15mod md033_config;
16use md033_config::MD033Config;
17
lazy_static! {
    // HTML/Markdown comment pattern (specific to MD033).
    // Matches `<!--`, then as little text as possible, then `-->`. In the `regex` crate
    // `.` does not match a newline by default, so a match never spans a line break.
    // NOTE(review): not referenced in the visible part of this file - confirm it is still used.
    static ref HTML_COMMENT_PATTERN: Regex = Regex::new(r"<!--.*?-->").unwrap();
}
22
/// Rule MD033: reports HTML tags found in Markdown content.
///
/// Tags whose names are in the allow-list are not reported.
#[derive(Clone)]
pub struct MD033NoInlineHtml {
    /// Rule configuration; serialized by `default_config_section`.
    config: MD033Config,
    /// Lookup set built from `config` via `allowed_set()` and queried by `is_tag_allowed`.
    /// Lookups use lowercased tag names, so entries are presumably lowercase — verify in
    /// `MD033Config::allowed_set`.
    allowed: HashSet<String>,
}
28
29impl Default for MD033NoInlineHtml {
30    fn default() -> Self {
31        let config = MD033Config::default();
32        let allowed = config.allowed_set();
33        Self { config, allowed }
34    }
35}
36
37impl MD033NoInlineHtml {
38    pub fn new() -> Self {
39        Self::default()
40    }
41
42    pub fn with_allowed(allowed_vec: Vec<String>) -> Self {
43        let config = MD033Config {
44            allowed: allowed_vec.clone(),
45        };
46        let allowed = config.allowed_set();
47        Self { config, allowed }
48    }
49
50    pub fn from_config_struct(config: MD033Config) -> Self {
51        let allowed = config.allowed_set();
52        Self { config, allowed }
53    }
54
55    // Efficient check for allowed tags using HashSet (case-insensitive)
56    #[inline]
57    fn is_tag_allowed(&self, tag: &str) -> bool {
58        if self.allowed.is_empty() {
59            return false;
60        }
61        // Remove angle brackets and slashes, then split by whitespace or '>'
62        let tag = tag.trim_start_matches('<').trim_start_matches('/');
63        let tag_name = tag
64            .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
65            .next()
66            .unwrap_or("");
67        self.allowed.contains(&tag_name.to_lowercase())
68    }
69
70    // Check if a tag is an HTML comment
71    #[inline]
72    fn is_html_comment(&self, tag: &str) -> bool {
73        tag.starts_with("<!--") && tag.ends_with("-->")
74    }
75
76    // Check if a tag is likely a programming type annotation rather than HTML
77    #[inline]
78    fn is_likely_type_annotation(&self, tag: &str) -> bool {
79        // Common programming type names that are often used in generics
80        const COMMON_TYPES: &[&str] = &[
81            "string",
82            "number",
83            "any",
84            "void",
85            "null",
86            "undefined",
87            "array",
88            "promise",
89            "function",
90            "error",
91            "date",
92            "regexp",
93            "symbol",
94            "bigint",
95            "map",
96            "set",
97            "weakmap",
98            "weakset",
99            "iterator",
100            "generator",
101            "t",
102            "u",
103            "v",
104            "k",
105            "e", // Common single-letter type parameters
106            "userdata",
107            "apiresponse",
108            "config",
109            "options",
110            "params",
111            "result",
112            "response",
113            "request",
114            "data",
115            "item",
116            "element",
117            "node",
118        ];
119
120        let tag_content = tag
121            .trim_start_matches('<')
122            .trim_end_matches('>')
123            .trim_start_matches('/');
124        let tag_name = tag_content
125            .split(|c: char| c.is_whitespace() || c == '>' || c == '/')
126            .next()
127            .unwrap_or("");
128
129        // Check if it's a simple tag (no attributes) with a common type name
130        if !tag_content.contains(' ') && !tag_content.contains('=') {
131            COMMON_TYPES.contains(&tag_name.to_ascii_lowercase().as_str())
132        } else {
133            false
134        }
135    }
136
137    // Check if a tag is actually an email address in angle brackets
138    #[inline]
139    fn is_email_address(&self, tag: &str) -> bool {
140        let content = tag.trim_start_matches('<').trim_end_matches('>');
141        // Simple email pattern: contains @ and has reasonable structure
142        content.contains('@')
143            && content.chars().all(|c| c.is_alphanumeric() || "@.-_+".contains(c))
144            && content.split('@').count() == 2
145            && content.split('@').all(|part| !part.is_empty())
146    }
147
148    // Check if a tag is actually a URL in angle brackets
149    #[inline]
150    fn is_url_in_angle_brackets(&self, tag: &str) -> bool {
151        let content = tag.trim_start_matches('<').trim_end_matches('>');
152        // Check for common URL schemes
153        content.starts_with("http://")
154            || content.starts_with("https://")
155            || content.starts_with("ftp://")
156            || content.starts_with("ftps://")
157            || content.starts_with("mailto:")
158    }
159
160    /// Find HTML tags that span multiple lines
161    fn find_multiline_html_tags(
162        &self,
163        content: &str,
164        structure: &DocumentStructure,
165        nomarkdown_ranges: &[(usize, usize)],
166        warnings: &mut Vec<LintWarning>,
167    ) {
168        // Early return: if content has no incomplete tags at line ends, skip processing
169        if !content.contains('<') || !content.lines().any(|line| line.trim_end().ends_with('<')) {
170            return;
171        }
172
173        // Simple approach: use regex to find patterns like <tagname and then look for closing >
174        lazy_static::lazy_static! {
175            static ref INCOMPLETE_TAG_START: regex::Regex = regex::Regex::new(r"(?i)<[a-zA-Z][^>]*$").unwrap();
176        }
177
178        let lines: Vec<&str> = content.lines().collect();
179
180        for (i, line) in lines.iter().enumerate() {
181            let line_num = i + 1;
182
183            // Skip code blocks and empty lines
184            if line.trim().is_empty() || structure.is_in_code_block(line_num) {
185                continue;
186            }
187
188            // Skip lines inside nomarkdown blocks
189            if nomarkdown_ranges
190                .iter()
191                .any(|(start, end)| line_num >= *start && line_num <= *end)
192            {
193                continue;
194            }
195
196            // Early return: skip lines that don't end with incomplete tags
197            if !line.contains('<') {
198                continue;
199            }
200
201            // Look for incomplete HTML tags at the end of the line
202            if let Some(incomplete_match) = INCOMPLETE_TAG_START.find(line) {
203                let start_column = incomplete_match.start() + 1; // 1-indexed
204
205                // Build the complete tag by looking at subsequent lines
206                let mut complete_tag = incomplete_match.as_str().to_string();
207                let mut found_end = false;
208
209                // Look for the closing > in subsequent lines (limit search to 10 lines)
210                for (j, next_line) in lines.iter().enumerate().skip(i + 1).take(10) {
211                    let next_line_num = j + 1;
212
213                    // Stop if we hit a code block
214                    if structure.is_in_code_block(next_line_num) {
215                        break;
216                    }
217
218                    complete_tag.push(' '); // Add space to normalize whitespace
219                    complete_tag.push_str(next_line.trim());
220
221                    if next_line.contains('>') {
222                        found_end = true;
223                        break;
224                    }
225                }
226
227                if found_end {
228                    // Extract just the tag part (up to the first >)
229                    if let Some(end_pos) = complete_tag.find('>') {
230                        let final_tag = &complete_tag[0..=end_pos];
231
232                        // Apply the same filters as single-line tags
233                        if !self.is_html_comment(final_tag)
234                            && !self.is_likely_type_annotation(final_tag)
235                            && !self.is_email_address(final_tag)
236                            && !self.is_url_in_angle_brackets(final_tag)
237                            && !self.is_tag_allowed(final_tag)
238                            && HTML_TAG_FINDER.is_match(final_tag)
239                        {
240                            // Check for duplicates (avoid flagging the same position twice)
241                            let already_warned =
242                                warnings.iter().any(|w| w.line == line_num && w.column == start_column);
243
244                            if !already_warned {
245                                let (start_line, start_col, end_line, end_col) = calculate_html_tag_range(
246                                    line_num,
247                                    line,
248                                    incomplete_match.start(),
249                                    incomplete_match.len(),
250                                );
251                                warnings.push(LintWarning {
252                                    rule_name: Some(self.name()),
253                                    line: start_line,
254                                    column: start_col,
255                                    end_line,
256                                    end_column: end_col,
257                                    message: format!("HTML tag found: {final_tag} (use Markdown syntax instead)"),
258                                    severity: Severity::Warning,
259                                    fix: None,
260                                });
261                            }
262                        }
263                    }
264                }
265            }
266        }
267    }
268}
269
270impl Rule for MD033NoInlineHtml {
271    fn name(&self) -> &'static str {
272        "MD033"
273    }
274
275    fn description(&self) -> &'static str {
276        "Inline HTML is not allowed"
277    }
278
279    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
280        let content = ctx.content;
281        let structure = DocumentStructure::new(content);
282        self.check_with_structure(ctx, &structure)
283    }
284
285    /// Optimized check using document structure
286    fn check_with_structure(
287        &self,
288        ctx: &crate::lint_context::LintContext,
289        structure: &DocumentStructure,
290    ) -> LintResult {
291        let content = ctx.content;
292
293        // Early return: if no HTML tags at all, skip processing
294        if content.is_empty() || !has_html_tags(content) {
295            return Ok(Vec::new());
296        }
297
298        // Quick check for HTML tag pattern before expensive processing
299        if !HTML_TAG_QUICK_CHECK.is_match(content) {
300            return Ok(Vec::new());
301        }
302
303        let mut warnings = Vec::new();
304        let lines: Vec<&str> = content.lines().collect();
305
306        // Track nomarkdown and comment blocks
307        let mut in_nomarkdown = false;
308        let mut in_comment = false;
309        let mut nomarkdown_ranges: Vec<(usize, usize)> = Vec::new();
310        let mut nomarkdown_start = 0;
311        let mut comment_start = 0;
312
313        // First pass: identify nomarkdown and comment blocks
314        for (i, line) in lines.iter().enumerate() {
315            let line_num = i + 1;
316
317            // Check for nomarkdown start
318            if line.trim() == "{::nomarkdown}" {
319                in_nomarkdown = true;
320                nomarkdown_start = line_num;
321            } else if line.trim() == "{:/nomarkdown}" && in_nomarkdown {
322                in_nomarkdown = false;
323                nomarkdown_ranges.push((nomarkdown_start, line_num));
324            }
325
326            // Check for comment blocks
327            if line.trim() == "{::comment}" {
328                in_comment = true;
329                comment_start = line_num;
330            } else if line.trim() == "{:/comment}" && in_comment {
331                in_comment = false;
332                nomarkdown_ranges.push((comment_start, line_num));
333            }
334        }
335
336        // Second pass: find single-line HTML tags
337        // To match markdownlint behavior, report one warning per HTML tag
338        for (i, line) in lines.iter().enumerate() {
339            let line_num = i + 1;
340
341            if line.trim().is_empty() {
342                continue;
343            }
344            if structure.is_in_code_block(line_num) {
345                continue;
346            }
347            // Skip lines that are indented code blocks (4+ spaces or tab) per CommonMark spec
348            // Even if they're not in the structure's code blocks (e.g., HTML blocks)
349            if line.starts_with("    ") || line.starts_with('\t') {
350                continue;
351            }
352
353            // Skip lines inside nomarkdown blocks
354            if nomarkdown_ranges
355                .iter()
356                .any(|(start, end)| line_num >= *start && line_num <= *end)
357            {
358                continue;
359            }
360
361            // Skip Kramdown extensions and block attributes
362            if is_kramdown_extension(line) || is_kramdown_block_attribute(line) {
363                continue;
364            }
365
366            // Find all HTML tags in the line using regex
367            for tag_match in HTML_TAG_FINDER.find_iter(line) {
368                let tag = tag_match.as_str();
369
370                // Skip HTML comments
371                if self.is_html_comment(tag) {
372                    continue;
373                }
374
375                // Skip likely programming type annotations
376                if self.is_likely_type_annotation(tag) {
377                    continue;
378                }
379
380                // Skip email addresses in angle brackets
381                if self.is_email_address(tag) {
382                    continue;
383                }
384
385                // Skip URLs in angle brackets
386                if self.is_url_in_angle_brackets(tag) {
387                    continue;
388                }
389
390                // Skip tags inside code spans
391                let tag_start_col = tag_match.start() + 1; // 1-indexed
392                if structure.is_in_code_span(line_num, tag_start_col) {
393                    continue;
394                }
395
396                // Skip allowed tags
397                if self.is_tag_allowed(tag) {
398                    continue;
399                }
400
401                // Report each HTML tag individually (true markdownlint compatibility)
402                let (start_line, start_col, end_line, end_col) =
403                    calculate_html_tag_range(line_num, line, tag_match.start(), tag_match.len());
404                warnings.push(LintWarning {
405                    rule_name: Some(self.name()),
406                    line: start_line,
407                    column: start_col,
408                    end_line,
409                    end_column: end_col,
410                    message: format!("Inline HTML found: {tag} (use Markdown syntax instead)"),
411                    severity: Severity::Warning,
412                    fix: None,
413                });
414            }
415        }
416
417        // Third pass: find multi-line HTML tags
418        self.find_multiline_html_tags(ctx.content, structure, &nomarkdown_ranges, &mut warnings);
419
420        Ok(warnings)
421    }
422
423    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
424        // No fix for MD033: do not remove or alter HTML, just return the input unchanged
425        Ok(ctx.content.to_string())
426    }
427
428    fn fix_capability(&self) -> crate::rule::FixCapability {
429        crate::rule::FixCapability::Unfixable
430    }
431
    /// Get the category of this rule for selective processing.
    ///
    /// Always `RuleCategory::Html`.
    fn category(&self) -> RuleCategory {
        RuleCategory::Html
    }
436
437    /// Check if this rule should be skipped
438    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
439        let content = ctx.content;
440        content.is_empty() || !has_html_tags(content)
441    }
442
443    fn as_any(&self) -> &dyn std::any::Any {
444        self
445    }
446
447    fn as_maybe_document_structure(&self) -> Option<&dyn crate::rule::MaybeDocumentStructure> {
448        Some(self)
449    }
450
451    fn default_config_section(&self) -> Option<(String, toml::Value)> {
452        let json_value = serde_json::to_value(&self.config).ok()?;
453        Some((
454            self.name().to_string(),
455            crate::rule_config_serde::json_to_toml_value(&json_value)?,
456        ))
457    }
458
459    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
460    where
461        Self: Sized,
462    {
463        let rule_config = crate::rule_config_serde::load_rule_config::<MD033Config>(config);
464        Box::new(Self::from_config_struct(rule_config))
465    }
466}
467
468impl DocumentStructureExtensions for MD033NoInlineHtml {
469    fn has_relevant_elements(
470        &self,
471        ctx: &crate::lint_context::LintContext,
472        _doc_structure: &DocumentStructure,
473    ) -> bool {
474        // Rule is only relevant if content contains potential HTML tags
475        ctx.content.contains('<') && ctx.content.contains('>')
476    }
477}
478
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lint_context::LintContext;
    use crate::rule::Rule;

    /// Lint `content` with `rule` under the standard flavor and return the warning
    /// messages in report order.
    fn messages(rule: &MD033NoInlineHtml, content: &str) -> Vec<String> {
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        rule.check(&ctx)
            .unwrap()
            .into_iter()
            .map(|warning| warning.message)
            .collect()
    }

    /// Full message expected for a single-line `tag`.
    fn inline(tag: &str) -> String {
        format!("Inline HTML found: {tag} (use Markdown syntax instead)")
    }

    #[test]
    fn test_md033_basic_html() {
        let found = messages(&MD033NoInlineHtml::default(), "<div>Some content</div>");
        // One warning per HTML tag (markdownlint compatibility): <div> and </div>
        assert_eq!(found.len(), 2);
        assert!(found[0].starts_with("Inline HTML found: <div>"));
        assert!(found[1].starts_with("Inline HTML found: </div>"));
    }

    #[test]
    fn test_md033_case_insensitive() {
        let found = messages(&MD033NoInlineHtml::default(), "<DiV>Some <B>content</B></dIv>");
        // Every tag is reported with its original casing.
        assert_eq!(
            found,
            vec![inline("<DiV>"), inline("<B>"), inline("</B>"), inline("</dIv>")]
        );
    }

    #[test]
    fn test_md033_allowed_tags() {
        let rule = MD033NoInlineHtml::with_allowed(vec!["div".to_string(), "br".to_string()]);

        // div and br are allowed, so only <p> and </p> are reported.
        assert_eq!(
            messages(&rule, "<div>Allowed</div><p>Not allowed</p><br/>"),
            vec![inline("<p>"), inline("</p>")]
        );

        // The allow-list is case-insensitive.
        assert_eq!(
            messages(&rule, "<DIV>Allowed</DIV><P>Not allowed</P><BR/>"),
            vec![inline("<P>"), inline("</P>")]
        );
    }

    #[test]
    fn test_md033_html_comments() {
        let found = messages(
            &MD033NoInlineHtml::default(),
            "<!-- This is a comment --> <p>Not a comment</p>",
        );
        // The comment is skipped; the paragraph tags are not.
        assert_eq!(found, vec![inline("<p>"), inline("</p>")]);
    }

    #[test]
    fn test_md033_tags_in_links() {
        let rule = MD033NoInlineHtml::default();

        // A <div> inside a link URL is still detected as HTML.
        assert_eq!(
            messages(&rule, "[Link](http://example.com/<div>)"),
            vec![inline("<div>")]
        );

        // Tags inside link text are reported one by one.
        assert_eq!(
            messages(&rule, "[Link <a>text</a>](url)"),
            vec![inline("<a>"), inline("</a>")]
        );
    }

    #[test]
    fn test_md033_fix_escaping() {
        let rule = MD033NoInlineHtml::default();
        let content = "Text with <div> and <br/> tags.";
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        // There is no fix for HTML tags; the output must equal the input.
        assert_eq!(rule.fix(&ctx).unwrap(), content);
    }

    #[test]
    fn test_md033_in_code_blocks() {
        let found = messages(
            &MD033NoInlineHtml::default(),
            "```html\n<div>Code</div>\n```\n<div>Not code</div>",
        );
        // Only the tags outside the fenced block are reported.
        assert_eq!(found, vec![inline("<div>"), inline("</div>")]);
    }

    #[test]
    fn test_md033_in_code_spans() {
        let found = messages(
            &MD033NoInlineHtml::default(),
            "Text with `<p>in code</p>` span. <br/> Not in span.",
        );
        // Tags inside the code span are ignored; <br/> outside it is reported.
        assert_eq!(found, vec![inline("<br/>")]);
    }
}