mdbook_lint_core/rules/standard/
md033.rs

1//! MD033: Inline HTML should be avoided
2//!
3//! This rule checks for inline HTML elements in markdown content, which should
4//! generally be avoided in favor of pure Markdown syntax.
5
6use crate::error::Result;
7use crate::rule::{AstRule, RuleCategory, RuleMetadata};
8use crate::{
9    Document,
10    violation::{Severity, Violation},
11};
12
/// Lint rule MD033 (`no-inline-html`): reports inline HTML elements, including
/// HTML comments, found in markdown content.
pub struct MD033;
15
16impl AstRule for MD033 {
17    fn id(&self) -> &'static str {
18        "MD033"
19    }
20
21    fn name(&self) -> &'static str {
22        "no-inline-html"
23    }
24
25    fn description(&self) -> &'static str {
26        "Inline HTML should be avoided"
27    }
28
29    fn metadata(&self) -> RuleMetadata {
30        RuleMetadata::stable(RuleCategory::Content).introduced_in("mdbook-lint v0.1.0")
31    }
32
33    fn check_ast<'a>(
34        &self,
35        document: &Document,
36        _ast: &'a comrak::nodes::AstNode<'a>,
37    ) -> Result<Vec<Violation>> {
38        let mut violations = Vec::new();
39        let lines = &document.lines;
40
41        let mut in_code_block = false;
42
43        for (line_idx, line) in lines.iter().enumerate() {
44            let line_num = line_idx + 1;
45
46            // Track fenced code blocks to ignore HTML inside them
47            if line.trim_start().starts_with("```") || line.trim_start().starts_with("~~~") {
48                in_code_block = !in_code_block;
49                continue;
50            }
51
52            // Skip lines inside code blocks
53            if in_code_block {
54                continue;
55            }
56
57            // Simple HTML detection without regex
58            violations.extend(self.check_line_for_html(line, line_num));
59        }
60
61        Ok(violations)
62    }
63}
64
65impl MD033 {
66    /// Check a single line for HTML tags and comments
67    fn check_line_for_html(&self, line: &str, line_num: usize) -> Vec<Violation> {
68        let mut violations = Vec::new();
69        let mut chars = line.char_indices().peekable();
70        let mut in_backticks = false;
71
72        while let Some((i, ch)) = chars.next() {
73            match ch {
74                '`' => {
75                    in_backticks = !in_backticks;
76                }
77                '<' if !in_backticks => {
78                    // Look ahead to see if this looks like an HTML tag or comment
79                    let remaining = &line[i..];
80
81                    if remaining.starts_with("<!--") {
82                        // HTML comment
83                        if let Some(end) = remaining.find("-->") {
84                            let comment = &remaining[..end + 3];
85                            violations.push(self.create_violation(
86                                format!("Inline HTML element found: {comment}"),
87                                line_num,
88                                i + 1,
89                                Severity::Warning,
90                            ));
91                            // Skip past the comment
92                            for _ in 0..end + 2 {
93                                chars.next();
94                            }
95                        }
96                    } else if let Some(tag_end) = remaining.find('>') {
97                        let potential_tag = &remaining[..tag_end + 1];
98                        if self.is_html_tag(potential_tag) {
99                            violations.push(self.create_violation(
100                                format!("Inline HTML element found: {potential_tag}"),
101                                line_num,
102                                i + 1,
103                                Severity::Warning,
104                            ));
105                            // Skip past the tag
106                            for _ in 0..tag_end {
107                                chars.next();
108                            }
109                        }
110                    }
111                }
112                _ => {}
113            }
114        }
115
116        violations
117    }
118
119    /// Simple check if a string looks like an HTML tag
120    fn is_html_tag(&self, s: &str) -> bool {
121        if !s.starts_with('<') || !s.ends_with('>') {
122            return false;
123        }
124
125        let content = &s[1..s.len() - 1];
126        if content.is_empty() {
127            return false;
128        }
129
130        // Handle closing tags
131        let tag_name = if let Some(stripped) = content.strip_prefix('/') {
132            stripped
133        } else {
134            content
135        }
136        .split_whitespace()
137        .next()
138        .unwrap_or("");
139
140        // List of common HTML tags
141        let html_tags = [
142            "a",
143            "abbr",
144            "b",
145            "br",
146            "cite",
147            "code",
148            "em",
149            "i",
150            "img",
151            "kbd",
152            "mark",
153            "q",
154            "s",
155            "samp",
156            "small",
157            "span",
158            "strong",
159            "sub",
160            "sup",
161            "time",
162            "u",
163            "var",
164            "wbr",
165            "h1",
166            "h2",
167            "h3",
168            "h4",
169            "h5",
170            "h6",
171            "p",
172            "div",
173            "section",
174            "article",
175            "header",
176            "footer",
177            "nav",
178            "aside",
179            "main",
180            "figure",
181            "figcaption",
182            "blockquote",
183            "pre",
184            "ul",
185            "ol",
186            "li",
187            "dl",
188            "dt",
189            "dd",
190            "table",
191            "thead",
192            "tbody",
193            "tfoot",
194            "tr",
195            "th",
196            "td",
197            "form",
198            "input",
199            "button",
200            "select",
201            "option",
202            "textarea",
203            "label",
204            "fieldset",
205            "legend",
206        ];
207
208        html_tags.contains(&tag_name.to_lowercase().as_str())
209    }
210}
211
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Document;
    use crate::rule::Rule;
    use std::path::PathBuf;

    /// Builds a document named `test.md` from `content` and runs MD033 on it.
    fn lint(content: &str) -> Vec<Violation> {
        let document = Document::new(content.to_string(), PathBuf::from("test.md")).unwrap();
        MD033.check(&document).unwrap()
    }

    /// Asserts that exactly `expected.len()` violations were produced and that
    /// the n-th violation message contains the n-th expected fragment.
    fn assert_reported(violations: &[Violation], expected: &[&str]) {
        assert_eq!(violations.len(), expected.len());
        for (violation, fragment) in violations.iter().zip(expected) {
            assert!(violation.message.contains(*fragment));
        }
    }

    /// Pure markdown, including HTML inside a fenced block, is not flagged.
    #[test]
    fn test_md033_no_violations() {
        let violations = lint(
            r#"# Valid Markdown

This document contains only valid Markdown:

**Bold text** and *italic text*.

`code spans` are fine.

```html
<p>HTML in code blocks is fine</p>
<div class="example">
    <span>This is ignored</span>
</div>
```

[Links](https://example.com) are good.

> Blockquotes are fine

- List items
- More items

## Another heading

Regular paragraphs without HTML.
"#,
        );

        assert_reported(&violations, &[]);
    }

    /// Every opening and closing tag in running text is reported, in order.
    #[test]
    fn test_md033_html_violations() {
        let violations = lint(
            r#"# Document with HTML

This paragraph has <strong>inline HTML</strong>.

<p>This is a paragraph tag.</p>

Some text with <em>emphasis</em> and <code>code</code> tags.

<div class="container">
Block level HTML
</div>

More content with <span class="highlight">spans</span>.
"#,
        );

        assert_reported(
            &violations,
            &[
                "<strong>",
                "</strong>",
                "<p>",
                "</p>",
                "<em>",
                "</em>",
                "<code>",
                "</code>",
                "<div",
                "</div>",
                "<span",
                "</span>",
            ],
        );
    }

    /// HTML comments count as inline HTML.
    #[test]
    fn test_md033_html_comments() {
        let violations = lint(
            r#"# Document with HTML Comments

This has <!-- a comment --> in it.

Regular text here.

<!-- Another comment -->
"#,
        );

        assert_reported(
            &violations,
            &["<!-- a comment -->", "<!-- Another comment -->"],
        );
    }

    /// Backtick and tilde fenced blocks hide their content from the rule.
    #[test]
    fn test_md033_code_blocks_ignored() {
        let violations = lint(
            r#"# Code Blocks Should Be Ignored

```html
<div class="example">
    <p>This HTML should be ignored</p>
    <span>Even this</span>
</div>
```

But this <strong>should be detected</strong>.

```javascript
const html = '<div>This is in JS code</div>';
```

And this <em>should also be detected</em>.

~~~html
<article>
    <header>More HTML to ignore</header>
</article>
~~~
"#,
        );

        assert_reported(&violations, &["<strong>", "</strong>", "<em>", "</em>"]);
    }

    /// Tags inside inline code spans are ignored; tags outside are not.
    #[test]
    fn test_md033_inline_code_ignored() {
        let violations = lint(
            r#"# Inline Code Should Be Ignored

This `<span>HTML in backticks</span>` should be ignored.

But this <div>should be detected</div>.

Use `<strong>` tags for bold text, but don't use <strong>actual tags</strong>.

Multiple `<code>` spans with `<em>emphasis</em>` should be ignored.
"#,
        );

        assert_reported(&violations, &["<div>", "</div>", "<strong>", "</strong>"]);
    }

    /// Fenced blocks, inline code and real HTML mixed in one document.
    #[test]
    fn test_md033_mixed_content() {
        let violations = lint(
            r#"# Mixed Content

Regular text with <b>bold HTML</b> tag.

```html
<p>This should be ignored</p>
```

Back to regular content with <i>italic</i>.

The `<em>` tag is mentioned in code, but <em>this usage</em> is flagged.

More `<span class="test">code examples</span>` that should be ignored.

Final <strong>HTML usage</strong> to detect.
"#,
        );

        assert_reported(
            &violations,
            &[
                "<b>",
                "</b>",
                "<i>",
                "</i>",
                "<em>",
                "</em>",
                "<strong>",
                "</strong>",
            ],
        );
    }
}