Skip to main content

rumdl_lib/utils/
mkdocs_html_markdown.rs

//! Detection of MkDocs HTML blocks that carry the `markdown` attribute
//!
//! Detects HTML elements (primarily divs) with the `markdown` attribute,
//! which tells MkDocs/Python-Markdown to process the content as Markdown.
//!
//! Common patterns:
//! - `<div class="grid cards" markdown>` - Grid cards
//! - `<div markdown="1">` - Explicit markdown processing
//! - `<div markdown="block">` - Block-level markdown
10
11use regex::Regex;
12use std::sync::LazyLock;
13
14/// Pattern to detect HTML opening tags with markdown attribute.
15/// Handles:
16/// - `<div markdown>` or `<div markdown="1">` or `<div markdown="block">`
17/// - Attribute can appear anywhere in the tag
18/// - Case-insensitive tag names (HTML is case-insensitive)
19/// - Various attribute value formats
20static MARKDOWN_HTML_OPEN: LazyLock<Regex> = LazyLock::new(|| {
21    Regex::new(
22        r#"(?i)^(\s*)<(div|section|article|aside|details|figure|footer|header|main|nav)\b[^>]*\bmarkdown\b[^>]*>"#,
23    )
24    .unwrap()
25});
26
27/// Check if a line starts a markdown-enabled HTML block
28fn is_markdown_html_start(line: &str) -> bool {
29    MARKDOWN_HTML_OPEN.is_match(line)
30}
31
32/// Get the tag name from a markdown HTML opening line
33fn get_tag_name(line: &str) -> Option<String> {
34    MARKDOWN_HTML_OPEN
35        .captures(line)
36        .map(|caps| caps.get(2).map(|m| m.as_str().to_lowercase()).unwrap_or_default())
37}
38
/// Line-by-line parsing state for markdown-enabled HTML blocks.
///
/// The default value represents "outside of any block": an empty stack and a
/// depth of zero.
#[derive(Debug, Default)]
pub struct MarkdownHtmlTracker {
    /// Currently open markdown HTML blocks, innermost last. Each entry holds the
    /// tag name and the value `depth` had right after that tag was opened.
    tag_stack: Vec<(String, usize)>,
    /// Current nesting depth.
    depth: usize,
}
47
48impl MarkdownHtmlTracker {
49    pub fn new() -> Self {
50        Self::default()
51    }
52
53    /// Process a line and return whether the line is inside a markdown HTML block.
54    /// Returns true if:
55    /// - This line opens a new markdown HTML block
56    /// - This line is part of an existing markdown HTML block (even if it closes it)
57    pub fn process_line(&mut self, line: &str) -> bool {
58        let trimmed = line.trim();
59
60        // Check for opening tag
61        if is_markdown_html_start(line) {
62            if let Some(tag) = get_tag_name(line) {
63                self.depth += 1;
64                self.tag_stack.push((tag.clone(), self.depth));
65
66                // Check if this line also closes the tag (self-contained)
67                if self.count_closes(line, &tag) > 0 {
68                    self.depth -= 1;
69                    self.tag_stack.pop();
70                }
71            }
72            return true;
73        }
74
75        // If we're inside a markdown HTML block at the start of this line
76        if !self.tag_stack.is_empty() {
77            // Count opening and closing tags for our tracked tags
78            for (tag, _) in self.tag_stack.clone() {
79                let opens = self.count_opens(trimmed, &tag);
80                let closes = self.count_closes(trimmed, &tag);
81
82                self.depth += opens;
83
84                for _ in 0..closes {
85                    if self.depth > 0 {
86                        self.depth -= 1;
87                    }
88                }
89            }
90
91            // Clean up stack when depth reaches initial level
92            while let Some((_, start_depth)) = self.tag_stack.last() {
93                if self.depth < *start_depth {
94                    self.tag_stack.pop();
95                } else {
96                    break;
97                }
98            }
99
100            // Return true because this line was inside the block at the start
101            // (even if it also closes the block)
102            return true;
103        }
104
105        false
106    }
107
108    /// Count opening tags of a specific type in a line (case-insensitive)
109    fn count_opens(&self, line: &str, tag: &str) -> usize {
110        let line_lower = line.to_lowercase();
111        let open_pattern = format!("<{}", tag.to_lowercase());
112        let mut count = 0;
113        let mut search_start = 0;
114
115        while let Some(pos) = line_lower[search_start..].find(&open_pattern) {
116            let abs_pos = search_start + pos;
117            let after_tag = abs_pos + open_pattern.len();
118
119            // Verify it's a tag boundary (followed by whitespace, >, or /)
120            if after_tag >= line_lower.len()
121                || line_lower[after_tag..].starts_with(|c: char| c.is_whitespace() || c == '>' || c == '/')
122            {
123                count += 1;
124            }
125            search_start = after_tag;
126        }
127        count
128    }
129
130    /// Count closing tags of a specific type in a line (case-insensitive)
131    fn count_closes(&self, line: &str, tag: &str) -> usize {
132        let line_lower = line.to_lowercase();
133        let close_pattern = format!("</{}", tag.to_lowercase());
134        let mut count = 0;
135        let mut search_start = 0;
136
137        while let Some(pos) = line_lower[search_start..].find(&close_pattern) {
138            let abs_pos = search_start + pos;
139            let after_tag = abs_pos + close_pattern.len();
140
141            // Find the closing > (may have whitespace before it)
142            if let Some(rest) = line_lower.get(after_tag..)
143                && rest.trim_start().starts_with('>')
144            {
145                count += 1;
146            }
147            search_start = after_tag;
148        }
149        count
150    }
151
152    /// Check if currently inside a markdown HTML block
153    pub fn is_inside(&self) -> bool {
154        !self.tag_stack.is_empty()
155    }
156
157    /// Reset the tracker state
158    pub fn reset(&mut self) {
159        self.tag_stack.clear();
160        self.depth = 0;
161    }
162}
163
#[cfg(test)]
mod tests {
    use super::*;

    /// The opening-tag pattern accepts the documented variants and rejects
    /// tags without the attribute or outside the allowed tag list.
    #[test]
    fn test_markdown_html_detection() {
        let should_match = [
            // Basic patterns
            "<div markdown>",
            "<div class=\"grid cards\" markdown>",
            "<div markdown=\"1\">",
            "<div markdown=\"block\">",
            // Attribute order variations
            "<div markdown class=\"test\">",
            "<div id=\"foo\" markdown>",
            // Case insensitivity
            "<DIV markdown>",
            "<Div Markdown>",
            // With indentation
            "  <div markdown>",
            "    <div class=\"grid\" markdown>",
            // Other valid HTML5 elements
            "<section markdown>",
            "<article markdown>",
            "<details markdown>",
        ];
        for line in should_match {
            assert!(is_markdown_html_start(line), "expected a match for {line:?}");
        }

        let should_not_match = [
            "<div class=\"test\">",
            "<span markdown>", // span not in allowed list
            "text with markdown word",
            "<div>markdown</div>",
        ];
        for line in should_not_match {
            assert!(!is_markdown_html_start(line), "expected no match for {line:?}");
        }
    }

    /// Open a block, feed content, close it, and check lines after it.
    #[test]
    fn test_tracker_basic() {
        let mut tracker = MarkdownHtmlTracker::new();
        assert!(!tracker.is_inside());

        assert!(tracker.process_line("<div class=\"grid cards\" markdown>"));
        assert!(tracker.is_inside());

        for content in ["-   Content here", "    ---"] {
            assert!(tracker.process_line(content));
            assert!(tracker.is_inside());
        }

        // The closing line still belongs to the block, but ends it.
        assert!(tracker.process_line("</div>"));
        assert!(!tracker.is_inside());

        // Lines after the block are outside again.
        assert!(!tracker.process_line("plain paragraph"));
        assert!(!tracker.is_inside());
    }

    /// A balanced plain tag on one line must not close the outer block.
    #[test]
    fn test_tracker_nested() {
        let mut tracker = MarkdownHtmlTracker::new();

        tracker.process_line("<div markdown>");
        assert!(tracker.is_inside());

        tracker.process_line("<div>nested</div>");
        assert!(tracker.is_inside());

        tracker.process_line("</div>");
        assert!(!tracker.is_inside());
    }

    /// Every line of a typical MkDocs grid-cards block is reported as inside.
    #[test]
    fn test_grid_cards_pattern() {
        let content = r#"<div class="grid cards" markdown>

-   :zap:{ .lg .middle } **Built for speed**

    ---

    Written in Rust.

</div>"#;

        let mut tracker = MarkdownHtmlTracker::new();
        let mut inside_lines = Vec::new();

        for (index, line) in content.lines().enumerate() {
            if tracker.process_line(line) {
                inside_lines.push(index);
            }
        }

        // Every line is reported as inside: the opening tag, the content, the
        // blank lines in between, and the closing `</div>` (it was inside the
        // block when the line started).
        let every_line: Vec<usize> = (0..content.lines().count()).collect();
        assert_eq!(inside_lines, every_line);

        // Only after the closing line has been processed is the block left.
        assert!(!tracker.is_inside());
    }

    /// A block opened and closed on one line counts as inside but leaves no state.
    #[test]
    fn test_same_line_open_close() {
        let mut tracker = MarkdownHtmlTracker::new();

        let result = tracker.process_line("<div markdown>content</div>");
        assert!(result); // The line itself is part of the block
        assert!(!tracker.is_inside()); // But after processing, we're outside
    }

    /// Two consecutive blocks with different tags are tracked independently.
    #[test]
    fn test_multiple_sequential_blocks() {
        let mut tracker = MarkdownHtmlTracker::new();

        let blocks = [
            ("<div markdown>", "Content 1", "</div>"),
            ("<section markdown>", "Content 2", "</section>"),
        ];
        for (open, content, close) in blocks {
            assert!(tracker.process_line(open));
            assert!(tracker.is_inside());
            assert!(tracker.process_line(content));
            assert!(tracker.process_line(close));
            assert!(!tracker.is_inside());
        }
    }

    /// A plain inner tag with the same name spans several lines.
    #[test]
    fn test_deeply_nested_same_tag() {
        let mut tracker = MarkdownHtmlTracker::new();

        assert!(tracker.process_line("<div markdown>"));
        assert!(tracker.is_inside());

        // Nested div (without markdown attr)
        assert!(tracker.process_line("<div class=\"inner\">"));
        assert!(tracker.is_inside());

        // Close inner div
        assert!(tracker.process_line("</div>"));
        assert!(tracker.is_inside()); // Still inside outer div

        // Close outer div
        assert!(tracker.process_line("</div>"));
        assert!(!tracker.is_inside());
    }

    /// Inner tags with a different name are ignored by the tracker.
    #[test]
    fn test_deeply_nested_different_tags() {
        let mut tracker = MarkdownHtmlTracker::new();

        assert!(tracker.process_line("<article markdown>"));
        assert!(tracker.is_inside());

        // Inner section (without markdown)
        assert!(tracker.process_line("<section>"));
        assert!(tracker.is_inside());

        // Close section - tracker only tracks article
        assert!(tracker.process_line("</section>"));
        assert!(tracker.is_inside());

        // Close article
        assert!(tracker.process_line("</article>"));
        assert!(!tracker.is_inside());
    }

    /// Inner and outer closing tags may share a line.
    #[test]
    fn test_multiple_closes_same_line() {
        let mut tracker = MarkdownHtmlTracker::new();

        assert!(tracker.process_line("<div markdown>"));
        assert!(tracker.process_line("<div>inner</div></div>"));
        assert!(!tracker.is_inside());
    }

    /// `count_opens` respects tag-name boundaries.
    #[test]
    fn test_count_opens_boundary_check() {
        let tracker = MarkdownHtmlTracker::new();

        let cases = [
            // Should match
            ("<div>", 1),
            ("<div class='x'>", 1),
            ("<DIV>", 1),
            ("<div/><div>", 2),
            // Should NOT match (divider is not div)
            ("<divider>", 0),
            ("<dividend>", 0),
        ];
        for (line, expected) in cases {
            assert_eq!(tracker.count_opens(line, "div"), expected, "line: {line:?}");
        }
    }

    /// `count_closes` tolerates case differences and whitespace before `>`.
    #[test]
    fn test_count_closes_variations() {
        let tracker = MarkdownHtmlTracker::new();

        let cases = [
            ("</div>", 1),
            ("</DIV>", 1),
            ("</div >", 1),
            ("</div  >", 1),
            ("</div></div>", 2),
            ("text</div>more</div>end", 2),
        ];
        for (line, expected) in cases {
            assert_eq!(tracker.count_closes(line, "div"), expected, "line: {line:?}");
        }
    }

    /// `reset` drops all open blocks and leaves the tracker reusable.
    #[test]
    fn test_reset() {
        let mut tracker = MarkdownHtmlTracker::new();

        tracker.process_line("<div markdown>");
        assert!(tracker.is_inside());

        tracker.reset();
        assert!(!tracker.is_inside());
        assert!(!tracker.process_line("no block is open any more"));

        // Should work fresh after reset
        tracker.process_line("<section markdown>");
        assert!(tracker.is_inside());
    }
}
383}