Skip to main content

rumdl_lib/utils/
mkdocs_html_markdown.rs

1//! MkDocs HTML with markdown attribute detection
2//!
3//! Detects HTML elements (primarily divs) with the `markdown` attribute,
4//! which tells MkDocs/Python-Markdown to process the content as Markdown.
5//!
6//! Common patterns:
7//! - `<div class="grid cards" markdown>` - Grid cards
8//! - `<div markdown="1">` - Explicit markdown processing
9//! - `<div markdown="block">` - Block-level markdown
10
11use regex::Regex;
12use std::sync::LazyLock;
13
14/// Pattern to detect HTML opening tags with markdown attribute.
15/// Handles:
16/// - `<div markdown>` or `<div markdown="1">` or `<div markdown="block">`
17/// - Attribute can appear anywhere in the tag
18/// - Case-insensitive tag names (HTML is case-insensitive)
19/// - Various attribute value formats
20static MARKDOWN_HTML_OPEN: LazyLock<Regex> = LazyLock::new(|| {
21    Regex::new(
22        r#"(?i)^(\s*)<(div|section|article|aside|details|figure|footer|header|main|nav)\b[^>]*\bmarkdown\b[^>]*>"#,
23    )
24    .unwrap()
25});
26
27/// Check if a line starts a markdown-enabled HTML block
28fn is_markdown_html_start(line: &str) -> bool {
29    MARKDOWN_HTML_OPEN.is_match(line)
30}
31
32/// Get the tag name from a markdown HTML opening line
33fn get_tag_name(line: &str) -> Option<String> {
34    MARKDOWN_HTML_OPEN
35        .captures(line)
36        .map(|caps| caps.get(2).map(|m| m.as_str().to_lowercase()).unwrap_or_default())
37}
38
/// Line-by-line state machine that remembers which markdown-enabled HTML
/// blocks are currently open.
///
/// The default value represents "outside of any block": an empty stack and
/// a depth of zero.
#[derive(Debug, Default)]
pub struct MarkdownHtmlTracker {
    /// Open markdown HTML blocks, innermost last. Each entry pairs the
    /// lowercased tag name with the nesting depth assigned to that block
    /// when it was opened.
    tag_stack: Vec<(String, usize)>,
    /// Running nesting depth across the tracked tags.
    depth: usize,
}
47
48impl MarkdownHtmlTracker {
49    pub fn new() -> Self {
50        Self::default()
51    }
52
53    /// Process a line and return whether the line is inside a markdown HTML block.
54    /// Returns true if:
55    /// - This line opens a new markdown HTML block
56    /// - This line is part of an existing markdown HTML block (even if it closes it)
57    pub fn process_line(&mut self, line: &str) -> bool {
58        let trimmed = line.trim();
59
60        // Check for opening tag
61        if is_markdown_html_start(line) {
62            if let Some(tag) = get_tag_name(line) {
63                self.depth += 1;
64                self.tag_stack.push((tag.clone(), self.depth));
65
66                // Check if this line also closes the tag (self-contained)
67                let line_lower = line.to_lowercase();
68                if Self::count_closes_lowered(&line_lower, &tag) > 0 {
69                    self.depth -= 1;
70                    self.tag_stack.pop();
71                }
72            }
73            return true;
74        }
75
76        // If we're inside a markdown HTML block at the start of this line
77        if !self.tag_stack.is_empty() {
78            // Lowercase the line once for all tag comparisons
79            let line_lower = trimmed.to_lowercase();
80
81            // Collect tag names by reference before mutating depth
82            let tags: Vec<String> = self.tag_stack.iter().map(|(tag, _)| tag.clone()).collect();
83            for tag in &tags {
84                let opens = Self::count_opens_lowered(&line_lower, tag);
85                let closes = Self::count_closes_lowered(&line_lower, tag);
86
87                self.depth += opens;
88
89                for _ in 0..closes {
90                    if self.depth > 0 {
91                        self.depth -= 1;
92                    }
93                }
94            }
95
96            // Clean up stack when depth reaches initial level
97            while let Some((_, start_depth)) = self.tag_stack.last() {
98                if self.depth < *start_depth {
99                    self.tag_stack.pop();
100                } else {
101                    break;
102                }
103            }
104
105            // Return true because this line was inside the block at the start
106            // (even if it also closes the block)
107            return true;
108        }
109
110        false
111    }
112
113    /// Count opening tags of a specific type in a pre-lowercased line.
114    /// `tag` is already lowercase (stored that way in `tag_stack`).
115    fn count_opens_lowered(line_lower: &str, tag: &str) -> usize {
116        let open_pattern = format!("<{tag}");
117        let mut count = 0;
118        let mut search_start = 0;
119
120        while let Some(pos) = line_lower[search_start..].find(&open_pattern) {
121            let abs_pos = search_start + pos;
122            let after_tag = abs_pos + open_pattern.len();
123
124            // Verify it's a tag boundary (followed by whitespace, >, or /)
125            if after_tag >= line_lower.len()
126                || line_lower[after_tag..].starts_with(|c: char| c.is_whitespace() || c == '>' || c == '/')
127            {
128                count += 1;
129            }
130            search_start = after_tag;
131        }
132        count
133    }
134
135    /// Count closing tags of a specific type in a pre-lowercased line.
136    /// `tag` is already lowercase (stored that way in `tag_stack`).
137    fn count_closes_lowered(line_lower: &str, tag: &str) -> usize {
138        let close_pattern = format!("</{tag}");
139        let mut count = 0;
140        let mut search_start = 0;
141
142        while let Some(pos) = line_lower[search_start..].find(&close_pattern) {
143            let abs_pos = search_start + pos;
144            let after_tag = abs_pos + close_pattern.len();
145
146            // Find the closing > (may have whitespace before it)
147            if let Some(rest) = line_lower.get(after_tag..)
148                && rest.trim_start().starts_with('>')
149            {
150                count += 1;
151            }
152            search_start = after_tag;
153        }
154        count
155    }
156
157    /// Check if currently inside a markdown HTML block
158    pub fn is_inside(&self) -> bool {
159        !self.tag_stack.is_empty()
160    }
161
162    /// Reset the tracker state
163    pub fn reset(&mut self) {
164        self.tag_stack.clear();
165        self.depth = 0;
166    }
167}
168
/// Unit tests for markdown HTML block detection and tracking.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_markdown_html_detection() {
        // Basic patterns
        assert!(is_markdown_html_start("<div markdown>"));
        assert!(is_markdown_html_start("<div class=\"grid cards\" markdown>"));
        assert!(is_markdown_html_start("<div markdown=\"1\">"));
        assert!(is_markdown_html_start("<div markdown=\"block\">"));

        // Attribute order variations
        assert!(is_markdown_html_start("<div markdown class=\"test\">"));
        assert!(is_markdown_html_start("<div id=\"foo\" markdown>"));

        // Case insensitivity
        assert!(is_markdown_html_start("<DIV markdown>"));
        assert!(is_markdown_html_start("<Div Markdown>"));

        // With indentation
        assert!(is_markdown_html_start("  <div markdown>"));
        assert!(is_markdown_html_start("    <div class=\"grid\" markdown>"));

        // Other valid HTML5 elements
        assert!(is_markdown_html_start("<section markdown>"));
        assert!(is_markdown_html_start("<article markdown>"));
        assert!(is_markdown_html_start("<details markdown>"));

        // Should NOT match
        assert!(!is_markdown_html_start("<div class=\"test\">"));
        assert!(!is_markdown_html_start("<span markdown>")); // span not in allowed list
        assert!(!is_markdown_html_start("text with markdown word"));
        assert!(!is_markdown_html_start("<div>markdown</div>"));
    }

    #[test]
    fn test_tracker_basic() {
        let mut tracker = MarkdownHtmlTracker::new();

        assert!(!tracker.is_inside());

        assert!(tracker.process_line("<div class=\"grid cards\" markdown>"));
        assert!(tracker.is_inside());

        assert!(tracker.process_line("-   Content here"));
        assert!(tracker.is_inside());

        assert!(tracker.process_line("    ---"));
        assert!(tracker.is_inside());

        // Close the div
        tracker.process_line("</div>");
        assert!(!tracker.is_inside());
    }

    #[test]
    fn test_tracker_nested() {
        let mut tracker = MarkdownHtmlTracker::new();

        tracker.process_line("<div markdown>");
        assert!(tracker.is_inside());

        tracker.process_line("<div>nested</div>");
        assert!(tracker.is_inside());

        tracker.process_line("</div>");
        assert!(!tracker.is_inside());
    }

    #[test]
    fn test_grid_cards_pattern() {
        let content = r#"<div class="grid cards" markdown>

-   :zap:{ .lg .middle } **Built for speed**

    ---

    Written in Rust.

</div>"#;

        let mut tracker = MarkdownHtmlTracker::new();
        let mut inside_lines = Vec::new();

        for (i, line) in content.lines().enumerate() {
            let inside = tracker.process_line(line);
            if inside {
                inside_lines.push(i);
            }
        }

        // Every line is reported as inside: the opening tag, the content,
        // the blank lines in between, and the closing </div> itself (it was
        // inside the block when the line started).
        assert_eq!(inside_lines, (0..=8).collect::<Vec<_>>());
        assert!(!tracker.is_inside()); // After </div>
    }

    #[test]
    fn test_same_line_open_close() {
        let mut tracker = MarkdownHtmlTracker::new();

        // Single line with both open and close
        let result = tracker.process_line("<div markdown>content</div>");
        assert!(result); // The line itself is part of the block
        assert!(!tracker.is_inside()); // But after processing, we're outside
    }

    #[test]
    fn test_multiple_sequential_blocks() {
        let mut tracker = MarkdownHtmlTracker::new();

        // First block
        assert!(tracker.process_line("<div markdown>"));
        assert!(tracker.is_inside());
        assert!(tracker.process_line("Content 1"));
        tracker.process_line("</div>");
        assert!(!tracker.is_inside());

        // Second block (should work independently)
        assert!(tracker.process_line("<section markdown>"));
        assert!(tracker.is_inside());
        assert!(tracker.process_line("Content 2"));
        tracker.process_line("</section>");
        assert!(!tracker.is_inside());
    }

    #[test]
    fn test_deeply_nested_same_tag() {
        let mut tracker = MarkdownHtmlTracker::new();

        assert!(tracker.process_line("<div markdown>"));
        assert!(tracker.is_inside());

        // Nested div (without markdown attr)
        assert!(tracker.process_line("<div class=\"inner\">"));
        assert!(tracker.is_inside());

        // Close inner div
        assert!(tracker.process_line("</div>"));
        assert!(tracker.is_inside()); // Still inside outer div

        // Close outer div
        tracker.process_line("</div>");
        assert!(!tracker.is_inside());
    }

    #[test]
    fn test_deeply_nested_different_tags() {
        let mut tracker = MarkdownHtmlTracker::new();

        assert!(tracker.process_line("<article markdown>"));
        assert!(tracker.is_inside());

        // Inner section (without markdown)
        assert!(tracker.process_line("<section>"));
        assert!(tracker.is_inside());

        // Close section - tracker only tracks article
        assert!(tracker.process_line("</section>"));
        assert!(tracker.is_inside());

        // Close article
        tracker.process_line("</article>");
        assert!(!tracker.is_inside());
    }

    #[test]
    fn test_multiple_closes_same_line() {
        let mut tracker = MarkdownHtmlTracker::new();

        assert!(tracker.process_line("<div markdown>"));
        assert!(tracker.process_line("<div>inner</div></div>"));
        assert!(!tracker.is_inside());
    }

    #[test]
    fn test_count_opens_boundary_check() {
        // Should match (input is pre-lowercased)
        assert_eq!(MarkdownHtmlTracker::count_opens_lowered("<div>", "div"), 1);
        assert_eq!(MarkdownHtmlTracker::count_opens_lowered("<div class='x'>", "div"), 1);
        // Tag name at the very end of the line still counts as a boundary
        assert_eq!(MarkdownHtmlTracker::count_opens_lowered("<div", "div"), 1);
        assert_eq!(MarkdownHtmlTracker::count_opens_lowered("<div/><div>", "div"), 2);

        // Should NOT match (divider is not div)
        assert_eq!(MarkdownHtmlTracker::count_opens_lowered("<divider>", "div"), 0);
        assert_eq!(MarkdownHtmlTracker::count_opens_lowered("<dividend>", "div"), 0);

        // Case-insensitive via pre-lowercased input
        assert_eq!(
            MarkdownHtmlTracker::count_opens_lowered(&"<DIV>".to_lowercase(), "div"),
            1
        );
    }

    #[test]
    fn test_count_closes_variations() {
        // Input is pre-lowercased
        assert_eq!(MarkdownHtmlTracker::count_closes_lowered("</div>", "div"), 1);
        assert_eq!(
            MarkdownHtmlTracker::count_closes_lowered(&"</DIV>".to_lowercase(), "div"),
            1
        );
        assert_eq!(MarkdownHtmlTracker::count_closes_lowered("</div >", "div"), 1);
        assert_eq!(MarkdownHtmlTracker::count_closes_lowered("</div  >", "div"), 1);
        assert_eq!(MarkdownHtmlTracker::count_closes_lowered("</div></div>", "div"), 2);
        assert_eq!(
            MarkdownHtmlTracker::count_closes_lowered("text</div>more</div>end", "div"),
            2
        );
    }

    #[test]
    fn test_reset() {
        let mut tracker = MarkdownHtmlTracker::new();

        tracker.process_line("<div markdown>");
        assert!(tracker.is_inside());

        tracker.reset();
        assert!(!tracker.is_inside());

        // Should work fresh after reset
        tracker.process_line("<section markdown>");
        assert!(tracker.is_inside());
    }
}