Skip to main content

rumdl_lib/utils/
quarto_divs.rs

1//! Quarto div and callout block detection utilities
2//!
3//! This module provides detection for Quarto/Pandoc fenced div syntax which uses
4//! `:::` markers to create structured content blocks.
5//!
6//! Common patterns:
7//! - `::: {.callout-note}` - Callout block with type
8//! - `::: {.callout-warning}` - Warning callout
9//! - `::: {#myid .class}` - Generic div with id and class
10//! - `::: myclass` - Simple div with class (shorthand)
11//! - `:::` - Closing marker
12//!
13//! Callout types: `callout-note`, `callout-warning`, `callout-tip`,
14//! `callout-important`, `callout-caution`
15
16use regex::Regex;
17use std::sync::LazyLock;
18
19use crate::utils::skip_context::ByteRange;
20
21/// Pattern to match div opening markers
22/// Matches: ::: {.class}, ::: {#id .class}, ::: classname, etc.
23/// Does NOT match a closing ::: on its own
24static DIV_OPEN_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*):::\s*(?:\{[^}]+\}|\S+)").unwrap());
25
26/// Pattern to match div closing markers
27/// Matches: ::: (with optional whitespace before and after)
28static DIV_CLOSE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*):::\s*$").unwrap());
29
30/// Pattern to match callout blocks specifically
31/// Callout types: note, warning, tip, important, caution
32static CALLOUT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
33    Regex::new(r"^(\s*):::\s*\{[^}]*\.callout-(?:note|warning|tip|important|caution)[^}]*\}").unwrap()
34});
35
36/// Pattern to match Pandoc-style attributes on any element
37/// Matches: {#id}, {.class}, {#id .class key="value"}, etc.
38/// Note: We match the entire attribute block including contents
39static PANDOC_ATTR_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\{[^}]+\}").unwrap());
40
41/// Check if a line is a div opening marker
42pub fn is_div_open(line: &str) -> bool {
43    DIV_OPEN_PATTERN.is_match(line)
44}
45
46/// Check if a line is a div closing marker (just `:::`)
47pub fn is_div_close(line: &str) -> bool {
48    DIV_CLOSE_PATTERN.is_match(line)
49}
50
51/// Check if a line is a callout block opening
52pub fn is_callout_open(line: &str) -> bool {
53    CALLOUT_PATTERN.is_match(line)
54}
55
56/// Check if a line contains Pandoc-style attributes
57pub fn has_pandoc_attributes(line: &str) -> bool {
58    PANDOC_ATTR_PATTERN.is_match(line)
59}
60
/// Measure the leading indentation of a line, in columns.
///
/// Only leading spaces and tabs are counted: a space contributes 1 and a tab
/// contributes 4. Counting stops at the first character that is neither.
pub fn get_div_indent(line: &str) -> usize {
    line.chars()
        .map_while(|ch| match ch {
            ' ' => Some(1usize),
            '\t' => Some(4usize), // a tab counts as 4 columns
            _ => None,
        })
        .sum()
}
73
74/// Track div nesting state for a document
75#[derive(Debug, Clone, Default)]
76pub struct DivTracker {
77    /// Stack of div indentation levels for nesting tracking
78    indent_stack: Vec<usize>,
79}
80
81impl DivTracker {
82    pub fn new() -> Self {
83        Self::default()
84    }
85
86    /// Process a line and return whether we're inside a div after processing
87    pub fn process_line(&mut self, line: &str) -> bool {
88        let trimmed = line.trim_start();
89
90        if trimmed.starts_with(":::") {
91            let indent = get_div_indent(line);
92
93            if is_div_close(line) {
94                // Closing marker - pop the matching div from stack
95                // Pop the top div if its indent is >= the closing marker's indent
96                if let Some(&top_indent) = self.indent_stack.last()
97                    && top_indent >= indent
98                {
99                    self.indent_stack.pop();
100                }
101            } else if is_div_open(line) {
102                // Opening marker - push to stack
103                self.indent_stack.push(indent);
104            }
105        }
106
107        !self.indent_stack.is_empty()
108    }
109
110    /// Check if we're currently inside a div
111    pub fn is_inside_div(&self) -> bool {
112        !self.indent_stack.is_empty()
113    }
114}
115
116/// Detect Quarto div block ranges in content
117/// Returns a vector of byte ranges (start, end) for each div block
118pub fn detect_div_block_ranges(content: &str) -> Vec<ByteRange> {
119    let mut ranges = Vec::new();
120    let mut tracker = DivTracker::new();
121    let mut div_start: Option<usize> = None;
122    let mut byte_offset = 0;
123
124    for line in content.lines() {
125        let line_len = line.len();
126        let was_inside = tracker.is_inside_div();
127        let is_inside = tracker.process_line(line);
128
129        // Started a new div block
130        if !was_inside && is_inside {
131            div_start = Some(byte_offset);
132        }
133        // Exited a div block
134        else if was_inside
135            && !is_inside
136            && let Some(start) = div_start.take()
137        {
138            // End at the start of the closing line
139            ranges.push(ByteRange {
140                start,
141                end: byte_offset + line_len,
142            });
143        }
144
145        // Account for newline
146        byte_offset += line_len + 1;
147    }
148
149    // Handle unclosed divs at end of document
150    if let Some(start) = div_start {
151        ranges.push(ByteRange {
152            start,
153            end: content.len(),
154        });
155    }
156
157    ranges
158}
159
160/// Check if a byte position is within a div block
161pub fn is_within_div_block_ranges(ranges: &[ByteRange], position: usize) -> bool {
162    ranges.iter().any(|r| position >= r.start && position < r.end)
163}
164
165// ============================================================================
166// Pandoc/Quarto Citation Support
167// ============================================================================
168//
169// Pandoc citation syntax:
170// - Inline citation: @smith2020
171// - Parenthetical citation: [@smith2020]
172// - Suppress author: [-@smith2020]
173// - With locator: [@smith2020, p. 10]
174// - Multiple citations: [@smith2020; @jones2021]
175// - With prefix: [see @smith2020]
176//
// Citation keys must start with a letter, digit, or underscore, and may contain
// alphanumerics, underscores, and the punctuation characters `:.#$%&-+?<>~/`.
179
180/// Pattern to match bracketed citations: [@key], [-@key], [see @key], [@a; @b]
181static BRACKETED_CITATION_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
182    // Matches [...] containing at least one @key
183    Regex::new(r"\[[^\]]*@[a-zA-Z0-9_][a-zA-Z0-9_:.#$%&\-+?<>~/]*[^\]]*\]").unwrap()
184});
185
186/// Pattern to match inline citations: @key (not inside brackets)
187/// Citation key: starts with letter/digit/underscore, contains alphanumerics and some punctuation
188/// The @ must be preceded by whitespace, start of line, or punctuation (not alphanumeric)
189static INLINE_CITATION_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
190    // Match @ at start of string, after whitespace, or after non-alphanumeric (except @[)
191    Regex::new(r"(?:^|[\s\(\[\{,;:])(@[a-zA-Z0-9_][a-zA-Z0-9_:.#$%&\-+?<>~/]*)").unwrap()
192});
193
/// Cheap pre-filter: report whether `text` could contain a citation at all.
///
/// This only looks for an `@` character, so it also returns `true` for text such as
/// email addresses.
#[inline]
pub fn has_citations(text: &str) -> bool {
    // `@` is ASCII, so a byte-level search is equivalent to a char-level one.
    text.as_bytes().contains(&b'@')
}
199
200/// Find all citation ranges in content (byte ranges)
201/// Returns ranges for both bracketed `[@key]` and inline `@key` citations
202pub fn find_citation_ranges(content: &str) -> Vec<ByteRange> {
203    let mut ranges = Vec::new();
204
205    // Find bracketed citations first (higher priority)
206    for mat in BRACKETED_CITATION_PATTERN.find_iter(content) {
207        ranges.push(ByteRange {
208            start: mat.start(),
209            end: mat.end(),
210        });
211    }
212
213    // Find inline citations (but not inside already-found brackets)
214    for cap in INLINE_CITATION_PATTERN.captures_iter(content) {
215        if let Some(mat) = cap.get(1) {
216            let start = mat.start();
217            // Skip if this is inside a bracketed citation
218            if !ranges.iter().any(|r| start >= r.start && start < r.end) {
219                ranges.push(ByteRange { start, end: mat.end() });
220            }
221        }
222    }
223
224    // Sort by start position
225    ranges.sort_by_key(|r| r.start);
226    ranges
227}
228
#[cfg(test)]
mod tests {
    use super::*;

    /// Borrow the part of `content` covered by `range`.
    fn slice<'a>(content: &'a str, range: &ByteRange) -> &'a str {
        &content[range.start..range.end]
    }

    /// Assert that `content` yields exactly one citation range covering `expected`.
    fn assert_single_citation(content: &str, expected: &str) {
        let ranges = find_citation_ranges(content);
        assert_eq!(ranges.len(), 1);
        assert_eq!(slice(content, &ranges[0]), expected);
    }

    /// Feed `steps` to a fresh tracker and check the inside/outside state after each line.
    fn assert_tracker_steps(steps: &[(&str, bool)]) {
        let mut tracker = DivTracker::new();
        for &(line, inside) in steps {
            assert_eq!(tracker.process_line(line), inside, "after line {line:?}");
            assert_eq!(tracker.is_inside_div(), inside, "after line {line:?}");
        }
    }

    #[test]
    fn test_div_open_detection() {
        // Valid div openings
        let openers = [
            "::: {.callout-note}",
            "::: {.callout-warning}",
            "::: {#myid .class}",
            "::: bordered",
            "  ::: {.note}", // Indented
            "::: {.callout-tip title=\"My Title\"}",
        ];
        for line in openers {
            assert!(is_div_open(line), "should open a div: {line:?}");
        }

        // Invalid patterns
        let non_openers = [
            ":::",   // Just closing marker
            ":::  ", // Just closing with trailing space
            "Regular text",
            "# Heading",
            "```python", // Code fence
        ];
        for line in non_openers {
            assert!(!is_div_open(line), "should not open a div: {line:?}");
        }
    }

    #[test]
    fn test_div_close_detection() {
        for line in [":::", ":::  ", "  :::", "    :::  "] {
            assert!(is_div_close(line), "should close a div: {line:?}");
        }

        for line in ["::: {.note}", "::: class", ":::note"] {
            assert!(!is_div_close(line), "should not close a div: {line:?}");
        }
    }

    #[test]
    fn test_callout_detection() {
        let callouts = [
            "::: {.callout-note}",
            "::: {.callout-warning}",
            "::: {.callout-tip}",
            "::: {.callout-important}",
            "::: {.callout-caution}",
            "::: {#myid .callout-note}",
            "::: {.callout-note title=\"Title\"}",
        ];
        for line in callouts {
            assert!(is_callout_open(line), "should be a callout: {line:?}");
        }

        let non_callouts = [
            "::: {.note}",      // Not a callout
            "::: {.bordered}",  // Not a callout
            "::: callout-note", // Missing braces
        ];
        for line in non_callouts {
            assert!(!is_callout_open(line), "should not be a callout: {line:?}");
        }
    }

    #[test]
    fn test_div_tracker() {
        assert_tracker_steps(&[
            ("::: {.callout-note}", true), // Enter a div
            ("This is content.", true),    // Inside content
            (":::", false),                // Exit the div
        ]);
    }

    #[test]
    fn test_nested_divs() {
        assert_tracker_steps(&[
            ("::: {.outer}", true),   // Outer div
            ("  ::: {.inner}", true), // Inner div
            ("    Content", true),    // Content
            ("  :::", true),          // Close inner
            (":::", false),           // Close outer
        ]);
    }

    #[test]
    fn test_detect_div_block_ranges() {
        let content = r#"# Heading

::: {.callout-note}
This is a note.
:::

Regular text.

::: {.bordered}
Content here.
:::
"#;
        let ranges = detect_div_block_ranges(content);
        assert_eq!(ranges.len(), 2);

        // First div
        let first = slice(content, &ranges[0]);
        assert!(first.contains("callout-note"));
        assert!(first.contains("This is a note"));

        // Second div
        let second = slice(content, &ranges[1]);
        assert!(second.contains("bordered"));
        assert!(second.contains("Content here"));
    }

    #[test]
    fn test_pandoc_attributes() {
        let with_attributes = [
            "# Heading {#custom-id}",
            "# Heading {.unnumbered}",
            "![Image](path.png){#fig-1 width=\"50%\"}",
            "{#id .class key=\"value\"}",
        ];
        for line in with_attributes {
            assert!(has_pandoc_attributes(line), "should have attributes: {line:?}");
        }

        for line in ["# Heading", "Regular text", "{}"] {
            assert!(!has_pandoc_attributes(line), "should not have attributes: {line:?}");
        }
    }

    #[test]
    fn test_div_with_title_attribute() {
        let content = r#"::: {.callout-note title="Important Note"}
This is the content of the note.
It can span multiple lines.
:::
"#;
        assert_eq!(detect_div_block_ranges(content).len(), 1);
        assert!(is_callout_open("::: {.callout-note title=\"Important Note\"}"));
    }

    #[test]
    fn test_unclosed_div() {
        let content = r#"::: {.callout-note}
This note is never closed.
"#;
        let ranges = detect_div_block_ranges(content);
        assert_eq!(ranges.len(), 1);
        // The range must run to the end of the document
        assert_eq!(ranges[0].end, content.len());
    }

    #[test]
    fn test_heading_inside_callout() {
        let content = r#"::: {.callout-warning}
## Warning Title

Warning content here.
:::
"#;
        let ranges = detect_div_block_ranges(content);
        assert_eq!(ranges.len(), 1);
        assert!(slice(content, &ranges[0]).contains("## Warning Title"));
    }

    // Citation tests
    #[test]
    fn test_has_citations() {
        let with_at_sign = [
            "See @smith2020 for details.",
            "[@smith2020]",
            "Multiple [@a; @b] citations",
            // has_citations is just a quick @ check - emails will pass (intended behavior)
            "Email: user@example.com",
        ];
        for text in with_at_sign {
            assert!(has_citations(text), "should report a possible citation: {text:?}");
        }

        assert!(!has_citations("No citations here"));
    }

    #[test]
    fn test_bracketed_citation_detection() {
        assert_single_citation("See [@smith2020] for more info.", "[@smith2020]");
    }

    #[test]
    fn test_inline_citation_detection() {
        assert_single_citation("As @smith2020 argues, this is true.", "@smith2020");
    }

    #[test]
    fn test_multiple_citations_in_brackets() {
        assert_single_citation("See [@smith2020; @jones2021] for details.", "[@smith2020; @jones2021]");
    }

    #[test]
    fn test_citation_with_prefix() {
        assert_single_citation("[see @smith2020, p. 10]", "[see @smith2020, p. 10]");
    }

    #[test]
    fn test_suppress_author_citation() {
        assert_single_citation("The theory [-@smith2020] states that...", "[-@smith2020]");
    }

    #[test]
    fn test_mixed_citations() {
        let content = "@smith2020 argues that [@jones2021] is wrong.";
        let ranges = find_citation_ranges(content);
        let found: Vec<&str> = ranges.iter().map(|range| slice(content, range)).collect();
        // Inline citation first, bracketed citation second
        assert_eq!(found, ["@smith2020", "[@jones2021]"]);
    }

    #[test]
    fn test_email_not_confused_with_citation() {
        // has_citations() is only a quick check; find_citation_ranges applies the strict
        // patterns, which reject an `@` that is preceded by an alphanumeric character.
        let content = "Contact user@example.com for help.";
        let ranges = find_citation_ranges(content);
        // No detected range may cover the email's domain
        assert!(ranges.iter().all(|range| !slice(content, range).contains("example.com")));
    }
}