rumdl_lib/utils/
quarto_divs.rs

1//! Quarto div and callout block detection utilities
2//!
3//! This module provides detection for Quarto/Pandoc fenced div syntax which uses
4//! `:::` markers to create structured content blocks.
5//!
6//! Common patterns:
7//! - `::: {.callout-note}` - Callout block with type
8//! - `::: {.callout-warning}` - Warning callout
9//! - `::: {#myid .class}` - Generic div with id and class
10//! - `::: myclass` - Simple div with class (shorthand)
11//! - `:::` - Closing marker
12//!
13//! Callout types: `callout-note`, `callout-warning`, `callout-tip`,
14//! `callout-important`, `callout-caution`
15
16use regex::Regex;
17use std::sync::LazyLock;
18
19use crate::utils::skip_context::ByteRange;
20
21/// Pattern to match div opening markers
22/// Matches: ::: {.class}, ::: {#id .class}, ::: classname, etc.
23/// Does NOT match a closing ::: on its own
24static DIV_OPEN_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*):::\s*(?:\{[^}]+\}|\S+)").unwrap());
25
26/// Pattern to match div closing markers
27/// Matches: ::: (with optional whitespace before and after)
28static DIV_CLOSE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*):::\s*$").unwrap());
29
30/// Pattern to match callout blocks specifically
31/// Callout types: note, warning, tip, important, caution
32static CALLOUT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
33    Regex::new(r"^(\s*):::\s*\{[^}]*\.callout-(?:note|warning|tip|important|caution)[^}]*\}").unwrap()
34});
35
36/// Pattern to match Pandoc-style attributes on any element
37/// Matches: {#id}, {.class}, {#id .class key="value"}, etc.
38/// Note: We match the entire attribute block including contents
39static PANDOC_ATTR_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\{[^}]+\}").unwrap());
40
41/// Check if a line is a div opening marker
42pub fn is_div_open(line: &str) -> bool {
43    DIV_OPEN_PATTERN.is_match(line)
44}
45
46/// Check if a line is a div closing marker (just `:::`)
47pub fn is_div_close(line: &str) -> bool {
48    DIV_CLOSE_PATTERN.is_match(line)
49}
50
51/// Check if a line is a callout block opening
52pub fn is_callout_open(line: &str) -> bool {
53    CALLOUT_PATTERN.is_match(line)
54}
55
56/// Check if a line contains Pandoc-style attributes
57pub fn has_pandoc_attributes(line: &str) -> bool {
58    PANDOC_ATTR_PATTERN.is_match(line)
59}
60
/// Compute the indentation width of a line's leading whitespace.
///
/// Only spaces and tabs at the very start of `line` are counted: a space adds 1 and
/// a tab adds a flat 4 (no tab-stop alignment). Counting stops at the first other
/// character, so an empty line or a line starting with text yields 0.
pub fn get_div_indent(line: &str) -> usize {
    line.chars()
        .map_while(|ch| match ch {
            ' ' => Some(1usize),
            '\t' => Some(4usize),
            _ => None,
        })
        .sum()
}
73
74/// Track div nesting state for a document
75#[derive(Debug, Clone, Default)]
76pub struct DivTracker {
77    /// Stack of div indentation levels for nesting tracking
78    indent_stack: Vec<usize>,
79}
80
81impl DivTracker {
82    pub fn new() -> Self {
83        Self::default()
84    }
85
86    /// Process a line and return whether we're inside a div after processing
87    pub fn process_line(&mut self, line: &str) -> bool {
88        let trimmed = line.trim_start();
89
90        if trimmed.starts_with(":::") {
91            let indent = get_div_indent(line);
92
93            if is_div_close(line) {
94                // Closing marker - pop the matching div from stack
95                // Pop the top div if its indent is >= the closing marker's indent
96                if let Some(&top_indent) = self.indent_stack.last()
97                    && top_indent >= indent
98                {
99                    self.indent_stack.pop();
100                }
101            } else if is_div_open(line) {
102                // Opening marker - push to stack
103                self.indent_stack.push(indent);
104            }
105        }
106
107        !self.indent_stack.is_empty()
108    }
109
110    /// Check if we're currently inside a div
111    pub fn is_inside_div(&self) -> bool {
112        !self.indent_stack.is_empty()
113    }
114
115    /// Get current nesting depth
116    pub fn depth(&self) -> usize {
117        self.indent_stack.len()
118    }
119}
120
121/// Detect Quarto div block ranges in content
122/// Returns a vector of byte ranges (start, end) for each div block
123pub fn detect_div_block_ranges(content: &str) -> Vec<ByteRange> {
124    let mut ranges = Vec::new();
125    let mut tracker = DivTracker::new();
126    let mut div_start: Option<usize> = None;
127    let mut byte_offset = 0;
128
129    for line in content.lines() {
130        let line_len = line.len();
131        let was_inside = tracker.is_inside_div();
132        let is_inside = tracker.process_line(line);
133
134        // Started a new div block
135        if !was_inside && is_inside {
136            div_start = Some(byte_offset);
137        }
138        // Exited a div block
139        else if was_inside
140            && !is_inside
141            && let Some(start) = div_start.take()
142        {
143            // End at the start of the closing line
144            ranges.push(ByteRange {
145                start,
146                end: byte_offset + line_len,
147            });
148        }
149
150        // Account for newline
151        byte_offset += line_len + 1;
152    }
153
154    // Handle unclosed divs at end of document
155    if let Some(start) = div_start {
156        ranges.push(ByteRange {
157            start,
158            end: content.len(),
159        });
160    }
161
162    ranges
163}
164
165/// Check if a byte position is within a div block
166pub fn is_within_div_block_ranges(ranges: &[ByteRange], position: usize) -> bool {
167    ranges.iter().any(|r| position >= r.start && position < r.end)
168}
169
170/// Extract class names from a Pandoc attribute block
171/// Returns classes like "callout-note", "bordered", etc.
172pub fn extract_classes(line: &str) -> Vec<String> {
173    let mut classes = Vec::new();
174
175    // Look for {.class ...} patterns
176    if let Some(captures) = PANDOC_ATTR_PATTERN.find(line) {
177        let attr_block = captures.as_str();
178        // Strip the braces to get the inner content
179        let inner = attr_block.trim_start_matches('{').trim_end_matches('}').trim();
180
181        // Extract each .class by splitting on whitespace and looking for . prefix
182        for part in inner.split_whitespace() {
183            if let Some(class) = part.strip_prefix('.') {
184                // Clean up any trailing = if followed by attribute value
185                let class = class.split('=').next().unwrap_or(class);
186                if !class.is_empty() {
187                    classes.push(class.to_string());
188                }
189            }
190        }
191    }
192
193    classes
194}
195
196/// Extract the ID from a Pandoc attribute block
197pub fn extract_id(line: &str) -> Option<String> {
198    if let Some(captures) = PANDOC_ATTR_PATTERN.find(line) {
199        let attr_block = captures.as_str();
200        // Strip the braces to get the inner content
201        let inner = attr_block.trim_start_matches('{').trim_end_matches('}').trim();
202
203        // Extract #id by splitting on whitespace and looking for # prefix
204        for part in inner.split_whitespace() {
205            if let Some(id) = part.strip_prefix('#') {
206                // Clean up any trailing = if followed by attribute value
207                let id = id.split('=').next().unwrap_or(id);
208                if !id.is_empty() {
209                    return Some(id.to_string());
210                }
211            }
212        }
213    }
214    None
215}
216
217// ============================================================================
218// Pandoc/Quarto Citation Support
219// ============================================================================
220//
221// Pandoc citation syntax:
222// - Inline citation: @smith2020
223// - Parenthetical citation: [@smith2020]
224// - Suppress author: [-@smith2020]
225// - With locator: [@smith2020, p. 10]
226// - Multiple citations: [@smith2020; @jones2021]
227// - With prefix: [see @smith2020]
228//
// Citation keys must start with a letter, digit, or underscore, and may contain
// alphanumerics, underscores, and punctuation from the set `:.#$%&-+?<>~/`
// (the character set accepted by the patterns below).
231
232/// Pattern to match bracketed citations: [@key], [-@key], [see @key], [@a; @b]
233static BRACKETED_CITATION_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
234    // Matches [...] containing at least one @key
235    Regex::new(r"\[[^\]]*@[a-zA-Z0-9_][a-zA-Z0-9_:.#$%&\-+?<>~/]*[^\]]*\]").unwrap()
236});
237
238/// Pattern to match inline citations: @key (not inside brackets)
239/// Citation key: starts with letter/digit/underscore, contains alphanumerics and some punctuation
240/// The @ must be preceded by whitespace, start of line, or punctuation (not alphanumeric)
241static INLINE_CITATION_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
242    // Match @ at start of string, after whitespace, or after non-alphanumeric (except @[)
243    Regex::new(r"(?:^|[\s\(\[\{,;:])(@[a-zA-Z0-9_][a-zA-Z0-9_:.#$%&\-+?<>~/]*)").unwrap()
244});
245
/// Cheap pre-check for citations: reports whether `text` contains an `@` at all.
///
/// This is deliberately loose: any `@` (including one in an e-mail address) makes it
/// return `true`.
#[inline]
pub fn has_citations(text: &str) -> bool {
    // `@` is ASCII, so a byte scan is equivalent to a char search on UTF-8 text.
    text.as_bytes().contains(&b'@')
}
251
252/// Find all citation ranges in content (byte ranges)
253/// Returns ranges for both bracketed [@key] and inline @key citations
254pub fn find_citation_ranges(content: &str) -> Vec<ByteRange> {
255    let mut ranges = Vec::new();
256
257    // Find bracketed citations first (higher priority)
258    for mat in BRACKETED_CITATION_PATTERN.find_iter(content) {
259        ranges.push(ByteRange {
260            start: mat.start(),
261            end: mat.end(),
262        });
263    }
264
265    // Find inline citations (but not inside already-found brackets)
266    for cap in INLINE_CITATION_PATTERN.captures_iter(content) {
267        if let Some(mat) = cap.get(1) {
268            let start = mat.start();
269            // Skip if this is inside a bracketed citation
270            if !ranges.iter().any(|r| start >= r.start && start < r.end) {
271                ranges.push(ByteRange { start, end: mat.end() });
272            }
273        }
274    }
275
276    // Sort by start position
277    ranges.sort_by_key(|r| r.start);
278    ranges
279}
280
281/// Check if a byte position is within a citation
282pub fn is_in_citation(ranges: &[ByteRange], position: usize) -> bool {
283    ranges.iter().any(|r| position >= r.start && position < r.end)
284}
285
/// Extract the citation key from a single-citation string.
///
/// Accepted forms:
/// - `@key` returns everything after the `@`
/// - `[@key]` returns the text between `[@` and `]`
/// - `[-@key]` returns the text between `[-@` and `]`
///
/// Any other input returns `None`. The key is returned as-is; it is not validated
/// against the citation-key character set.
pub fn extract_citation_key(citation: &str) -> Option<&str> {
    if let Some(key) = citation.strip_prefix('@') {
        return Some(key);
    }

    // Bracketed single citation, with or without the author-suppressing `-`.
    let inner = citation.strip_prefix('[')?.strip_suffix(']')?;
    inner.strip_prefix("-@").or_else(|| inner.strip_prefix('@'))
}
293
#[cfg(test)]
mod tests {
    use super::*;

    /// Text of every citation reported by `find_citation_ranges`, in order.
    fn cited(content: &str) -> Vec<&str> {
        find_citation_ranges(content)
            .iter()
            .map(|range| &content[range.start..range.end])
            .collect()
    }

    #[test]
    fn test_div_open_detection() {
        let openers = [
            "::: {.callout-note}",
            "::: {.callout-warning}",
            "::: {#myid .class}",
            "::: bordered",
            "  ::: {.note}", // Indented
            "::: {.callout-tip title=\"My Title\"}",
        ];
        for line in openers {
            assert!(is_div_open(line), "should open a div: {line:?}");
        }

        let non_openers = [
            ":::",   // Just closing marker
            ":::  ", // Just closing with trailing space
            "Regular text",
            "# Heading",
            "```python", // Code fence
        ];
        for line in non_openers {
            assert!(!is_div_open(line), "should not open a div: {line:?}");
        }
    }

    #[test]
    fn test_div_close_detection() {
        for line in [":::", ":::  ", "  :::", "    :::  "] {
            assert!(is_div_close(line), "should close a div: {line:?}");
        }

        for line in ["::: {.note}", "::: class", ":::note"] {
            assert!(!is_div_close(line), "should not close a div: {line:?}");
        }
    }

    #[test]
    fn test_callout_detection() {
        let callouts = [
            "::: {.callout-note}",
            "::: {.callout-warning}",
            "::: {.callout-tip}",
            "::: {.callout-important}",
            "::: {.callout-caution}",
            "::: {#myid .callout-note}",
            "::: {.callout-note title=\"Title\"}",
        ];
        for line in callouts {
            assert!(is_callout_open(line), "should be a callout: {line:?}");
        }

        let non_callouts = [
            "::: {.note}",      // Not a callout
            "::: {.bordered}",  // Not a callout
            "::: callout-note", // Missing braces
        ];
        for line in non_callouts {
            assert!(!is_callout_open(line), "should not be a callout: {line:?}");
        }
    }

    #[test]
    fn test_div_tracker() {
        let mut tracker = DivTracker::new();

        // Entering a div
        let inside = tracker.process_line("::: {.callout-note}");
        assert!(inside);
        assert!(tracker.is_inside_div());
        assert_eq!(tracker.depth(), 1);

        // Plain content keeps us inside
        let inside = tracker.process_line("This is content.");
        assert!(inside);
        assert!(tracker.is_inside_div());

        // Leaving the div
        let inside = tracker.process_line(":::");
        assert!(!inside);
        assert!(!tracker.is_inside_div());
        assert_eq!(tracker.depth(), 0);
    }

    #[test]
    fn test_nested_divs() {
        let mut tracker = DivTracker::new();

        // Outer div, then inner div
        assert!(tracker.process_line("::: {.outer}"));
        assert_eq!(tracker.depth(), 1);
        assert!(tracker.process_line("  ::: {.inner}"));
        assert_eq!(tracker.depth(), 2);

        // Content inside the inner div
        assert!(tracker.process_line("    Content"));
        assert!(tracker.is_inside_div());

        // Closing the inner div leaves the outer one open
        assert!(tracker.process_line("  :::"));
        assert_eq!(tracker.depth(), 1);

        // Closing the outer div leaves nothing open
        assert!(!tracker.process_line(":::"));
        assert_eq!(tracker.depth(), 0);
    }

    #[test]
    fn test_detect_div_block_ranges() {
        let content = concat!(
            "# Heading\n",
            "\n",
            "::: {.callout-note}\n",
            "This is a note.\n",
            ":::\n",
            "\n",
            "Regular text.\n",
            "\n",
            "::: {.bordered}\n",
            "Content here.\n",
            ":::\n",
        );
        let ranges = detect_div_block_ranges(content);
        assert_eq!(ranges.len(), 2);

        let expectations = [["callout-note", "This is a note"], ["bordered", "Content here"]];
        for (range, needles) in ranges.iter().zip(expectations) {
            let block = &content[range.start..range.end];
            for needle in needles {
                assert!(block.contains(needle), "{block:?} should contain {needle:?}");
            }
        }
    }

    #[test]
    fn test_extract_classes() {
        let cases: [(&str, &[&str]); 3] = [
            ("::: {.callout-note}", &["callout-note"]),
            ("::: {#myid .bordered .highlighted}", &["bordered", "highlighted"]),
            ("::: {.callout-warning title=\"Alert\"}", &["callout-warning"]),
        ];
        for (line, expected) in cases {
            assert_eq!(extract_classes(line), expected, "classes of {line:?}");
        }

        assert!(extract_classes("Regular text").is_empty());
        assert!(extract_classes("::: classname").is_empty()); // No braces
    }

    #[test]
    fn test_extract_id() {
        let cases = [
            ("::: {#myid}", Some("myid")),
            ("::: {#myid .class}", Some("myid")),
            ("::: {.class #custom-id}", Some("custom-id")),
            ("::: {.class}", None),
            ("Regular text", None),
        ];
        for (line, expected) in cases {
            assert_eq!(extract_id(line).as_deref(), expected, "id of {line:?}");
        }
    }

    #[test]
    fn test_pandoc_attributes() {
        let with_attributes = [
            "# Heading {#custom-id}",
            "# Heading {.unnumbered}",
            "![Image](path.png){#fig-1 width=\"50%\"}",
            "{#id .class key=\"value\"}",
        ];
        for line in with_attributes {
            assert!(has_pandoc_attributes(line), "should have attributes: {line:?}");
        }

        for line in ["# Heading", "Regular text", "{}"] {
            assert!(!has_pandoc_attributes(line), "should not have attributes: {line:?}");
        }
    }

    #[test]
    fn test_div_with_title_attribute() {
        let opener = "::: {.callout-note title=\"Important Note\"}";
        let content = concat!(
            "::: {.callout-note title=\"Important Note\"}\n",
            "This is the content of the note.\n",
            "It can span multiple lines.\n",
            ":::\n",
        );
        assert_eq!(detect_div_block_ranges(content).len(), 1);
        assert!(is_callout_open(opener));
    }

    #[test]
    fn test_unclosed_div() {
        let content = "::: {.callout-note}\nThis note is never closed.\n";
        let ranges = detect_div_block_ranges(content);
        assert_eq!(ranges.len(), 1);
        // The range runs to the end of the document
        assert_eq!(ranges[0].end, content.len());
    }

    #[test]
    fn test_heading_inside_callout() {
        let content = concat!(
            "::: {.callout-warning}\n",
            "## Warning Title\n",
            "\n",
            "Warning content here.\n",
            ":::\n",
        );
        let ranges = detect_div_block_ranges(content);
        assert_eq!(ranges.len(), 1);

        let block = &content[ranges[0].start..ranges[0].end];
        assert!(block.contains("## Warning Title"));
    }

    // Citation tests
    #[test]
    fn test_has_citations() {
        for text in [
            "See @smith2020 for details.",
            "[@smith2020]",
            "Multiple [@a; @b] citations",
            // has_citations is just a quick @ check - emails will pass (intended behavior)
            "Email: user@example.com",
        ] {
            assert!(has_citations(text), "should report an @: {text:?}");
        }
        assert!(!has_citations("No citations here"));
    }

    #[test]
    fn test_bracketed_citation_detection() {
        assert_eq!(cited("See [@smith2020] for more info."), ["[@smith2020]"]);
    }

    #[test]
    fn test_inline_citation_detection() {
        assert_eq!(cited("As @smith2020 argues, this is true."), ["@smith2020"]);
    }

    #[test]
    fn test_multiple_citations_in_brackets() {
        assert_eq!(
            cited("See [@smith2020; @jones2021] for details."),
            ["[@smith2020; @jones2021]"]
        );
    }

    #[test]
    fn test_citation_with_prefix() {
        assert_eq!(cited("[see @smith2020, p. 10]"), ["[see @smith2020, p. 10]"]);
    }

    #[test]
    fn test_suppress_author_citation() {
        assert_eq!(cited("The theory [-@smith2020] states that..."), ["[-@smith2020]"]);
    }

    #[test]
    fn test_mixed_citations() {
        // Inline citation first, bracketed citation second
        assert_eq!(
            cited("@smith2020 argues that [@jones2021] is wrong."),
            ["@smith2020", "[@jones2021]"]
        );
    }

    #[test]
    fn test_citation_key_extraction() {
        let cases = [
            ("@smith2020", Some("smith2020")),
            ("@Smith_2020", Some("Smith_2020")),
            ("@key:with:colons", Some("key:with:colons")),
            ("not-a-citation", None),
        ];
        for (citation, expected) in cases {
            assert_eq!(extract_citation_key(citation), expected, "key of {citation:?}");
        }
    }

    #[test]
    fn test_is_in_citation() {
        let ranges = find_citation_ranges("See [@smith2020] here.");

        // Inside [@smith2020]
        assert!(is_in_citation(&ranges, 5));
        // Outside: in "See " and in " here."
        for position in [0, 17] {
            assert!(!is_in_citation(&ranges, position), "position {position} is outside");
        }
    }

    #[test]
    fn test_email_not_confused_with_citation() {
        // has_citations() is only a quick check; find_citation_ranges uses stricter
        // patterns, so the domain of an e-mail address (its @ is preceded by an
        // alphanumeric) must never show up inside a reported citation.
        let reported = cited("Contact user@example.com for help.");
        assert!(reported.iter().all(|citation| !citation.contains("example.com")));
    }
}