Skip to main content

rumdl_lib/utils/
mkdocs_extensions.rs

1//! MkDocs PyMdown extensions support
2//!
3//! This module provides support for various PyMdown Markdown extensions
4//! commonly used with MkDocs Material:
5//!
6//! - **InlineHilite**: Inline code highlighting `` `#!python code` ``
7//! - **Keys**: Keyboard key notation `++ctrl+alt+delete++`
8//! - **Caret**: Superscript and insert `^superscript^` and `^^insert^^`
9//! - **Tilde**: Subscript and strikethrough `~subscript~` and `~~strike~~`
10//! - **Mark**: Highlight text `==highlighted==`
11//! - **SmartSymbols**: Auto-replace symbols `(c)` → `©`
12//!
13//! ## Architecture
14//!
15//! All markup detection follows a consistent span-based pattern:
16//! 1. `find_*_spans(line) -> Vec<(usize, usize)>` - find byte ranges
17//! 2. `is_in_*(line, position) -> bool` - check if position is inside markup
18//!
19//! For double-takes-precedence patterns (caret: ^^/^, tilde: ~~/~):
20//! - Double-delimiter spans are found first
21//! - Single-delimiter spans exclude positions inside double spans
22//!
23//! ## References
24//!
25//! - [PyMdown Extensions](https://facelessuser.github.io/pymdown-extensions/)
26
27use regex::Regex;
28use std::sync::LazyLock;
29
30// ============================================================================
31// Core span utilities
32// ============================================================================
33
/// Report whether `position` lies inside one of the half-open `(start, end)` byte spans.
///
/// `spans` is expected to be ordered by ascending start offset: scanning stops at the
/// first span that begins after `position`, so any later spans are never examined.
/// The end offset of a span is exclusive.
#[inline]
fn position_in_spans(position: usize, spans: &[(usize, usize)]) -> bool {
    spans
        .iter()
        // Early exit: a span starting past `position` ends the search.
        .take_while(|&&(span_start, _)| span_start <= position)
        .any(|&(_, span_end)| position < span_end)
}
48
49/// Find all regex matches as (start, end) byte spans.
50#[inline]
51fn find_regex_spans(line: &str, pattern: &Regex) -> Vec<(usize, usize)> {
52    pattern.find_iter(line).map(|m| (m.start(), m.end())).collect()
53}
54
55/// Find single-delimiter spans (like `~sub~` or `^super^`) that are NOT inside
56/// double-delimiter spans (like `~~strike~~` or `^^insert^^`).
57///
58/// Rules for single-delimiter content:
59/// - Must have at least one character between delimiters
60/// - Cannot contain whitespace (per PyMdown spec)
61/// - Cannot be inside a double-delimiter span
62fn find_single_delim_spans(line: &str, delim: char, double_spans: &[(usize, usize)]) -> Vec<(usize, usize)> {
63    let mut spans = Vec::new();
64    let mut chars = line.char_indices().peekable();
65    let delim_len = delim.len_utf8();
66
67    while let Some((start_byte, ch)) = chars.next() {
68        // Skip if inside a double-delimiter span
69        if position_in_spans(start_byte, double_spans) {
70            continue;
71        }
72
73        if ch != delim {
74            continue;
75        }
76
77        // Check if this is a double delimiter (skip it entirely)
78        if chars.peek().is_some_and(|(_, c)| *c == delim) {
79            chars.next();
80            continue;
81        }
82
83        // Look for closing single delimiter
84        let mut found_content = false;
85        let mut has_whitespace = false;
86
87        for (byte_pos, inner_ch) in chars.by_ref() {
88            // If we enter a double-delimiter span, stop looking
89            if position_in_spans(byte_pos, double_spans) {
90                break;
91            }
92
93            if inner_ch == delim {
94                // Check it's not the start of a double delimiter
95                let is_double = chars.peek().is_some_and(|(_, c)| *c == delim);
96                if !is_double && found_content && !has_whitespace {
97                    spans.push((start_byte, byte_pos + delim_len));
98                }
99                break;
100            }
101
102            found_content = true;
103            if inner_ch.is_whitespace() {
104                has_whitespace = true;
105            }
106        }
107    }
108
109    spans
110}
111
112// ============================================================================
113// InlineHilite: `#!lang code` syntax for inline code with syntax highlighting
114// ============================================================================
115
116/// Pattern to match inline hilite shebang at the start of backtick content
117static INLINE_HILITE_SHEBANG: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^#!([a-zA-Z][a-zA-Z0-9_+-]*)").unwrap());
118
119/// Check if code span content starts with InlineHilite shebang
120#[inline]
121pub fn is_inline_hilite_content(content: &str) -> bool {
122    INLINE_HILITE_SHEBANG.is_match(content)
123}
124
125// ============================================================================
126// Keys: ++key++ syntax for keyboard keys
127// ============================================================================
128
129/// Pattern to match keyboard key notation: `++key++` or `++key1+key2++`
130static KEYS_PATTERN: LazyLock<Regex> =
131    LazyLock::new(|| Regex::new(r"\+\+([a-zA-Z0-9_-]+(?:\+[a-zA-Z0-9_-]+)*)\+\+").unwrap());
132
133/// Find all keyboard shortcut spans
134fn find_keys_spans(line: &str) -> Vec<(usize, usize)> {
135    if !line.contains("++") {
136        return Vec::new();
137    }
138    find_regex_spans(line, &KEYS_PATTERN)
139}
140
141/// Check if a position in a line is within a keyboard shortcut
142fn is_in_keys(line: &str, position: usize) -> bool {
143    position_in_spans(position, &find_keys_spans(line))
144}
145
146// ============================================================================
147// Caret: ^superscript^ and ^^insert^^ syntax
148// ============================================================================
149
150/// Pattern to match insert: `^^text^^` (double caret)
151/// Handles content with single carets inside (e.g., `^^a^b^^`)
152static INSERT_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\^\^[^\^]+(?:\^[^\^]+)*\^\^").unwrap());
153
154/// Find all insert (^^text^^) spans
155fn find_insert_spans(line: &str) -> Vec<(usize, usize)> {
156    if !line.contains("^^") {
157        return Vec::new();
158    }
159    find_regex_spans(line, &INSERT_PATTERN)
160}
161
162/// Check if a position is within superscript or insert markup
163fn is_in_caret_markup(line: &str, position: usize) -> bool {
164    if !line.contains('^') {
165        return false;
166    }
167    let insert_spans = find_insert_spans(line);
168    if position_in_spans(position, &insert_spans) {
169        return true;
170    }
171    let super_spans = find_single_delim_spans(line, '^', &insert_spans);
172    position_in_spans(position, &super_spans)
173}
174
175// ============================================================================
176// Tilde: ~subscript~ and ~~strikethrough~~ syntax
177// ============================================================================
178
179/// Pattern to match strikethrough: `~~text~~` (double tilde)
180/// Handles content with single tildes inside (e.g., `~~a~b~~`)
181static STRIKETHROUGH_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"~~[^~]+(?:~[^~]+)*~~").unwrap());
182
183/// Find all strikethrough (~~text~~) spans
184fn find_strikethrough_spans(line: &str) -> Vec<(usize, usize)> {
185    if !line.contains("~~") {
186        return Vec::new();
187    }
188    find_regex_spans(line, &STRIKETHROUGH_PATTERN)
189}
190
191/// Check if a position is within subscript or strikethrough markup
192fn is_in_tilde_markup(line: &str, position: usize) -> bool {
193    if !line.contains('~') {
194        return false;
195    }
196    let strike_spans = find_strikethrough_spans(line);
197    if position_in_spans(position, &strike_spans) {
198        return true;
199    }
200    let sub_spans = find_single_delim_spans(line, '~', &strike_spans);
201    position_in_spans(position, &sub_spans)
202}
203
204// ============================================================================
205// Mark: ==highlighted== syntax
206// ============================================================================
207
208/// Pattern to match highlight/mark: `==text==`
209static MARK_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"==([^=]+)==").unwrap());
210
211/// Find all mark (==text==) spans
212fn find_mark_spans(line: &str) -> Vec<(usize, usize)> {
213    if !line.contains("==") {
214        return Vec::new();
215    }
216    find_regex_spans(line, &MARK_PATTERN)
217}
218
219/// Check if a position is within mark markup
220pub fn is_in_mark(line: &str, position: usize) -> bool {
221    position_in_spans(position, &find_mark_spans(line))
222}
223
224// ============================================================================
225// SmartSymbols: (c), (tm), (r), -->, <--, etc.
226// ============================================================================
227
228/// Pattern to match any SmartSymbol that might be replaced
229static SMART_SYMBOL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
230    Regex::new(r"(?:\(c\)|\(C\)|\(r\)|\(R\)|\(tm\)|\(TM\)|\(p\)|\.\.\.|-{2,3}|<->|<-|->|<=>|<=|=>|1/4|1/2|3/4|\+-|!=)")
231        .unwrap()
232});
233
234/// Find all SmartSymbol spans
235fn find_smart_symbol_spans(line: &str) -> Vec<(usize, usize)> {
236    // Quick rejection checks
237    if !line.contains('(')
238        && !line.contains("...")
239        && !line.contains("--")
240        && !line.contains("->")
241        && !line.contains("<-")
242        && !line.contains("=>")
243        && !line.contains("<=")
244        && !line.contains("1/")
245        && !line.contains("3/")
246        && !line.contains("+-")
247        && !line.contains("!=")
248    {
249        return Vec::new();
250    }
251    find_regex_spans(line, &SMART_SYMBOL_PATTERN)
252}
253
254/// Check if a position is at a SmartSymbol
255fn is_in_smart_symbol(line: &str, position: usize) -> bool {
256    position_in_spans(position, &find_smart_symbol_spans(line))
257}
258
259// ============================================================================
260// Combined utilities
261// ============================================================================
262
263/// Check if a position is within any PyMdown extension markup
264pub fn is_in_pymdown_markup(line: &str, position: usize) -> bool {
265    is_in_keys(line, position)
266        || is_in_caret_markup(line, position)
267        || is_in_tilde_markup(line, position)
268        || is_in_mark(line, position)
269        || is_in_smart_symbol(line, position)
270}
271
#[cfg(test)]
mod tests {
    use super::*;

    /// Run `check(line, pos)` for every `(pos, expected)` pair and compare the result.
    fn assert_positions(check: fn(&str, usize) -> bool, line: &str, cases: &[(usize, bool)]) {
        for &(pos, expected) in cases {
            assert_eq!(check(line, pos), expected, "line {line:?}, byte position {pos}");
        }
    }

    // =========================================================================
    // Core utility tests
    // =========================================================================

    #[test]
    fn test_position_in_spans_empty() {
        for pos in [0, 100] {
            assert!(!position_in_spans(pos, &[]));
        }
    }

    #[test]
    fn test_position_in_spans_early_exit() {
        let spans = [(10, 20), (30, 40)];
        // Before all spans, between the two spans, after all spans.
        for pos in [5, 25, 50] {
            assert!(!position_in_spans(pos, &spans), "position {pos}");
        }
    }

    #[test]
    fn test_position_in_spans_inside() {
        let spans = [(10, 20), (30, 40)];
        let cases = [
            (10, true),  // start of first span
            (15, true),  // middle of first span
            (19, true),  // last byte of first span
            (20, false), // end of first span is exclusive
            (30, true),  // start of second span
        ];
        for (pos, expected) in cases {
            assert_eq!(position_in_spans(pos, &spans), expected, "position {pos}");
        }
    }

    // =========================================================================
    // InlineHilite tests
    // =========================================================================

    #[test]
    fn test_is_inline_hilite_content() {
        for content in ["#!python print()", "#!js code"] {
            assert!(is_inline_hilite_content(content), "{content:?}");
        }
        for content in ["regular code", " #!python with space"] {
            assert!(!is_inline_hilite_content(content), "{content:?}");
        }
    }

    // =========================================================================
    // Keys tests
    // =========================================================================

    #[test]
    fn test_is_in_keys() {
        assert_positions(
            is_in_keys,
            "Press ++ctrl++ here",
            &[
                (0, false),  // 'P'
                (5, false),  // space before the shortcut
                (6, true),   // first '+'
                (10, true),  // 'r'
                (13, true),  // last '+'
                (14, false), // space after the shortcut
            ],
        );
    }

    // =========================================================================
    // Caret tests
    // =========================================================================

    #[test]
    fn test_is_in_caret_markup() {
        assert_positions(
            is_in_caret_markup,
            "Text ^super^ here",
            &[
                (0, false),
                (5, true),   // opening '^'
                (8, true),   // 'p'
                (13, false), // 'h' of "here"
            ],
        );

        assert_positions(
            is_in_caret_markup,
            "Text ^^insert^^ here",
            &[
                (5, true),  // first '^'
                (10, true), // 'e'
            ],
        );
    }

    // =========================================================================
    // Tilde tests
    // =========================================================================

    #[test]
    fn test_is_in_tilde_markup() {
        assert_positions(
            is_in_tilde_markup,
            "Text ~sub~ here",
            &[
                (0, false),
                (5, true),   // opening '~'
                (7, true),   // 'u'
                (12, false), // 'e' of "here"
            ],
        );

        assert_positions(
            is_in_tilde_markup,
            "Text ~~strike~~ here",
            &[
                (5, true),  // first '~'
                (10, true), // 'i'
            ],
        );
    }

    #[test]
    fn test_find_strikethrough_spans_triple_tilde() {
        // In `~~~a~~~` the regex cannot start at the first tilde (the body would begin
        // with `~`), so the single match is the inner `~~a~~`.
        let line = "~~~a~~~";
        let spans = find_strikethrough_spans(line);
        assert_eq!(spans.len(), 1);
        let (start, end) = spans[0];
        assert_eq!(&line[start..end], "~~a~~");
    }

    #[test]
    fn test_find_strikethrough_spans_internal_single_tilde() {
        // `~~a~b~~` is one strikethrough span: single tildes are allowed inside the body.
        let line = "~~a~b~~";
        let spans = find_strikethrough_spans(line);
        assert_eq!(spans.len(), 1);
        let (start, end) = spans[0];
        assert_eq!(&line[start..end], "~~a~b~~");

        // The inner `~b~`-looking part must not be reported as a subscript.
        assert!(find_single_delim_spans(line, '~', &spans).is_empty());
    }

    // =========================================================================
    // Mark tests
    // =========================================================================

    #[test]
    fn test_is_in_mark() {
        assert_positions(
            is_in_mark,
            "Text ==highlight== more",
            &[
                (0, false),
                (5, true),   // first '='
                (10, true),  // second 'h' of "highlight"
                (19, false), // 'm' of "more"
            ],
        );
    }

    // =========================================================================
    // SmartSymbols tests
    // =========================================================================

    #[test]
    fn test_is_in_smart_symbol() {
        assert_positions(
            is_in_smart_symbol,
            "Copyright (c) text",
            &[
                (0, false),
                (10, true),  // '('
                (11, true),  // 'c'
                (12, true),  // ')'
                (14, false), // 't' of "text"
            ],
        );
    }

    // =========================================================================
    // Combined tests
    // =========================================================================

    #[test]
    fn test_is_in_pymdown_markup() {
        let inside = [
            ("++ctrl++", 2),
            ("^super^", 1),
            ("~sub~", 1),
            ("~~strike~~", 2),
            ("==mark==", 2),
            ("(c)", 1),
        ];
        for (line, pos) in inside {
            assert!(is_in_pymdown_markup(line, pos), "line {line:?}, byte position {pos}");
        }

        assert!(!is_in_pymdown_markup("plain text", 5));
    }

    #[test]
    fn test_empty_line() {
        assert!(!is_in_pymdown_markup("", 0));
        assert!(!is_in_mark("", 0));
        assert!(!is_inline_hilite_content(""));
    }
}
440}