Skip to main content

panache_parser/parser/inlines/
subscript.rs

1//! Parsing for subscript (~text~)
2//!
3//! This is a Pandoc extension.
4//! Syntax: ~text~ produces subscript text.
5//!
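//! Rules:
//! - A single tilde opens the span and a single tilde closes it
//! - Content cannot be empty or whitespace-only, with one exception: a `~~`
//!   that did not match as strikeout is consumed as an empty subscript
//!   (Pandoc fallback)
//! - Tildes cannot have whitespace immediately inside
//! - Content cannot contain unescaped whitespace (write `\ ` for a space)
//! - Must not be confused with ~~ (strikeout), which the dispatcher tries first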
11
12use super::core::parse_inline_text;
13use crate::options::ParserOptions;
14use crate::syntax::SyntaxKind;
15use rowan::GreenNodeBuilder;
16
17/// Try to parse subscript (~text~)
18/// Returns: (total_len, inner_content)
19pub fn try_parse_subscript(text: &str) -> Option<(usize, &str)> {
20    let bytes = text.as_bytes();
21
22    // Must start with ~
23    if bytes.is_empty() || bytes[0] != b'~' {
24        return None;
25    }
26
27    // Pandoc fallback: when strikeout (`~~text~~`) doesn't match, `~~` is
28    // consumed as an empty Subscript (`Subscript []`), with the second `~`
29    // closing the first. Probed against `pandoc -f markdown` for
30    // `~~unclosed`, `*x ~~y*`, `a ~~b`, `~~ a ~~`. Dispatch order in
31    // `inlines/core.rs` runs strikeout before subscript so a real
32    // strikeout (`~~hello~~`) is not misinterpreted as two empty
33    // subscripts.
34    if bytes.len() > 1 && bytes[1] == b'~' {
35        return Some((2, ""));
36    }
37
38    // Content cannot start with whitespace
39    if bytes.len() > 1 && bytes[1].is_ascii_whitespace() {
40        return None;
41    }
42
43    // Find the closing ~
44    let mut pos = 1;
45    let mut found_close = false;
46
47    while pos < bytes.len() {
48        if bytes[pos] == b'~' {
49            // Make sure it's not part of ~~
50            if pos + 1 < bytes.len() && bytes[pos + 1] == b'~' {
51                return None;
52            }
53            found_close = true;
54            break;
55        }
56        pos += 1;
57    }
58
59    if !found_close {
60        return None;
61    }
62
63    // Extract content between the delimiters
64    let content = &text[1..pos];
65
66    // Content cannot be empty or only whitespace
67    if content.trim().is_empty() {
68        return None;
69    }
70
71    // Content cannot end with whitespace
72    if content.ends_with(char::is_whitespace) {
73        return None;
74    }
75
76    // Pandoc rule: subscripted text cannot contain unescaped whitespace.
77    // To include a space, source must escape it as `\ `. Verified against
78    // `pandoc -f markdown` for `~x y~` → not a subscript, `~x\ y~` →
79    // Subscript with NBSP-joined content.
80    if contains_unescaped_whitespace(content) {
81        return None;
82    }
83
84    let total_len = pos + 1; // Include closing ~
85    Some((total_len, content))
86}
87
/// Reports whether `content` contains a whitespace character that is not
/// escaped by a preceding backslash.
///
/// A backslash escapes exactly the one character that follows it, so `\ `
/// does not count as whitespace. A trailing backslash with nothing after it
/// is simply ignored.
///
/// The scan walks decoded `char`s rather than raw bytes. Casting individual
/// UTF-8 bytes to `char` would misread continuation bytes such as 0xA0 or
/// 0x85 as U+00A0 / U+0085 (both whitespace), wrongly flagging ordinary
/// characters like `à` (bytes C3 A0), and a byte-wise escape skip could stop
/// in the middle of a multi-byte character.
fn contains_unescaped_whitespace(content: &str) -> bool {
    let mut chars = content.chars();
    while let Some(ch) = chars.next() {
        if ch == '\\' {
            // Consume the escaped character (if any) without inspecting it.
            chars.next();
            continue;
        }
        if ch.is_whitespace() {
            return true;
        }
    }
    false
}
104
105/// Emit a subscript node with its content
106pub fn emit_subscript(
107    builder: &mut GreenNodeBuilder,
108    inner_text: &str,
109    config: &ParserOptions,
110    suppress_footnote_refs: bool,
111) {
112    builder.start_node(SyntaxKind::SUBSCRIPT.into());
113
114    // Opening marker
115    builder.start_node(SyntaxKind::SUBSCRIPT_MARKER.into());
116    builder.token(SyntaxKind::SUBSCRIPT_MARKER.into(), "~");
117    builder.finish_node();
118
119    // Parse inner content recursively for nested inline elements
120    parse_inline_text(builder, inner_text, config, false, suppress_footnote_refs);
121
122    // Closing marker
123    builder.start_node(SyntaxKind::SUBSCRIPT_MARKER.into());
124    builder.token(SyntaxKind::SUBSCRIPT_MARKER.into(), "~");
125    builder.finish_node();
126
127    builder.finish_node();
128}
129
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `try_parse_subscript` on each input and compares the result with
    /// the paired expectation, naming the offending input on failure.
    fn check_all(cases: &[(&str, Option<(usize, &str)>)]) {
        for &(input, expected) in cases {
            assert_eq!(try_parse_subscript(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_simple_subscript() {
        check_all(&[("~2~", Some((3, "2"))), ("~n~", Some((3, "n")))]);
    }

    #[test]
    fn test_subscript_with_multiple_chars() {
        check_all(&[
            ("~text~", Some((6, "text"))),
            ("~i+1~", Some((5, "i+1"))),
        ]);
    }

    #[test]
    fn test_no_whitespace_inside_delimiters() {
        check_all(&[
            // Leading whitespace right after the opening tilde.
            ("~ text~", None),
            // Trailing whitespace right before the closing tilde.
            ("~text ~", None),
        ]);
    }

    #[test]
    fn test_empty_content() {
        check_all(&[
            // `~~` is consumed as an empty Subscript (pandoc
            // strikeout-fallback).
            ("~~", Some((2, ""))),
            // A lone space between tildes stays rejected as a degenerate
            // form (pandoc: `~ ~` → plain text).
            ("~ ~", None),
        ]);
    }

    #[test]
    fn test_no_closing() {
        check_all(&[("~text", None), ("~hello world", None)]);
    }

    #[test]
    fn test_double_tilde_unclosed_is_empty_subscript() {
        // Pandoc strikeout-fallback. The dispatcher in `inlines/core.rs`
        // tries strikeout first, so a genuine `~~text~~` is handled there.
        // When strikeout cannot match (no closing `~~`), `~~` becomes an
        // empty Subscript and the remainder is left for later parsing;
        // pandoc yields `Subscript [] , Str "unclosed"` for `~~unclosed`.
        // Called in isolation, `try_parse_subscript("~~text~~")` therefore
        // returns the empty form as well.
        check_all(&[
            ("~~text~~", Some((2, ""))),
            ("~~unclosed", Some((2, ""))),
        ]);
    }

    #[test]
    fn test_subscript_with_other_content_after() {
        check_all(&[
            ("~2~ text", Some((3, "2"))),
            ("~n~ of sequence", Some((3, "n"))),
        ]);
    }

    #[test]
    fn test_internal_whitespace_rejected() {
        check_all(&[
            // Unescaped interior whitespace is rejected, as in pandoc.
            ("~some text~", None),
            // A backslash-escaped space is accepted.
            ("~some\\ text~", Some((12, "some\\ text"))),
        ]);
    }

    #[test]
    fn test_single_char() {
        check_all(&[("~a~", Some((3, "a")))]);
    }

    #[test]
    fn test_subscript_before_strikeout_marker() {
        // A complete subscript followed later by another `~` still parses.
        check_all(&[("~x~ ~", Some((3, "x")))]);
    }
}
211}