Skip to main content

panache_parser/parser/inlines/
native_spans.rs

1//! Native span parsing for Pandoc's `native_spans` extension.
2//!
3//! Syntax: `<span class="foo">content</span>`
4//!
5//! When the `native_spans` extension is enabled, HTML `<span>` tags are
6//! treated as native Pandoc Span elements instead of raw HTML.
7
8use crate::options::{Dialect, ParserOptions};
9use crate::syntax::SyntaxKind;
10use rowan::GreenNodeBuilder;
11
12use super::core::parse_inline_text;
13
/// Try to parse a native HTML span at the very start of `text`.
///
/// Native spans have the form `<span attrs...>content</span>`; `content` may
/// itself contain markdown (and further spans) that the caller parses
/// recursively.
///
/// Returns `Some((length, content, attributes))` on success, where:
/// - `length` is the byte length of the whole match, including `</span>`;
/// - `content` is the slice of `text` between the open and close tags;
/// - `attributes` is the trimmed text between `<span` and the closing `>`.
///
/// Returns `None` when `text` does not start with a `<span` tag (the byte
/// after `<span` must be whitespace or `>`), when the open tag is never
/// terminated by an unquoted `>`, or when no balancing `</span>` exists.
///
/// Nested `<span ...>` open tags are skipped with the same quote-aware scan
/// as the outer tag, so a `</span>` or `<span ` that appears inside a nested
/// tag's quoted attribute value does not disturb the nesting depth.
pub(crate) fn try_parse_native_span(text: &str) -> Option<(usize, &str, String)> {
    const OPEN: &str = "<span";
    const CLOSE: &str = "</span>";

    let bytes = text.as_bytes();

    // Rejects `<spanx>`, `<spanner>`, a bare trailing `<span`, etc.
    if !is_span_open_at(bytes, 0) {
        return None;
    }

    // Locate the `>` that terminates the open tag; quoted values may hold `>`.
    let attr_start = OPEN.len();
    let tag_end = find_open_tag_end(bytes, attr_start)?;
    let attributes = text[attr_start..tag_end].trim().to_string();

    // Walk the content, tracking nesting until the matching close tag.
    let content_start = tag_end + 1;
    let mut pos = content_start;
    let mut depth = 1usize;

    while pos < bytes.len() {
        if is_span_open_at(bytes, pos) {
            depth += 1;
            // Skip the whole nested open tag so quoted attribute values are
            // never mistaken for tags. If the nested tag has no terminating
            // `>`, only step over `<span` and keep scanning byte-wise.
            pos = match find_open_tag_end(bytes, pos + OPEN.len()) {
                Some(end) => end + 1,
                None => pos + OPEN.len(),
            };
            continue;
        }

        if bytes[pos..].starts_with(CLOSE.as_bytes()) {
            depth -= 1;
            if depth == 0 {
                let total_len = pos + CLOSE.len();
                return Some((total_len, &text[content_start..pos], attributes));
            }
            pos += CLOSE.len();
            continue;
        }

        // Advance by a whole UTF-8 character so `pos` stays on a char
        // boundary for the string slicing above.
        pos += text[pos..].chars().next().map_or(1, char::len_utf8);
    }

    // Ran out of input before the nesting depth returned to zero.
    None
}

/// Returns true when `bytes[at..]` starts with `<span` followed by ASCII
/// whitespace or `>`, i.e. an actual span open tag rather than `<spanx`.
fn is_span_open_at(bytes: &[u8], at: usize) -> bool {
    bytes
        .get(at..)
        .and_then(|tail| tail.strip_prefix(b"<span"))
        .and_then(|after| after.first())
        .is_some_and(|&b| matches!(b, b' ' | b'\t' | b'\n' | b'\r' | b'>'))
}

/// Scans forward from `from` (just past `<span`) and returns the index of the
/// first `>` that is not inside a single- or double-quoted attribute value.
///
/// Inside a quoted value a backslash causes the following byte to be skipped
/// as well. Returns `None` if the input ends before an unquoted `>` is seen.
fn find_open_tag_end(bytes: &[u8], from: usize) -> Option<usize> {
    let mut pos = from;
    while pos < bytes.len() {
        match bytes[pos] {
            b'>' => return Some(pos),
            quote @ (b'"' | b'\'') => {
                pos += 1;
                while pos < bytes.len() && bytes[pos] != quote {
                    pos += if bytes[pos] == b'\\' { 2 } else { 1 };
                }
                // Step over the closing quote when one was found.
                if pos < bytes.len() {
                    pos += 1;
                }
            }
            _ => pos += 1,
        }
    }
    None
}
120
121/// Emit a native span node to the builder.
122///
123/// `raw` is the full byte slice of the matched span (`<span...>content</span>`)
124/// so the open-tag bytes can be tokenized byte-exactly. Under
125/// `Dialect::Pandoc`, the wrapper is `INLINE_HTML_SPAN` and the open tag's
126/// attribute region is exposed structurally as `HTML_ATTRS` (mirroring the
127/// `HTML_BLOCK_DIV` pattern). Under `Dialect::CommonMark` (with the
128/// `native_spans` extension explicitly enabled), the legacy `BRACKETED_SPAN`
129/// shape is preserved for backward compatibility.
130pub(crate) fn emit_native_span(
131    builder: &mut GreenNodeBuilder,
132    raw: &str,
133    content: &str,
134    config: &ParserOptions,
135) {
136    let close_tag = "</span>";
137    let open_tag_end = raw.len().saturating_sub(content.len() + close_tag.len());
138    let open_tag = &raw[..open_tag_end];
139
140    if config.dialect == Dialect::Pandoc {
141        builder.start_node(SyntaxKind::INLINE_HTML_SPAN.into());
142        emit_span_open_tag_tokens(builder, open_tag);
143        builder.start_node(SyntaxKind::SPAN_CONTENT.into());
144        parse_inline_text(builder, content, config, false);
145        builder.finish_node();
146        builder.token(SyntaxKind::SPAN_BRACKET_CLOSE.into(), close_tag);
147        builder.finish_node();
148        return;
149    }
150
151    // Legacy CommonMark + native_spans extension path: keep the original
152    // BRACKETED_SPAN shape. (Note: this path collapses multi-whitespace
153    // attribute regions to a single space; it's a pre-existing minor
154    // losslessness divergence not worth diverging the legacy shape for.)
155    let attrs_text = open_tag
156        .strip_prefix("<span")
157        .and_then(|s| s.strip_suffix('>'))
158        .map(str::trim)
159        .unwrap_or("");
160    builder.start_node(SyntaxKind::BRACKETED_SPAN.into());
161    builder.token(SyntaxKind::SPAN_BRACKET_OPEN.into(), "<span");
162    if !attrs_text.is_empty() {
163        builder.token(SyntaxKind::WHITESPACE.into(), " ");
164        builder.token(SyntaxKind::SPAN_ATTRIBUTES.into(), attrs_text);
165    }
166    builder.token(SyntaxKind::SPAN_BRACKET_OPEN.into(), ">");
167    builder.start_node(SyntaxKind::SPAN_CONTENT.into());
168    parse_inline_text(builder, content, config, false);
169    builder.finish_node();
170    builder.token(SyntaxKind::SPAN_BRACKET_CLOSE.into(), close_tag);
171    builder.finish_node();
172}
173
174/// Tokenize the open tag of an inline `<span ...>` byte-exactly into:
175/// `TEXT("<span") + (WHITESPACE + HTML_ATTRS{TEXT(attrs)} + WHITESPACE?)?
176/// + TEXT(">")`.
177///
178/// Bytes are byte-identical to the source — only the tokenization
179/// granularity changes so `AttributeNode::cast(HTML_ATTRS)` can read the
180/// attribute region structurally. Mirrors `emit_div_open_tag_tokens`.
181fn emit_span_open_tag_tokens(builder: &mut GreenNodeBuilder<'_>, open_tag: &str) {
182    let Some(rest) = open_tag.strip_prefix("<span") else {
183        // Defensive — shouldn't happen since try_parse_native_span gates on
184        // <span. Fall back to a single TEXT token to stay lossless.
185        builder.token(SyntaxKind::TEXT.into(), open_tag);
186        return;
187    };
188    builder.token(SyntaxKind::TEXT.into(), "<span");
189    let Some(inside) = rest.strip_suffix('>') else {
190        builder.token(SyntaxKind::TEXT.into(), rest);
191        return;
192    };
193    let bytes = inside.as_bytes();
194    // Split into leading WS, attribute body, trailing WS.
195    let leading_ws_end = bytes
196        .iter()
197        .position(|&b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
198        .unwrap_or(bytes.len());
199    let leading_ws = &inside[..leading_ws_end];
200    let after_leading = &inside[leading_ws_end..];
201    let trailing_ws_start = after_leading
202        .as_bytes()
203        .iter()
204        .rposition(|&b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
205        .map(|i| i + 1)
206        .unwrap_or(0);
207    let attrs_text = &after_leading[..trailing_ws_start];
208    let trailing_ws = &after_leading[trailing_ws_start..];
209
210    if !leading_ws.is_empty() {
211        builder.token(SyntaxKind::WHITESPACE.into(), leading_ws);
212    }
213    if !attrs_text.is_empty() {
214        builder.start_node(SyntaxKind::HTML_ATTRS.into());
215        builder.token(SyntaxKind::TEXT.into(), attrs_text);
216        builder.finish_node();
217    }
218    if !trailing_ws.is_empty() {
219        builder.token(SyntaxKind::WHITESPACE.into(), trailing_ws);
220    }
221    builder.token(SyntaxKind::TEXT.into(), ">");
222}
223
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the value `try_parse_native_span` returns for a successful
    /// match: total byte length, content slice, and trimmed attribute text.
    fn matched<'a>(len: usize, content: &'a str, attrs: &str) -> Option<(usize, &'a str, String)> {
        Some((len, content, attrs.to_string()))
    }

    #[test]
    fn test_parse_simple_span() {
        assert_eq!(
            try_parse_native_span("<span>text</span>"),
            matched(17, "text", "")
        );
    }

    #[test]
    fn test_parse_span_with_class() {
        assert_eq!(
            try_parse_native_span(r#"<span class="foo">text</span>"#),
            matched(29, "text", r#"class="foo""#)
        );
    }

    #[test]
    fn test_parse_span_with_id() {
        assert_eq!(
            try_parse_native_span(r#"<span id="bar">text</span>"#),
            matched(26, "text", r#"id="bar""#)
        );
    }

    #[test]
    fn test_parse_span_with_multiple_attrs() {
        assert_eq!(
            try_parse_native_span(r#"<span id="x" class="y z">text</span>"#),
            matched(36, "text", r#"id="x" class="y z""#)
        );
    }

    #[test]
    fn test_parse_span_with_markdown() {
        assert_eq!(
            try_parse_native_span("<span>*emphasis* and `code`</span>"),
            matched(34, "*emphasis* and `code`", "")
        );
    }

    #[test]
    fn test_parse_nested_spans() {
        assert_eq!(
            try_parse_native_span("<span>outer <span>inner</span> text</span>"),
            matched(42, "outer <span>inner</span> text", "")
        );
    }

    #[test]
    fn test_parse_span_with_newlines_in_content() {
        assert_eq!(
            try_parse_native_span("<span>line 1\nline 2</span>"),
            matched(26, "line 1\nline 2", "")
        );
    }

    #[test]
    fn test_not_span_no_closing_tag() {
        assert_eq!(try_parse_native_span("<span>text"), None);
    }

    #[test]
    fn test_not_span_wrong_tag() {
        assert_eq!(try_parse_native_span("<spanx>text</spanx>"), None);
    }

    #[test]
    fn test_not_span_no_space_after() {
        // `<spanner>` shares the `<span` prefix but is a different tag.
        assert_eq!(try_parse_native_span("<spanner>text</spanner>"), None);
    }

    #[test]
    fn test_parse_span_with_quoted_attrs_containing_gt() {
        assert_eq!(
            try_parse_native_span(r#"<span title="a > b">text</span>"#),
            matched(31, "text", r#"title="a > b""#)
        );
    }

    #[test]
    fn test_parse_empty_span() {
        assert_eq!(try_parse_native_span("<span></span>"), matched(13, "", ""));
    }

    #[test]
    fn test_parse_span_trailing_text() {
        // Only the span itself is consumed; " more" is left for the caller.
        assert_eq!(
            try_parse_native_span("<span>text</span> more"),
            matched(17, "text", "")
        );
    }

    #[test]
    fn test_parse_span_with_non_ascii_content() {
        // The length is in bytes, so multi-byte characters count more than once.
        assert_eq!(
            try_parse_native_span(r#"<span class="rtl">(شربنا من النيل)</span>"#),
            matched(53, "(شربنا من النيل)", r#"class="rtl""#)
        );
    }
}