Skip to main content

panache_parser/parser/inlines/
native_spans.rs

1//! Native span parsing for Pandoc's `native_spans` extension.
2//!
3//! Syntax: `<span class="foo">content</span>`
4//!
5//! When the `native_spans` extension is enabled, HTML `<span>` tags are
6//! treated as native Pandoc Span elements instead of raw HTML.
7
8use super::sink::InlineSink;
9use crate::options::{Dialect, ParserOptions};
10use crate::parser::utils::attributes::{emit_html_attrs_node, emit_html_span_attributes_node};
11use crate::syntax::SyntaxKind;
12
13use super::core::parse_inline_text;
14
/// Try to parse a native HTML span located at the very start of `text`.
///
/// Native spans have the form `<span attrs...>content</span>`. The content may
/// itself contain nested `<span ...>...</span>` pairs; the closing tag that
/// balances the outermost opening tag terminates the span.
///
/// Returns `Some((length, content, attributes))` on success, where:
/// - `length` is the number of bytes consumed, including the closing `</span>`,
/// - `content` is the slice of `text` between the open tag and its matching
///   close tag (unparsed, so the caller can parse it recursively),
/// - `attributes` is the trimmed text between `<span` and the closing `>` of
///   the open tag.
///
/// Returns `None` when `text` does not start with a `<span` tag, when the open
/// tag is never closed with `>` (including an unterminated quoted attribute
/// value), or when no balancing `</span>` exists.
///
/// A `>` inside a quoted attribute value does not end the open tag. Quoted
/// values run to the next identical quote character: HTML has no backslash
/// escapes, so a backslash is an ordinary character inside a value.
pub(crate) fn try_parse_native_span(text: &str) -> Option<(usize, &str, String)> {
    const OPEN_PREFIX: &str = "<span";
    const CLOSE_TAG: &str = "</span>";

    /// True for the bytes that may directly follow the tag name `span`.
    fn ends_tag_name(byte: u8) -> bool {
        matches!(byte, b' ' | b'\t' | b'\n' | b'\r' | b'>')
    }

    let bytes = text.as_bytes();

    if !text.starts_with(OPEN_PREFIX) {
        return None;
    }

    // The byte after `<span` must terminate the tag name; otherwise this is
    // some other tag such as `<spanx>` or `<spanner>`.
    let attr_start = OPEN_PREFIX.len();
    if !bytes.get(attr_start).copied().is_some_and(ends_tag_name) {
        return None;
    }

    // Scan the attribute region up to the `>` that closes the open tag,
    // stepping over quoted values so that a `>` inside them is ignored.
    let mut pos = attr_start;
    while pos < bytes.len() && bytes[pos] != b'>' {
        let byte = bytes[pos];
        pos += 1;
        if byte == b'"' || byte == b'\'' {
            // Jump past the matching closing quote. No escape handling: a
            // backslash is literal in HTML attribute values. An unterminated
            // quote means the tag has no closing `>`, so give up.
            pos += bytes[pos..].iter().position(|&b| b == byte)? + 1;
        }
    }

    if pos >= bytes.len() {
        // No closing `>` found.
        return None;
    }

    // `pos` sits on the ASCII `>`, so this slice is on char boundaries.
    let attributes = text[attr_start..pos].trim().to_string();

    // Walk the content, tracking nesting depth, until the matching `</span>`.
    let content_start = pos + 1;
    let mut pos = content_start;
    let mut depth = 1usize;

    while pos < bytes.len() {
        let tail = &bytes[pos..];

        if tail.starts_with(CLOSE_TAG.as_bytes()) {
            depth -= 1;
            if depth == 0 {
                // Found the close tag that balances the outermost open tag.
                let content = &text[content_start..pos];
                return Some((pos + CLOSE_TAG.len(), content, attributes));
            }
            pos += CLOSE_TAG.len();
        } else if tail.starts_with(OPEN_PREFIX.as_bytes())
            && tail
                .get(OPEN_PREFIX.len())
                .copied()
                .is_some_and(ends_tag_name)
        {
            // A genuine nested `<span` (not e.g. `<spanner`).
            depth += 1;
            pos += OPEN_PREFIX.len();
        } else {
            // Advance by a whole UTF-8 character so later string slicing
            // stays on char boundaries.
            pos += text[pos..].chars().next().map_or(1, char::len_utf8);
        }
    }

    // No matching closing tag found.
    None
}
121
122/// Emit a native span node to the builder.
123///
124/// `raw` is the full byte slice of the matched span (`<span...>content</span>`)
125/// so the open-tag bytes can be tokenized byte-exactly. Under
126/// `Dialect::Pandoc`, the wrapper is `INLINE_HTML_SPAN` and the open tag's
127/// attribute region is exposed structurally as `HTML_ATTRS` (mirroring the
128/// `HTML_BLOCK_DIV` pattern). Under `Dialect::CommonMark` (with the
129/// `native_spans` extension explicitly enabled), the legacy `BRACKETED_SPAN`
130/// shape is preserved for backward compatibility.
131pub(crate) fn emit_native_span(
132    builder: &mut impl InlineSink,
133    raw: &str,
134    content: &str,
135    config: &ParserOptions,
136    suppress_footnote_refs: bool,
137) {
138    let close_tag = "</span>";
139    let open_tag_end = raw.len().saturating_sub(content.len() + close_tag.len());
140    let open_tag = &raw[..open_tag_end];
141
142    if config.dialect == Dialect::Pandoc {
143        builder.start_node(SyntaxKind::INLINE_HTML_SPAN.into());
144        emit_span_open_tag_tokens(builder, open_tag);
145        builder.start_node(SyntaxKind::SPAN_CONTENT.into());
146        parse_inline_text(builder, content, config, false, suppress_footnote_refs);
147        builder.finish_node();
148        builder.token(SyntaxKind::SPAN_BRACKET_CLOSE.into(), close_tag);
149        builder.finish_node();
150        return;
151    }
152
153    // Legacy CommonMark + native_spans extension path: keep the original
154    // BRACKETED_SPAN shape. (Note: this path collapses multi-whitespace
155    // attribute regions to a single space; it's a pre-existing minor
156    // losslessness divergence not worth diverging the legacy shape for.)
157    let attrs_text = open_tag
158        .strip_prefix("<span")
159        .and_then(|s| s.strip_suffix('>'))
160        .map(str::trim)
161        .unwrap_or("");
162    builder.start_node(SyntaxKind::BRACKETED_SPAN.into());
163    builder.token(SyntaxKind::SPAN_BRACKET_OPEN.into(), "<span");
164    if !attrs_text.is_empty() {
165        builder.token(SyntaxKind::WHITESPACE.into(), " ");
166        emit_html_span_attributes_node(builder, attrs_text);
167    }
168    builder.token(SyntaxKind::SPAN_BRACKET_OPEN.into(), ">");
169    builder.start_node(SyntaxKind::SPAN_CONTENT.into());
170    parse_inline_text(builder, content, config, false, suppress_footnote_refs);
171    builder.finish_node();
172    builder.token(SyntaxKind::SPAN_BRACKET_CLOSE.into(), close_tag);
173    builder.finish_node();
174}
175
176/// Tokenize the open tag of an inline `<span ...>` byte-exactly into:
177/// `TEXT("<span") + (WHITESPACE + HTML_ATTRS{TEXT(attrs)} + WHITESPACE?)?
178/// + TEXT(">")`.
179///
180/// Bytes are byte-identical to the source — only the tokenization
181/// granularity changes so `AttributeNode::cast(HTML_ATTRS)` can read the
182/// attribute region structurally. Mirrors `emit_div_open_tag_tokens`.
183fn emit_span_open_tag_tokens(builder: &mut impl InlineSink, open_tag: &str) {
184    let Some(rest) = open_tag.strip_prefix("<span") else {
185        // Defensive — shouldn't happen since try_parse_native_span gates on
186        // <span. Fall back to a single TEXT token to stay lossless.
187        builder.token(SyntaxKind::TEXT.into(), open_tag);
188        return;
189    };
190    builder.token(SyntaxKind::TEXT.into(), "<span");
191    let Some(inside) = rest.strip_suffix('>') else {
192        builder.token(SyntaxKind::TEXT.into(), rest);
193        return;
194    };
195    let bytes = inside.as_bytes();
196    // Split into leading WS, attribute body, trailing WS.
197    let leading_ws_end = bytes
198        .iter()
199        .position(|&b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
200        .unwrap_or(bytes.len());
201    let leading_ws = &inside[..leading_ws_end];
202    let after_leading = &inside[leading_ws_end..];
203    let trailing_ws_start = after_leading
204        .as_bytes()
205        .iter()
206        .rposition(|&b| !matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
207        .map(|i| i + 1)
208        .unwrap_or(0);
209    let attrs_text = &after_leading[..trailing_ws_start];
210    let trailing_ws = &after_leading[trailing_ws_start..];
211
212    if !leading_ws.is_empty() {
213        builder.token(SyntaxKind::WHITESPACE.into(), leading_ws);
214    }
215    if !attrs_text.is_empty() {
216        emit_html_attrs_node(builder, attrs_text);
217    }
218    if !trailing_ws.is_empty() {
219        builder.token(SyntaxKind::WHITESPACE.into(), trailing_ws);
220    }
221    builder.token(SyntaxKind::TEXT.into(), ">");
222}
223
#[cfg(test)]
mod tests {
    use super::*;

    /// Build the expected successful parse result for a span.
    fn parsed<'a>(len: usize, content: &'a str, attrs: &str) -> Option<(usize, &'a str, String)> {
        Some((len, content, attrs.to_string()))
    }

    #[test]
    fn test_parse_simple_span() {
        assert_eq!(
            try_parse_native_span("<span>text</span>"),
            parsed(17, "text", "")
        );
    }

    #[test]
    fn test_parse_span_with_class() {
        assert_eq!(
            try_parse_native_span(r#"<span class="foo">text</span>"#),
            parsed(29, "text", r#"class="foo""#)
        );
    }

    #[test]
    fn test_parse_span_with_id() {
        assert_eq!(
            try_parse_native_span(r#"<span id="bar">text</span>"#),
            parsed(26, "text", r#"id="bar""#)
        );
    }

    #[test]
    fn test_parse_span_with_multiple_attrs() {
        assert_eq!(
            try_parse_native_span(r#"<span id="x" class="y z">text</span>"#),
            parsed(36, "text", r#"id="x" class="y z""#)
        );
    }

    #[test]
    fn test_parse_span_with_markdown() {
        // Markdown inside the span is returned verbatim for later parsing.
        assert_eq!(
            try_parse_native_span("<span>*emphasis* and `code`</span>"),
            parsed(34, "*emphasis* and `code`", "")
        );
    }

    #[test]
    fn test_parse_nested_spans() {
        // The inner span stays part of the outer span's content.
        assert_eq!(
            try_parse_native_span("<span>outer <span>inner</span> text</span>"),
            parsed(42, "outer <span>inner</span> text", "")
        );
    }

    #[test]
    fn test_parse_span_with_newlines_in_content() {
        assert_eq!(
            try_parse_native_span("<span>line 1\nline 2</span>"),
            parsed(26, "line 1\nline 2", "")
        );
    }

    #[test]
    fn test_not_span_no_closing_tag() {
        assert_eq!(try_parse_native_span("<span>text"), None);
    }

    #[test]
    fn test_not_span_wrong_tag() {
        assert_eq!(try_parse_native_span("<spanx>text</spanx>"), None);
    }

    #[test]
    fn test_not_span_no_space_after() {
        // <spanner> must not be mistaken for <span>.
        assert_eq!(try_parse_native_span("<spanner>text</spanner>"), None);
    }

    #[test]
    fn test_parse_span_with_quoted_attrs_containing_gt() {
        // A `>` inside a quoted value does not terminate the open tag.
        assert_eq!(
            try_parse_native_span(r#"<span title="a > b">text</span>"#),
            parsed(31, "text", r#"title="a > b""#)
        );
    }

    #[test]
    fn test_parse_empty_span() {
        assert_eq!(try_parse_native_span("<span></span>"), parsed(13, "", ""));
    }

    #[test]
    fn test_parse_span_trailing_text() {
        // Only the span itself is consumed; trailing text is left alone.
        assert_eq!(
            try_parse_native_span("<span>text</span> more"),
            parsed(17, "text", "")
        );
    }

    #[test]
    fn test_parse_span_with_non_ascii_content() {
        // The reported length is in bytes, not characters.
        assert_eq!(
            try_parse_native_span(r#"<span class="rtl">(شربنا من النيل)</span>"#),
            parsed(53, "(شربنا من النيل)", r#"class="rtl""#)
        );
    }
}