Skip to main content

panache_parser/parser/inlines/
native_spans.rs

1//! Native span parsing for Pandoc's `native_spans` extension.
2//!
3//! Syntax: `<span class="foo">content</span>`
4//!
5//! When the `native_spans` extension is enabled, HTML `<span>` tags are
6//! treated as native Pandoc Span elements instead of raw HTML.
7
8use crate::options::ParserOptions;
9use crate::syntax::SyntaxKind;
10use rowan::GreenNodeBuilder;
11
12use super::core::parse_inline_text;
13
/// Literal prefix of an opening span tag (the tag name without attributes).
const SPAN_OPEN: &str = "<span";

/// Literal closing span tag.
const SPAN_CLOSE: &str = "</span>";

/// Returns `true` if `byte` may directly follow `<span` in a real span tag:
/// ASCII whitespace (start of attributes) or `>` (end of the tag). Anything
/// else means a longer tag name such as `<spanner>`.
fn ends_span_tag_name(byte: u8) -> bool {
    matches!(byte, b' ' | b'\t' | b'\n' | b'\r' | b'>')
}

/// Returns `true` if `rest` begins with `<span` followed by a byte that ends
/// the tag name. A bare `<span` at the very end of the input does not count.
fn opens_span_tag(rest: &str) -> bool {
    rest.strip_prefix(SPAN_OPEN)
        .and_then(|tail| tail.bytes().next())
        .is_some_and(ends_span_tag_name)
}

/// Try to parse a native HTML span at the very start of `text`.
///
/// Native spans have the form `<span attrs...>content</span>`. The content may
/// itself contain nested `<span>` elements; the `</span>` that balances the
/// opening tag terminates the span.
///
/// Returns `Some((length, content, attributes))` on success, where:
/// - `length` is the number of bytes consumed, from the leading `<` through
///   the final `>` of the matching `</span>`;
/// - `content` is the text between the opening tag and the matching closing
///   tag, borrowed from `text` and left uninterpreted here;
/// - `attributes` is the text between `<span` and the tag's closing `>`,
///   trimmed of surrounding whitespace.
///
/// Returns `None` if `text` does not start with a `<span` tag, if the opening
/// tag is never closed by `>` (this includes an unterminated quoted attribute
/// value), or if no balancing `</span>` is found.
///
/// Quoted attribute values follow HTML rules: a value opened with `"` or `'`
/// ends at the next occurrence of the same quote character, and a backslash
/// has no special meaning. A `>` inside a quoted value does not end the tag.
pub(crate) fn try_parse_native_span(text: &str) -> Option<(usize, &str, String)> {
    if !opens_span_tag(text) {
        return None;
    }

    let bytes = text.as_bytes();

    // Locate the `>` that closes the opening tag, ignoring any `>` that sits
    // inside a quoted attribute value. Running off the end of the input
    // (`bytes.get` yields `None`) means the tag was never closed.
    let attr_start = SPAN_OPEN.len();
    let mut pos = attr_start;
    let mut open_quote: Option<u8> = None;
    let tag_end = loop {
        let byte = *bytes.get(pos)?;
        match open_quote {
            // Inside a quoted value only the matching quote is significant.
            Some(quote) => {
                if byte == quote {
                    open_quote = None;
                }
            }
            None => match byte {
                b'>' => break pos,
                b'"' | b'\'' => open_quote = Some(byte),
                _ => {}
            },
        }
        pos += 1;
    };

    // `tag_end` indexes an ASCII `>`, so both slice bounds are char boundaries.
    let attributes = text[attr_start..tag_end].trim().to_string();

    // Walk the content, tracking nesting depth, until the closing tag that
    // balances the opening one. `pos` only ever advances past whole ASCII tag
    // literals or whole characters, so it stays on a char boundary.
    let content_start = tag_end + 1;
    let mut pos = content_start;
    let mut depth = 1usize;

    while pos < text.len() {
        let rest = &text[pos..];

        if rest.starts_with(SPAN_CLOSE) {
            depth -= 1;
            if depth == 0 {
                let content = &text[content_start..pos];
                return Some((pos + SPAN_CLOSE.len(), content, attributes));
            }
            pos += SPAN_CLOSE.len();
        } else if opens_span_tag(rest) {
            // A nested opening tag: its own closing tag must be skipped first.
            depth += 1;
            pos += SPAN_OPEN.len();
        } else {
            // Step over one full character (possibly multi-byte).
            pos += rest.chars().next().map_or(1, char::len_utf8);
        }
    }

    // No matching closing tag found.
    None
}
120
121/// Emit a native span node to the builder.
122pub(crate) fn emit_native_span(
123    builder: &mut GreenNodeBuilder,
124    content: &str,
125    attributes: &str,
126    config: &ParserOptions,
127) {
128    builder.start_node(SyntaxKind::BRACKETED_SPAN.into());
129
130    // Opening tag
131    builder.token(SyntaxKind::SPAN_BRACKET_OPEN.into(), "<span");
132    if !attributes.is_empty() {
133        // Add space before attributes
134        builder.token(SyntaxKind::WHITESPACE.into(), " ");
135        builder.token(SyntaxKind::SPAN_ATTRIBUTES.into(), attributes);
136    }
137    builder.token(SyntaxKind::SPAN_BRACKET_OPEN.into(), ">");
138
139    // Parse the content recursively for inline markdown
140    builder.start_node(SyntaxKind::SPAN_CONTENT.into());
141    parse_inline_text(builder, content, config, false);
142    builder.finish_node();
143
144    // Closing tag
145    builder.token(SyntaxKind::SPAN_BRACKET_CLOSE.into(), "</span>");
146
147    builder.finish_node();
148}
149
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `input` parses as a native span consuming `len` bytes,
    /// with the given inner `content` and trimmed attribute text `attrs`.
    #[track_caller]
    fn assert_span(input: &str, len: usize, content: &str, attrs: &str) {
        assert_eq!(
            try_parse_native_span(input),
            Some((len, content, attrs.to_string()))
        );
    }

    /// Assert that `input` is not recognised as a native span.
    #[track_caller]
    fn assert_no_span(input: &str) {
        assert_eq!(try_parse_native_span(input), None);
    }

    #[test]
    fn test_parse_simple_span() {
        assert_span("<span>text</span>", 17, "text", "");
    }

    #[test]
    fn test_parse_span_with_class() {
        assert_span(r#"<span class="foo">text</span>"#, 29, "text", r#"class="foo""#);
    }

    #[test]
    fn test_parse_span_with_id() {
        assert_span(r#"<span id="bar">text</span>"#, 26, "text", r#"id="bar""#);
    }

    #[test]
    fn test_parse_span_with_multiple_attrs() {
        assert_span(
            r#"<span id="x" class="y z">text</span>"#,
            36,
            "text",
            r#"id="x" class="y z""#,
        );
    }

    #[test]
    fn test_parse_span_with_markdown() {
        // Inline markdown is returned untouched; parsing it is the emitter's job.
        assert_span(
            "<span>*emphasis* and `code`</span>",
            34,
            "*emphasis* and `code`",
            "",
        );
    }

    #[test]
    fn test_parse_nested_spans() {
        // The outer span ends at the closing tag that balances it.
        assert_span(
            "<span>outer <span>inner</span> text</span>",
            42,
            "outer <span>inner</span> text",
            "",
        );
    }

    #[test]
    fn test_parse_span_with_newlines_in_content() {
        assert_span("<span>line 1\nline 2</span>", 26, "line 1\nline 2", "");
    }

    #[test]
    fn test_not_span_no_closing_tag() {
        assert_no_span("<span>text");
    }

    #[test]
    fn test_not_span_wrong_tag() {
        assert_no_span("<spanx>text</spanx>");
    }

    #[test]
    fn test_not_span_no_space_after() {
        // <spanner> should not be parsed as <span>
        assert_no_span("<spanner>text</spanner>");
    }

    #[test]
    fn test_parse_span_with_quoted_attrs_containing_gt() {
        // A `>` inside a quoted value must not end the opening tag.
        assert_span(
            r#"<span title="a > b">text</span>"#,
            31,
            "text",
            r#"title="a > b""#,
        );
    }

    #[test]
    fn test_parse_empty_span() {
        assert_span("<span></span>", 13, "", "");
    }

    #[test]
    fn test_parse_span_trailing_text() {
        // Only the span itself is consumed; trailing text is left alone.
        assert_span("<span>text</span> more", 17, "text", "");
    }

    #[test]
    fn test_parse_span_with_non_ascii_content() {
        // Lengths are byte counts, so multi-byte characters count per byte.
        assert_span(
            r#"<span class="rtl">(شربنا من النيل)</span>"#,
            53,
            "(شربنا من النيل)",
            r#"class="rtl""#,
        );
    }
}