// panache-parser 0.10.0
//
// Lossless CST parser and syntax wrappers for Pandoc markdown, Quarto, and RMarkdown.
// Documentation
//! Native span parsing for Pandoc's `native_spans` extension.
//!
//! Syntax: `<span class="foo">content</span>`
//!
//! When the `native_spans` extension is enabled, HTML `<span>` tags are
//! treated as native Pandoc Span elements instead of raw HTML.

use crate::options::{Dialect, ParserOptions};
use crate::syntax::SyntaxKind;
use rowan::GreenNodeBuilder;

use super::core::parse_inline_text;

/// Try to parse a native HTML span that starts at the beginning of `text`.
///
/// Native spans have the form `<span attrs...>content</span>`. The content may
/// itself contain nested `<span>` elements; the closing tag that balances the
/// opening one is the one that ends the span.
///
/// Returns `Some((length, content, attributes))` on success, where:
/// - `length` is the byte length of the whole span, including `</span>`;
/// - `content` is the slice of `text` between the open tag and the matching
///   close tag (unparsed — it can contain markdown and nested spans);
/// - `attributes` is the trimmed text between `<span` and the closing `>` of
///   the open tag.
///
/// Returns `None` when `text` does not start with a `<span` tag (for example
/// `<spanner>`), when the open tag is never terminated by `>`, or when no
/// balancing `</span>` is found.
pub(crate) fn try_parse_native_span(text: &str) -> Option<(usize, &str, String)> {
    const CLOSE_TAG: &[u8] = b"</span>";

    let bytes = text.as_bytes();

    // The outer open tag: `<span`, a delimiter, then attributes up to `>`.
    let attr_start = span_open_name_end(bytes, 0)?;
    let open_tag_end = find_open_tag_end(bytes, attr_start, false)?;
    let attributes = text[attr_start..open_tag_end].trim().to_string();

    let content_start = open_tag_end + 1;
    let mut pos = content_start;
    let mut depth = 1usize;

    while pos < bytes.len() {
        if let Some(name_end) = span_open_name_end(bytes, pos) {
            depth += 1;
            // Skip the nested open tag as a unit so that `</span>` or `<span `
            // inside one of its quoted attribute values is not miscounted.
            // If the candidate is not a well-formed tag, only step past the
            // tag name and keep scanning its remainder as ordinary text.
            pos = find_open_tag_end(bytes, name_end, true).map_or(name_end, |end| end + 1);
            continue;
        }

        if bytes[pos..].starts_with(CLOSE_TAG) {
            depth -= 1;
            let after_close = pos + CLOSE_TAG.len();
            if depth == 0 {
                return Some((after_close, &text[content_start..pos], attributes));
            }
            pos = after_close;
            continue;
        }

        // Advance by a whole UTF-8 character so `pos` stays on a char
        // boundary for the string slicing above.
        pos += text[pos..].chars().next().map_or(1, char::len_utf8);
    }

    // Ran out of input before the opening tag was balanced.
    None
}

/// If `bytes[at..]` starts with `<span` followed by whitespace or `>`, return
/// the index just past the tag name (`at + 5`).
///
/// The delimiter check is what keeps `<spanx>` / `<spanner>` from being
/// treated as span tags.
fn span_open_name_end(bytes: &[u8], at: usize) -> Option<usize> {
    let name_end = at + 5; // "<span".len()
    let is_span_tag = bytes.get(at..)?.starts_with(b"<span")
        && matches!(
            bytes.get(name_end).copied(),
            Some(b' ' | b'\t' | b'\n' | b'\r' | b'>')
        );
    is_span_tag.then_some(name_end)
}

/// Find the index of the `>` that terminates an open tag, scanning from
/// `start` (a position inside the tag, after its name).
///
/// Single- and double-quoted attribute values are skipped, so a `>` inside
/// quotes does not end the tag; inside quotes a backslash skips the following
/// byte. When `reject_unquoted_lt` is set, an unquoted `<` before the `>`
/// makes the scan give up, so the caller can treat the candidate as plain
/// text instead of swallowing a following tag.
///
/// Returns `None` if the input ends before an unquoted `>` is seen.
fn find_open_tag_end(bytes: &[u8], start: usize, reject_unquoted_lt: bool) -> Option<usize> {
    let mut pos = start;
    while pos < bytes.len() {
        match bytes[pos] {
            b'>' => return Some(pos),
            b'<' if reject_unquoted_lt => return None,
            quote @ (b'"' | b'\'') => {
                pos += 1;
                while pos < bytes.len() && bytes[pos] != quote {
                    pos += if bytes[pos] == b'\\' { 2 } else { 1 };
                }
                // Step past the closing quote. If the quote was never closed
                // this only moves further past the end and the loop exits.
                pos += 1;
            }
            _ => pos += 1,
        }
    }
    None
}

/// Emit a native span node to the builder.
///
/// `raw` is the full text of the matched span (`<span...>content</span>`) and
/// `content` is the part between the open tag and `</span>`; the open tag is
/// recovered as whatever precedes `content` + `</span>` in `raw`.
///
/// The node shape depends on the dialect:
/// - `Dialect::Pandoc`: an `INLINE_HTML_SPAN` wrapper whose open tag is
///   tokenized byte-exactly by `emit_span_open_tag_tokens`, exposing the
///   attribute region as `HTML_ATTRS`.
/// - any other dialect (the legacy CommonMark + `native_spans` path): the
///   original `BRACKETED_SPAN` shape, kept for backward compatibility.
///
/// In both shapes the content is parsed recursively into a `SPAN_CONTENT`
/// node and followed by a `SPAN_BRACKET_CLOSE` token holding `</span>`.
pub(crate) fn emit_native_span(
    builder: &mut GreenNodeBuilder,
    raw: &str,
    content: &str,
    config: &ParserOptions,
    suppress_footnote_refs: bool,
) {
    const CLOSE_TAG: &str = "</span>";

    // Everything in `raw` ahead of `content` + `</span>` is the open tag.
    let tail_len = content.len() + CLOSE_TAG.len();
    let open_tag = &raw[..raw.len().saturating_sub(tail_len)];

    let is_pandoc = config.dialect == Dialect::Pandoc;
    let wrapper_kind = if is_pandoc {
        SyntaxKind::INLINE_HTML_SPAN
    } else {
        SyntaxKind::BRACKETED_SPAN
    };
    builder.start_node(wrapper_kind.into());

    if is_pandoc {
        emit_span_open_tag_tokens(builder, open_tag);
    } else {
        // Legacy shape. Note that whitespace around the attribute region is
        // normalized to a single space here, so this path is not byte-exact;
        // that is a known, accepted divergence of the legacy shape.
        let attrs_text = match open_tag
            .strip_prefix("<span")
            .and_then(|rest| rest.strip_suffix('>'))
        {
            Some(inner) => inner.trim(),
            None => "",
        };
        builder.token(SyntaxKind::SPAN_BRACKET_OPEN.into(), "<span");
        if !attrs_text.is_empty() {
            builder.token(SyntaxKind::WHITESPACE.into(), " ");
            builder.token(SyntaxKind::SPAN_ATTRIBUTES.into(), attrs_text);
        }
        builder.token(SyntaxKind::SPAN_BRACKET_OPEN.into(), ">");
    }

    // Shared tail: recursively parsed content, then the close tag.
    builder.start_node(SyntaxKind::SPAN_CONTENT.into());
    parse_inline_text(builder, content, config, false, suppress_footnote_refs);
    builder.finish_node();
    builder.token(SyntaxKind::SPAN_BRACKET_CLOSE.into(), CLOSE_TAG);
    builder.finish_node();
}

/// Tokenize the open tag of an inline `<span ...>` byte-exactly.
///
/// The emitted sequence is
/// `TEXT("<span") + WHITESPACE? + HTML_ATTRS{TEXT(attrs)}? + WHITESPACE? + TEXT(">")`,
/// where each optional piece is emitted only when it is non-empty.
///
/// The concatenated token text equals `open_tag`; only the tokenization
/// granularity differs, so the attribute region is available as its own
/// `HTML_ATTRS` node.
///
/// If `open_tag` does not start with `<span`, it is emitted as a single
/// `TEXT` token. If it does not end with `>`, everything after `<span` is
/// emitted as a single `TEXT` token.
fn emit_span_open_tag_tokens(builder: &mut GreenNodeBuilder<'_>, open_tag: &str) {
    // Whitespace recognized inside the tag (ASCII only).
    const TAG_WS: &[char] = &[' ', '\t', '\n', '\r'];

    let rest = match open_tag.strip_prefix("<span") {
        Some(rest) => rest,
        None => {
            // Defensive: callers are expected to pass a `<span` tag. Keep the
            // text intact as one token rather than dropping it.
            builder.token(SyntaxKind::TEXT.into(), open_tag);
            return;
        }
    };
    builder.token(SyntaxKind::TEXT.into(), "<span");

    let inside = match rest.strip_suffix('>') {
        Some(inside) => inside,
        None => {
            builder.token(SyntaxKind::TEXT.into(), rest);
            return;
        }
    };

    // Peel leading whitespace, then trailing whitespace, off the attribute
    // body. The three pieces concatenate back to `inside`.
    let after_leading = inside.trim_start_matches(TAG_WS);
    let leading_ws = &inside[..inside.len() - after_leading.len()];
    let attrs_text = after_leading.trim_end_matches(TAG_WS);
    let trailing_ws = &after_leading[attrs_text.len()..];

    if !leading_ws.is_empty() {
        builder.token(SyntaxKind::WHITESPACE.into(), leading_ws);
    }
    if !attrs_text.is_empty() {
        builder.start_node(SyntaxKind::HTML_ATTRS.into());
        builder.token(SyntaxKind::TEXT.into(), attrs_text);
        builder.finish_node();
    }
    if !trailing_ws.is_empty() {
        builder.token(SyntaxKind::WHITESPACE.into(), trailing_ws);
    }
    builder.token(SyntaxKind::TEXT.into(), ">");
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `input` parses as a span of `len` bytes with the given
    /// inner `content` and trimmed `attrs`.
    #[track_caller]
    fn assert_span(input: &str, len: usize, content: &str, attrs: &str) {
        assert_eq!(
            try_parse_native_span(input),
            Some((len, content, attrs.to_string()))
        );
    }

    /// Assert that `input` is not recognized as a native span.
    #[track_caller]
    fn assert_rejected(input: &str) {
        assert_eq!(try_parse_native_span(input), None);
    }

    #[test]
    fn test_parse_simple_span() {
        assert_span("<span>text</span>", 17, "text", "");
    }

    #[test]
    fn test_parse_span_with_class() {
        assert_span(
            r#"<span class="foo">text</span>"#,
            29,
            "text",
            r#"class="foo""#,
        );
    }

    #[test]
    fn test_parse_span_with_id() {
        assert_span(r#"<span id="bar">text</span>"#, 26, "text", r#"id="bar""#);
    }

    #[test]
    fn test_parse_span_with_multiple_attrs() {
        assert_span(
            r#"<span id="x" class="y z">text</span>"#,
            36,
            "text",
            r#"id="x" class="y z""#,
        );
    }

    #[test]
    fn test_parse_span_with_markdown() {
        assert_span(
            "<span>*emphasis* and `code`</span>",
            34,
            "*emphasis* and `code`",
            "",
        );
    }

    #[test]
    fn test_parse_nested_spans() {
        assert_span(
            "<span>outer <span>inner</span> text</span>",
            42,
            "outer <span>inner</span> text",
            "",
        );
    }

    #[test]
    fn test_parse_span_with_newlines_in_content() {
        assert_span("<span>line 1\nline 2</span>", 26, "line 1\nline 2", "");
    }

    #[test]
    fn test_not_span_no_closing_tag() {
        assert_rejected("<span>text");
    }

    #[test]
    fn test_not_span_wrong_tag() {
        assert_rejected("<spanx>text</spanx>");
    }

    #[test]
    fn test_not_span_no_space_after() {
        // <spanner> should not be parsed as <span>
        assert_rejected("<spanner>text</spanner>");
    }

    #[test]
    fn test_parse_span_with_quoted_attrs_containing_gt() {
        // A `>` inside a quoted attribute value must not end the open tag.
        assert_span(
            r#"<span title="a > b">text</span>"#,
            31,
            "text",
            r#"title="a > b""#,
        );
    }

    #[test]
    fn test_parse_empty_span() {
        assert_span("<span></span>", 13, "", "");
    }

    #[test]
    fn test_parse_span_trailing_text() {
        // Text after the closing tag is not part of the reported length.
        assert_span("<span>text</span> more", 17, "text", "");
    }

    #[test]
    fn test_parse_span_with_non_ascii_content() {
        // Lengths are byte lengths, so multi-byte characters count per byte.
        assert_span(
            r#"<span class="rtl">(شربنا من النيل)</span>"#,
            53,
            "(شربنا من النيل)",
            r#"class="rtl""#,
        );
    }
}