Skip to main content

panache_parser/parser/utils/
attributes.rs

1//! Parsing for Pandoc-style attributes: {#id .class key=value}
2//!
3//! Attributes can appear after headings, fenced code blocks, fenced divs, etc.
4//! Syntax: {#identifier .class1 .class2 key1=val1 key2="val2"}
5//!
6//! Rules:
7//! - Surrounded by { }
8//! - Identifier: #id (optional, only first one counts)
9//! - Classes: .class (can have multiple)
10//! - Key-value pairs: key=value or key="value" or key='value' (can have multiple)
11//! - Whitespace flexible between items
12
13use crate::parser::inlines::sink::InlineSink;
14use crate::syntax::SyntaxKind;
15#[cfg(test)]
16use rowan::GreenNodeBuilder;
17
/// Owned, semantic view of one attribute block, produced both by the Pandoc
/// `{...}` parser ([`parse_attribute_content`]) and by the HTML attribute
/// parser ([`parse_html_attribute_list`]).
#[derive(Debug, PartialEq)]
pub struct AttributeBlock {
    /// The identifier without any marker (`#id` → `id`); only the first
    /// identifier found in the source is kept.
    pub identifier: Option<String>,
    /// Classes in source order. Pandoc `.class` entries are stored without the
    /// leading dot; a Pandoc `=format` raw marker is stored with its `=`.
    pub classes: Vec<String>,
    /// `(key, value)` pairs in source order, with surrounding value quotes
    /// removed. Valueless HTML attributes appear with an empty value.
    pub key_values: Vec<(String, String)>,
}
24
25/// Try to parse an attribute block from the end of a string
26/// Returns: (attribute_block, text_before_attributes)
27pub fn try_parse_trailing_attributes(text: &str) -> Option<(AttributeBlock, &str)> {
28    let (attrs, before, _) = try_parse_trailing_attributes_with_pos(text)?;
29    Some((attrs, before))
30}
31
32/// Try to parse an attribute block from the end of a string.
33/// Returns: (attribute_block, text_before_attributes, open_brace_position_in_trimmed_text)
34pub fn try_parse_trailing_attributes_with_pos(text: &str) -> Option<(AttributeBlock, &str, usize)> {
35    let trimmed = text.trim_end();
36
37    // Must end with }
38    if !trimmed.ends_with('}') {
39        return None;
40    }
41
42    // Find matching opening brace for the trailing attribute block, accounting
43    // for braces inside quoted attribute values.
44    let open_brace = find_matching_open_brace_for_trailing_block(trimmed)?;
45
46    // Check if this is a bracketed span like [text]{.class} rather than a heading attribute
47    // If the { is immediately after ] (with optional whitespace), this should be parsed as a span
48    let before_brace = &trimmed[..open_brace];
49    if before_brace.trim_end().ends_with(']') {
50        log::trace!("Skipping attribute parsing for bracketed span: {}", text);
51        return None;
52    }
53
54    // Parse the content between { and }
55    let attr_content = &trimmed[open_brace + 1..trimmed.len() - 1];
56    let attr_block = parse_attribute_content(attr_content)?;
57
58    // Get text before attributes (trim trailing whitespace)
59    let before_attrs = trimmed[..open_brace].trim_end();
60
61    Some((attr_block, before_attrs, open_brace))
62}
63
/// Find the byte offset of the `{` that pairs with the final `}` of `text`.
///
/// Braces inside quoted attribute values (`{key="a}b"}`) are not treated as
/// delimiters, and a backslash inside a quoted value escapes the next
/// character.
///
/// Two passes are made:
/// 1. A strict pass that tracks quotes everywhere and requires every brace in
///    `text` to be balanced.
/// 2. If that fails, a lenient pass that only tracks quotes *inside* braces
///    and ignores stray braces in the preceding prose. This lets text such as
///    `Don't stop {#id}` (an apostrophe in the heading prose) or
///    `a } b {.c}` still yield the trailing block.
///
/// Returns `None` when `text` does not end with `}` or no opening brace pairs
/// with that final `}`.
fn find_matching_open_brace_for_trailing_block(text: &str) -> Option<usize> {
    if !text.ends_with('}') {
        return None;
    }

    scan_for_trailing_block_open(text, true).or_else(|| scan_for_trailing_block_open(text, false))
}

/// Single left-to-right scan backing
/// [`find_matching_open_brace_for_trailing_block`].
///
/// With `strict` set, quotes are tracked everywhere and any unbalanced brace
/// rejects the input. Without it, quotes only count while at least one `{` is
/// open, and unmatched braces outside the trailing block are ignored.
fn scan_for_trailing_block_open(text: &str, strict: bool) -> Option<usize> {
    // `}` is a single byte, so the final `}` sits at `len - 1`.
    let last_idx = text.len().checked_sub(1)?;

    let mut open_braces: Vec<usize> = Vec::new();
    let mut in_quote: Option<char> = None;
    let mut escaped = false;
    let mut end_brace_open = None;

    for (idx, ch) in text.char_indices() {
        if let Some(quote) = in_quote {
            if escaped {
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == quote {
                in_quote = None;
            }
            continue;
        }

        match ch {
            // In lenient mode an apostrophe in prose is just text.
            '\'' | '"' if strict || !open_braces.is_empty() => in_quote = Some(ch),
            '{' => open_braces.push(idx),
            '}' => match open_braces.pop() {
                Some(open) => {
                    if idx == last_idx {
                        end_brace_open = Some(open);
                    }
                }
                None if strict => return None,
                None => {}
            },
            _ => {}
        }
    }

    // An unterminated quote always swallows the final `}`; leftover open
    // braces only matter to the strict pass.
    if in_quote.is_some() || (strict && !open_braces.is_empty()) {
        return None;
    }

    end_brace_open
}
109
/// One recognized component inside an attribute `{...}` body, as byte ranges
/// relative to the `content` slice passed to [`attribute_content_spans`] (the
/// bytes strictly between `{` and `}`). The `#`/`.`/`=format` marker bytes and
/// value quotes are kept INSIDE the ranges so the emitter can wrap the exact
/// source bytes; the string-deriving helpers strip them. For `key=value` the
/// `=` separator is not part of either range and is recorded separately.
#[derive(Debug, Clone, PartialEq)]
pub(crate) enum AttrComponent {
    /// `#id` — range includes the leading `#`.
    Id(std::ops::Range<usize>),
    /// `.class` or `=format` — range includes the leading `.`/`=` marker.
    Class(std::ops::Range<usize>),
    /// `key=value`: key range, `=` byte index, value range (the value range
    /// includes surrounding quotes when present, and may be empty).
    KeyValue {
        key: std::ops::Range<usize>,
        eq: usize,
        value: std::ops::Range<usize>,
    },
}
129
/// Recognized components of an attribute `{...}` body, in source order. The
/// single source of truth shared by detection ([`parse_attribute_content`],
/// which derives owned strings) and emission (`emit_attribute_node`, which
/// wraps these byte ranges in ATTR_* CST nodes) — one walk, no detect/emit
/// drift. Bytes the scan skips (duplicate `#id`, malformed tokens, whitespace)
/// are not components; the emitter recovers them from the gaps between ranges.
#[derive(Debug, Clone, PartialEq)]
pub(crate) struct AttributeSpans {
    /// Components in the order they appear in the body; never empty when
    /// produced by [`attribute_content_spans`].
    pub components: Vec<AttrComponent>,
}
140
/// Turn the raw bytes of an attribute value into its semantic string.
///
/// When `raw` begins with `"` or `'`, that opening quote is always removed and
/// a closing quote of the same kind is removed if it is the last character.
/// An unterminated quoted value therefore keeps its tail. Unquoted values are
/// returned unchanged.
fn attr_value_string(raw: &str) -> String {
    for quote in ['"', '\''] {
        if let Some(rest) = raw.strip_prefix(quote) {
            return rest.strip_suffix(quote).unwrap_or(rest).to_owned();
        }
    }
    raw.to_owned()
}
156
157/// Scan an attribute `{...}` body into [`AttributeSpans`]. Returns `None` when
158/// no component is recognized (empty/whitespace-only/`{}` is not a valid
159/// attribute block). Offsets are relative to `content`.
160pub(crate) fn attribute_content_spans(content: &str) -> Option<AttributeSpans> {
161    let bytes = content.as_bytes();
162    let mut pos = 0;
163    let mut components: Vec<AttrComponent> = Vec::new();
164    let mut have_id = false;
165
166    while pos < bytes.len() {
167        // Skip whitespace.
168        while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
169            pos += 1;
170        }
171        if pos >= bytes.len() {
172            break;
173        }
174
175        if bytes[pos] == b'=' {
176            // {=format} raw-attribute marker — recorded as a class whose range
177            // includes the `=` (the string derivation keeps the `=`).
178            let start = pos;
179            pos += 1; // skip '='
180            while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() && bytes[pos] != b'}' {
181                pos += 1;
182            }
183            if pos > start + 1 {
184                components.push(AttrComponent::Class(start..pos));
185            }
186        } else if bytes[pos] == b'#' {
187            let start = pos;
188            pos += 1; // skip '#'
189            while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() && bytes[pos] != b'}' {
190                pos += 1;
191            }
192            // Only the first non-empty identifier counts; later `#…` runs and a
193            // bare `#` are scanned but not recorded (recovered from the gap).
194            if !have_id && pos > start + 1 {
195                components.push(AttrComponent::Id(start..pos));
196                have_id = true;
197            }
198        } else if bytes[pos] == b'.' {
199            let start = pos;
200            pos += 1; // skip '.'
201            while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() && bytes[pos] != b'}' {
202                pos += 1;
203            }
204            if pos > start + 1 {
205                components.push(AttrComponent::Class(start..pos));
206            }
207        } else {
208            // key=value
209            let key_start = pos;
210            while pos < bytes.len() && bytes[pos] != b'=' && !bytes[pos].is_ascii_whitespace() {
211                pos += 1;
212            }
213            if pos >= bytes.len() || bytes[pos] != b'=' {
214                // Not a valid key=value: skip the token (recovered from the gap).
215                while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() {
216                    pos += 1;
217                }
218                continue;
219            }
220            let key_end = pos;
221            let eq = pos;
222            pos += 1; // skip '='
223
224            let value_start = pos;
225            if pos < bytes.len() && (bytes[pos] == b'"' || bytes[pos] == b'\'') {
226                let quote = bytes[pos];
227                pos += 1; // opening quote
228                while pos < bytes.len() && bytes[pos] != quote {
229                    pos += 1;
230                }
231                if pos < bytes.len() {
232                    pos += 1; // closing quote
233                }
234            } else {
235                while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() && bytes[pos] != b'}' {
236                    pos += 1;
237                }
238            }
239            if key_end > key_start {
240                components.push(AttrComponent::KeyValue {
241                    key: key_start..key_end,
242                    eq,
243                    value: value_start..pos,
244                });
245            }
246        }
247    }
248
249    if components.is_empty() {
250        return None;
251    }
252    Some(AttributeSpans { components })
253}
254
255/// Parse the content inside the attribute braces into owned strings. Thin
256/// wrapper over [`attribute_content_spans`] so detection and emission share one
257/// walk.
258pub fn parse_attribute_content(content: &str) -> Option<AttributeBlock> {
259    let spans = attribute_content_spans(content)?;
260    let mut identifier = None;
261    let mut classes = Vec::new();
262    let mut key_values = Vec::new();
263
264    for comp in &spans.components {
265        match comp {
266            AttrComponent::Id(r) => {
267                // Range includes '#'; the scanner guarantees a non-empty tail.
268                identifier = Some(content[r.start + 1..r.end].to_string());
269            }
270            AttrComponent::Class(r) => {
271                let raw = &content[r.clone()];
272                // `.class` → `class`; `=format` keeps its `=` prefix.
273                match raw.strip_prefix('.') {
274                    Some(class) => classes.push(class.to_string()),
275                    None => classes.push(raw.to_string()),
276                }
277            }
278            AttrComponent::KeyValue { key, value, .. } => {
279                key_values.push((
280                    content[key.clone()].to_string(),
281                    attr_value_string(&content[value.clone()]),
282                ));
283            }
284        }
285    }
286
287    Some(AttributeBlock {
288        identifier,
289        classes,
290        key_values,
291    })
292}
293
294/// Parse HTML-style attributes from a raw HTML opening tag text such as
295/// `<div id="x" class="a b" data-key="v">`, returning the same
296/// `AttributeBlock` shape as Pandoc-style brace attributes. Whitespace-
297/// separated `class="..."` is split into individual classes; `id="..."`
298/// becomes the identifier; everything else becomes a key/value pair.
299/// Returns `None` if the tag has no recognized attributes.
300///
301/// Self-closing slashes (`<div .../>`) and trailing whitespace are tolerated.
302/// The leading `<TAG` and trailing `>` are stripped; this routine does not
303/// validate the tag name.
304pub fn parse_html_tag_attributes(tag_text: &str) -> Option<AttributeBlock> {
305    let trimmed = tag_text.trim_start();
306    let after_lt = trimmed.strip_prefix('<')?;
307    // Find the end of the opening tag at the first `>` not inside a quoted
308    // attribute value. Anything after that `>` (e.g. inline content + close
309    // tag for a same-line `<div id="x">Content</div>`) is irrelevant.
310    let bytes = after_lt.as_bytes();
311    let mut tag_end = None;
312    let mut quote: Option<u8> = None;
313    for (i, &b) in bytes.iter().enumerate() {
314        match (quote, b) {
315            (None, b'"') | (None, b'\'') => quote = Some(b),
316            (Some(q), b2) if b2 == q => quote = None,
317            (None, b'>') => {
318                tag_end = Some(i);
319                break;
320            }
321            _ => {}
322        }
323    }
324    let tag_end = tag_end?;
325    let inner = &after_lt[..tag_end];
326    // Drop any trailing self-closing slash.
327    let inner = inner.trim_end().trim_end_matches('/').trim_end();
328    // Drop the tag name (alphanumeric run after `<`).
329    let bytes = inner.as_bytes();
330    let mut name_end = 0usize;
331    while name_end < bytes.len()
332        && !bytes[name_end].is_ascii_whitespace()
333        && bytes[name_end] != b'/'
334    {
335        name_end += 1;
336    }
337    let attrs_text = &inner[name_end..];
338    parse_html_attribute_list(attrs_text)
339}
340
341/// Parse a raw HTML attribute list (the bytes between a tag name and the
342/// closing `>`, exclusive). Accepts inputs like `id="x" class="a b"
343/// data-key=v` and produces an [`AttributeBlock`]. Returns `None` if no
344/// recognized attributes are present.
345///
346/// Used by [`parse_html_tag_attributes`] (which strips `<TAG ...>`
347/// surrounding chrome before delegating here) and by
348/// `AttributeNode::id` for the structural `HTML_ATTRS` CST node, whose
349/// text holds JUST the attribute region.
350pub fn parse_html_attribute_list(attrs_text: &str) -> Option<AttributeBlock> {
351    let comps = html_attribute_spans(attrs_text);
352    if comps.is_empty() {
353        return None;
354    }
355    let mut identifier: Option<String> = None;
356    let mut classes: Vec<String> = Vec::new();
357    let mut key_values: Vec<(String, String)> = Vec::new();
358    for comp in &comps {
359        match comp {
360            HtmlAttrComponent::Id(r) => {
361                if identifier.is_none() {
362                    identifier = Some(attrs_text[r.clone()].to_string());
363                }
364            }
365            HtmlAttrComponent::Class(r) => classes.push(attrs_text[r.clone()].to_string()),
366            HtmlAttrComponent::KeyValue { key, value, .. } => {
367                key_values.push((
368                    attrs_text[key.clone()].to_string(),
369                    attr_value_string(&attrs_text[value.clone()]),
370                ));
371            }
372            HtmlAttrComponent::Flag(r) => {
373                key_values.push((attrs_text[r.clone()].to_string(), String::new()));
374            }
375        }
376    }
377    if identifier.is_none() && classes.is_empty() && key_values.is_empty() {
378        return None;
379    }
380    Some(AttributeBlock {
381        identifier,
382        classes,
383        key_values,
384    })
385}
386
/// One recognized HTML attribute, as byte ranges relative to the attribute
/// body passed to [`html_attribute_spans`] (the bytes between a tag name and
/// the closing `>`, exclusive). Range semantics match the `ATTR_*` token each
/// becomes: `Id`/`Class` wrap the bare value (quotes excluded — the reader uses
/// the text verbatim, since HTML has no `#`/`.` marker), while `KeyValue` keeps
/// the value's quotes (the reader strips them), mirroring the Pandoc
/// convention. The single source of truth shared by [`parse_html_attribute_list`]
/// (string derivation) and [`emit_html_attrs_node`] (CST emission).
#[derive(Debug, Clone, PartialEq)]
enum HtmlAttrComponent {
    /// `id="x"` → range covers the bare id value (`x`); only the first
    /// non-empty id is recorded.
    Id(std::ops::Range<usize>),
    /// One whitespace-separated word of a `class="a b"` value.
    Class(std::ops::Range<usize>),
    /// `key="v"` / `key=v` → key range, `=` byte index, value range (value
    /// includes surrounding quotes when present, and may be empty).
    KeyValue {
        key: std::ops::Range<usize>,
        eq: usize,
        value: std::ops::Range<usize>,
    },
    /// A valueless attribute (`hidden`) → key range only (projects to `(key,"")`).
    Flag(std::ops::Range<usize>),
}
411
/// Narrow `[start, end)` of `content` to the part inside its surrounding
/// quotes.
///
/// If the range begins with `"` or `'`, the opening quote is excluded, and the
/// last byte is excluded too when it is the same quote character (and is not
/// the opening quote itself). Ranges that are empty or do not start with a
/// quote come back unchanged.
fn html_value_inner_range(content: &str, start: usize, end: usize) -> std::ops::Range<usize> {
    if start >= end {
        return start..end;
    }

    let bytes = content.as_bytes();
    match bytes[start] {
        quote @ (b'"' | b'\'') => {
            // Needs at least two bytes so the opener is not also the closer.
            let closed = end - start >= 2 && bytes[end - 1] == quote;
            let inner_end = if closed { end - 1 } else { end };
            (start + 1)..inner_end
        }
        _ => start..end,
    }
}
427
/// Byte ranges of the words found in `[start, end)` of `content`, where a word
/// is a maximal run of bytes that are not ASCII whitespace. Ranges are absolute
/// offsets into `content` and are returned in source order.
fn html_word_ranges(content: &str, start: usize, end: usize) -> Vec<std::ops::Range<usize>> {
    let bytes = content.as_bytes();
    let mut words = Vec::new();
    // Start of the word currently being read, if any.
    let mut word_start: Option<usize> = None;

    for idx in start..end {
        let is_space = bytes[idx].is_ascii_whitespace();
        match (word_start, is_space) {
            (None, false) => word_start = Some(idx),
            (Some(begin), true) => {
                words.push(begin..idx);
                word_start = None;
            }
            _ => {}
        }
    }

    // A word that runs up to `end` has no terminating whitespace.
    if let Some(begin) = word_start {
        words.push(begin..end);
    }

    words
}
448
449/// Scan an HTML attribute body into [`HtmlAttrComponent`]s in source order.
450/// Recognizes `id="x"`, `class="a b"` (split per word), `key="v"`/`key=v`, and
451/// valueless flags. Bytes that aren't part of a component (attribute names,
452/// `=`, quotes, whitespace, `/`) are recovered by the emitter from the gaps.
453fn html_attribute_spans(content: &str) -> Vec<HtmlAttrComponent> {
454    let bytes = content.as_bytes();
455    let mut i = 0usize;
456    let mut comps: Vec<HtmlAttrComponent> = Vec::new();
457    let mut have_id = false;
458
459    while i < bytes.len() {
460        match bytes[i] {
461            b' ' | b'\t' | b'\n' | b'\r' | b'/' => {
462                i += 1;
463            }
464            _ => {
465                let key_start = i;
466                while i < bytes.len()
467                    && !matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r' | b'=' | b'/')
468                {
469                    i += 1;
470                }
471                let key_end = i;
472                let key = &content[key_start..key_end];
473
474                if i < bytes.len() && bytes[i] == b'=' {
475                    let eq = i;
476                    i += 1; // skip '='
477                    let value_start = i;
478                    if i < bytes.len() && (bytes[i] == b'"' || bytes[i] == b'\'') {
479                        let quote = bytes[i];
480                        i += 1; // opening quote
481                        while i < bytes.len() && bytes[i] != quote {
482                            i += 1;
483                        }
484                        if i < bytes.len() {
485                            i += 1; // closing quote
486                        }
487                    } else {
488                        while i < bytes.len()
489                            && !matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r' | b'/')
490                        {
491                            i += 1;
492                        }
493                    }
494                    let value_end = i;
495                    match key {
496                        "id" => {
497                            if !have_id {
498                                let inner = html_value_inner_range(content, value_start, value_end);
499                                if inner.end > inner.start {
500                                    comps.push(HtmlAttrComponent::Id(inner));
501                                    have_id = true;
502                                }
503                            }
504                        }
505                        "class" => {
506                            let inner = html_value_inner_range(content, value_start, value_end);
507                            for w in html_word_ranges(content, inner.start, inner.end) {
508                                comps.push(HtmlAttrComponent::Class(w));
509                            }
510                        }
511                        _ => comps.push(HtmlAttrComponent::KeyValue {
512                            key: key_start..key_end,
513                            eq,
514                            value: value_start..value_end,
515                        }),
516                    }
517                } else if key_end > key_start {
518                    comps.push(HtmlAttrComponent::Flag(key_start..key_end));
519                }
520            }
521        }
522    }
523
524    comps
525}
526
527/// Emit a structural `HTML_ATTRS` node, wrapping the source bytes of each
528/// recognized HTML attribute in `ATTR_ID` / `ATTR_CLASS` / `ATTR_KEY_VALUE`
529/// children (bare values — HTML has no `#`/`.` marker). Bytes between/around
530/// components (names, `=`, quotes, whitespace, `/`) become gap tokens, so
531/// `node.text()` is exactly `attrs_text`. An unrecognized/empty body falls back
532/// to a single opaque `TEXT` token.
533pub fn emit_html_attrs_node(builder: &mut impl InlineSink, attrs_text: &str) {
534    emit_html_attrs_with_kind(builder, SyntaxKind::HTML_ATTRS, attrs_text);
535}
536
537/// As [`emit_html_attrs_node`] but for the legacy native-span `SPAN_ATTRIBUTES`
538/// node, which carries HTML `class="..."` syntax (not Pandoc `{...}`).
539pub fn emit_html_span_attributes_node(builder: &mut impl InlineSink, attrs_text: &str) {
540    emit_html_attrs_with_kind(builder, SyntaxKind::SPAN_ATTRIBUTES, attrs_text);
541}
542
543fn emit_html_attrs_with_kind(
544    builder: &mut impl InlineSink,
545    node_kind: SyntaxKind,
546    attrs_text: &str,
547) {
548    builder.start_node(node_kind.into());
549    let comps = html_attribute_spans(attrs_text);
550    if comps.is_empty() {
551        builder.token(SyntaxKind::TEXT.into(), attrs_text);
552    } else {
553        let mut cursor = 0usize;
554        for comp in &comps {
555            let (start, end) = match comp {
556                HtmlAttrComponent::Id(r)
557                | HtmlAttrComponent::Class(r)
558                | HtmlAttrComponent::Flag(r) => (r.start, r.end),
559                HtmlAttrComponent::KeyValue { key, value, .. } => (key.start, value.end),
560            };
561            emit_attribute_gap(builder, &attrs_text[cursor..start]);
562            match comp {
563                HtmlAttrComponent::Id(r) => {
564                    builder.token(SyntaxKind::ATTR_ID.into(), &attrs_text[r.clone()]);
565                }
566                HtmlAttrComponent::Class(r) => {
567                    builder.token(SyntaxKind::ATTR_CLASS.into(), &attrs_text[r.clone()]);
568                }
569                HtmlAttrComponent::Flag(r) => {
570                    builder.start_node(SyntaxKind::ATTR_KEY_VALUE.into());
571                    builder.token(SyntaxKind::ATTR_KEY.into(), &attrs_text[r.clone()]);
572                    builder.finish_node();
573                }
574                HtmlAttrComponent::KeyValue { key, eq, value } => {
575                    builder.start_node(SyntaxKind::ATTR_KEY_VALUE.into());
576                    builder.token(SyntaxKind::ATTR_KEY.into(), &attrs_text[key.clone()]);
577                    builder.token(SyntaxKind::TEXT.into(), &attrs_text[*eq..value.start]);
578                    if value.end > value.start {
579                        builder.token(SyntaxKind::ATTR_VALUE.into(), &attrs_text[value.clone()]);
580                    }
581                    builder.finish_node();
582                }
583            }
584            cursor = end;
585        }
586        emit_attribute_gap(builder, &attrs_text[cursor..]);
587    }
588    builder.finish_node();
589}
590
591/// Emit a Pandoc `{...}` ATTRIBUTE node by STRUCTURING the raw source slice
592/// into ATTR_* children that wrap the original bytes (no synthesis). Markers
593/// and quotes stay inside their tokens; whitespace/newlines between components,
594/// and any bytes the scanner skips (duplicate `#id`, malformed tokens), become
595/// standalone WHITESPACE/NEWLINE/TEXT tokens — so `node.text()` is exactly the
596/// source slice. Non-`{...}`-shaped or unrecognized input (MMD `[#id]` header
597/// brackets, raw-inline `{=format}`, empty `{}`) falls back to a single opaque
598/// ATTRIBUTE token, preserving the prior shape.
599pub fn emit_attribute_node(builder: &mut impl InlineSink, raw_attr_text: &str) {
600    emit_attribute_node_with_kinds(
601        builder,
602        SyntaxKind::ATTRIBUTE,
603        SyntaxKind::ATTRIBUTE,
604        raw_attr_text,
605    );
606}
607
608/// Emit a fenced-div `DIV_INFO` node, structuring the Pandoc `{...}` body the
609/// same way [`emit_attribute_node`] does. Bare-word shorthand (`::: Warning`)
610/// and malformed/empty bodies fall back to a single opaque `TEXT` token,
611/// preserving the prior `DIV_INFO { TEXT(...) }` shape (and the bare-word
612/// class semantics the projector reads via `parse_div_info`).
613pub fn emit_div_info_node(builder: &mut impl InlineSink, raw_attr_text: &str) {
614    emit_attribute_node_with_kinds(
615        builder,
616        SyntaxKind::DIV_INFO,
617        SyntaxKind::TEXT,
618        raw_attr_text,
619    );
620}
621
622/// Emit a bracketed-span `SPAN_ATTRIBUTES` node, structuring the Pandoc `{...}`
623/// body the same way [`emit_attribute_node`] does. Malformed/empty bodies fall
624/// back to a single opaque `TEXT` token, preserving the prior
625/// `SPAN_ATTRIBUTES { TEXT(...) }` shape.
626pub fn emit_span_attributes_node(builder: &mut impl InlineSink, raw_attr_text: &str) {
627    emit_attribute_node_with_kinds(
628        builder,
629        SyntaxKind::SPAN_ATTRIBUTES,
630        SyntaxKind::TEXT,
631        raw_attr_text,
632    );
633}
634
635/// Structure a code-block info-string region containing a `{...}` attribute
636/// block into `ATTR_*` children wrapping the source bytes, the same way
637/// [`emit_attribute_node`] does — but with a language carve-out: when
638/// `carve_first_class_as_language` is set, the first `.class` component is
639/// emitted as `TEXT "."` + `CODE_LANGUAGE <lang>` (Pandoc's `{.python …}`
640/// language-first shape) instead of an `ATTR_CLASS`.
641///
642/// `text` is the full region that may surround the braces with gap bytes (e.g.
643/// `" {.numberLines}"` after a shortcut language). The `{`/`}` and any
644/// surrounding/interior gap bytes are emitted as gap tokens so `node.text()`
645/// reconstructs `text` exactly. Returns `true` when it structured the block, and
646/// `false` — having emitted **nothing** — when `text` has no `{...}` or its body
647/// is unrecognized (empty/whitespace-only), so callers fall back to a single
648/// opaque token and preserve the prior shape.
649///
650/// A leading `=format` raw marker is never carved as a language; the raw case is
651/// handled before this is reached, but the carve defensively skips `=`-prefixed
652/// components.
653pub fn emit_code_info_attrs(
654    builder: &mut impl InlineSink,
655    text: &str,
656    carve_first_class_as_language: bool,
657) -> bool {
658    let Some(open) = text.find('{') else {
659        return false;
660    };
661    let Some(close) = text.rfind('}') else {
662        return false;
663    };
664    if close < open {
665        return false;
666    }
667    let body = &text[open + 1..close];
668    let Some(spans) = attribute_content_spans(body) else {
669        return false;
670    };
671
672    // Leading gap before `{` (e.g. the space in `python {.x}`).
673    emit_attribute_gap(builder, &text[..open]);
674    builder.token(SyntaxKind::TEXT.into(), "{");
675
676    let mut carved = false;
677    let mut cursor = 0usize;
678    for comp in &spans.components {
679        let (start, end) = match comp {
680            AttrComponent::Id(r) | AttrComponent::Class(r) => (r.start, r.end),
681            AttrComponent::KeyValue { key, value, .. } => (key.start, value.end),
682        };
683        emit_attribute_gap(builder, &body[cursor..start]);
684        match comp {
685            AttrComponent::Id(r) => {
686                builder.token(SyntaxKind::ATTR_ID.into(), &body[r.clone()]);
687            }
688            AttrComponent::Class(r) => {
689                let is_dot_class = body.as_bytes().get(r.start) == Some(&b'.');
690                if carve_first_class_as_language && !carved && is_dot_class {
691                    // `.python` → `TEXT "."` + `CODE_LANGUAGE "python"`. Slice the
692                    // actual `.` byte rather than synthesizing it.
693                    builder.token(SyntaxKind::TEXT.into(), &body[r.start..r.start + 1]);
694                    builder.token(SyntaxKind::CODE_LANGUAGE.into(), &body[r.start + 1..r.end]);
695                    carved = true;
696                } else {
697                    builder.token(SyntaxKind::ATTR_CLASS.into(), &body[r.clone()]);
698                }
699            }
700            AttrComponent::KeyValue { key, eq, value } => {
701                builder.start_node(SyntaxKind::ATTR_KEY_VALUE.into());
702                builder.token(SyntaxKind::ATTR_KEY.into(), &body[key.clone()]);
703                builder.token(SyntaxKind::TEXT.into(), &body[*eq..*eq + 1]);
704                if value.end > value.start {
705                    builder.token(SyntaxKind::ATTR_VALUE.into(), &body[value.clone()]);
706                }
707                builder.finish_node();
708            }
709        }
710        cursor = end;
711    }
712    emit_attribute_gap(builder, &body[cursor..]);
713    builder.token(SyntaxKind::TEXT.into(), "}");
714    // Trailing gap after `}`.
715    emit_attribute_gap(builder, &text[close + 1..]);
716    true
717}
718
719/// Shared structuring core for attribute-bearing nodes. `node_kind` is the outer
720/// wrapper (`ATTRIBUTE`, `DIV_INFO`, …); `opaque_token_kind` is the single token
721/// the non-`{...}`/unrecognized fallback emits (so each caller keeps its prior
722/// opaque shape). The structured `{...}` path is identical across callers.
723fn emit_attribute_node_with_kinds(
724    builder: &mut impl InlineSink,
725    node_kind: SyntaxKind,
726    opaque_token_kind: SyntaxKind,
727    raw_attr_text: &str,
728) {
729    builder.start_node(node_kind.into());
730
731    let body = raw_attr_text
732        .strip_prefix('{')
733        .and_then(|s| s.strip_suffix('}'));
734    let spans = body.and_then(attribute_content_spans);
735
736    match (body, spans) {
737        (Some(body), Some(spans)) => {
738            builder.token(SyntaxKind::TEXT.into(), "{");
739            let mut cursor = 0usize;
740            for comp in &spans.components {
741                let (start, end) = match comp {
742                    AttrComponent::Id(r) | AttrComponent::Class(r) => (r.start, r.end),
743                    AttrComponent::KeyValue { key, value, .. } => (key.start, value.end),
744                };
745                emit_attribute_gap(builder, &body[cursor..start]);
746                match comp {
747                    AttrComponent::Id(r) => {
748                        builder.token(SyntaxKind::ATTR_ID.into(), &body[r.clone()]);
749                    }
750                    AttrComponent::Class(r) => {
751                        builder.token(SyntaxKind::ATTR_CLASS.into(), &body[r.clone()]);
752                    }
753                    AttrComponent::KeyValue { key, eq, value } => {
754                        builder.start_node(SyntaxKind::ATTR_KEY_VALUE.into());
755                        builder.token(SyntaxKind::ATTR_KEY.into(), &body[key.clone()]);
756                        builder.token(SyntaxKind::TEXT.into(), &body[*eq..*eq + 1]);
757                        if value.end > value.start {
758                            builder.token(SyntaxKind::ATTR_VALUE.into(), &body[value.clone()]);
759                        }
760                        builder.finish_node();
761                    }
762                }
763                cursor = end;
764            }
765            emit_attribute_gap(builder, &body[cursor..]);
766            builder.token(SyntaxKind::TEXT.into(), "}");
767        }
768        _ => {
769            // Opaque fallback: keep the whole slice as one token of the
770            // caller's chosen kind, preserving the prior shape.
771            builder.token(opaque_token_kind.into(), raw_attr_text);
772        }
773    }
774
775    builder.finish_node();
776}
777
778/// Emit the bytes between/around structured attribute components, splitting on
779/// newline boundaries: `\n`/`\r\n`/`\r` → NEWLINE, other whitespace runs →
780/// WHITESPACE, non-whitespace runs → TEXT. Every byte is preserved.
781fn emit_attribute_gap(builder: &mut impl InlineSink, gap: &str) {
782    let bytes = gap.as_bytes();
783    let mut i = 0;
784    while i < bytes.len() {
785        match bytes[i] {
786            b'\n' => {
787                builder.token(SyntaxKind::NEWLINE.into(), "\n");
788                i += 1;
789            }
790            b'\r' => {
791                if i + 1 < bytes.len() && bytes[i + 1] == b'\n' {
792                    builder.token(SyntaxKind::NEWLINE.into(), "\r\n");
793                    i += 2;
794                } else {
795                    builder.token(SyntaxKind::NEWLINE.into(), "\r");
796                    i += 1;
797                }
798            }
799            b if b.is_ascii_whitespace() => {
800                let start = i;
801                while i < bytes.len()
802                    && bytes[i].is_ascii_whitespace()
803                    && bytes[i] != b'\n'
804                    && bytes[i] != b'\r'
805                {
806                    i += 1;
807                }
808                builder.token(SyntaxKind::WHITESPACE.into(), &gap[start..i]);
809            }
810            _ => {
811                let start = i;
812                while i < bytes.len() && !bytes[i].is_ascii_whitespace() {
813                    i += 1;
814                }
815                builder.token(SyntaxKind::TEXT.into(), &gap[start..i]);
816            }
817        }
818    }
819}
820
#[cfg(test)]
mod tests {
    use super::*;

    /// Number of direct children (nodes and tokens) of `node` with kind `kind`.
    fn count_children_of_kind(node: &crate::syntax::SyntaxNode, kind: SyntaxKind) -> usize {
        node.children_with_tokens()
            .filter(|child| child.kind() == kind)
            .count()
    }

    /// A lone `#id` is extracted and the text before the block is returned
    /// without the separating whitespace.
    #[test]
    fn test_simple_id() {
        let (attrs, before) = try_parse_trailing_attributes("Heading {#my-id}")
            .expect("`{#my-id}` should parse as trailing attributes");
        assert_eq!(before, "Heading");
        assert_eq!(attrs.identifier, Some("my-id".to_string()));
        assert!(attrs.classes.is_empty());
        assert!(attrs.key_values.is_empty());
    }

    /// A single `.class` lands in `classes`.
    #[test]
    fn test_single_class() {
        let (attrs, _) = try_parse_trailing_attributes("Text {.myclass}")
            .expect("`{.myclass}` should parse as trailing attributes");
        assert_eq!(attrs.classes, vec!["myclass"]);
    }

    /// Several classes are collected in source order.
    #[test]
    fn test_multiple_classes() {
        let (attrs, _) = try_parse_trailing_attributes("Text {.class1 .class2 .class3}")
            .expect("multiple classes should parse as trailing attributes");
        assert_eq!(attrs.classes, vec!["class1", "class2", "class3"]);
    }

    /// An unquoted `key=value` pair is captured as-is.
    #[test]
    fn test_key_value_unquoted() {
        let (attrs, _) = try_parse_trailing_attributes("Text {key=value}")
            .expect("unquoted key=value should parse as trailing attributes");
        assert_eq!(
            attrs.key_values,
            vec![("key".to_string(), "value".to_string())]
        );
    }

    /// A double-quoted value may contain spaces; the quotes are not part of
    /// the stored value.
    #[test]
    fn test_key_value_quoted() {
        let (attrs, _) = try_parse_trailing_attributes("Text {key=\"value with spaces\"}")
            .expect("quoted key=value should parse as trailing attributes");
        assert_eq!(
            attrs.key_values,
            vec![("key".to_string(), "value with spaces".to_string())]
        );
    }

    /// Identifier, classes, and both quoted and unquoted pairs in one block.
    #[test]
    fn test_full_attributes() {
        let (attrs, before) =
            try_parse_trailing_attributes("Heading {#id .class1 .class2 key1=val1 key2=\"val 2\"}")
                .expect("a full attribute block should parse as trailing attributes");
        assert_eq!(before, "Heading");
        assert_eq!(attrs.identifier, Some("id".to_string()));
        assert_eq!(attrs.classes, vec!["class1", "class2"]);
        assert_eq!(attrs.key_values.len(), 2);
        assert_eq!(
            attrs.key_values[0],
            ("key1".to_string(), "val1".to_string())
        );
        assert_eq!(
            attrs.key_values[1],
            ("key2".to_string(), "val 2".to_string())
        );
    }

    /// Braces inside a quoted value (here a `{{< ... >}}` shortcode) must not
    /// confuse the search for the block's opening brace.
    #[test]
    fn test_trailing_attributes_with_shortcode_in_quoted_value() {
        let text = "Slide Title {background-image='{{< placeholder 100 100 >}}' background-size=\"100px\"}";
        let (attrs, before) = try_parse_trailing_attributes(text)
            .expect("braces inside a quoted value should not break parsing");
        assert_eq!(before, "Slide Title");
        assert_eq!(attrs.key_values.len(), 2);
        assert_eq!(
            attrs.key_values[0],
            (
                "background-image".to_string(),
                "{{< placeholder 100 100 >}}".to_string()
            )
        );
        assert_eq!(
            attrs.key_values[1],
            ("background-size".to_string(), "100px".to_string())
        );
    }

    /// Text without a trailing `{...}` yields no attributes.
    #[test]
    fn test_no_attributes() {
        let result = try_parse_trailing_attributes("Heading with no attributes");
        assert!(result.is_none());
    }

    /// An empty `{}` is not treated as an attribute block.
    #[test]
    fn test_empty_braces() {
        let result = try_parse_trailing_attributes("Heading {}");
        assert!(result.is_none());
    }

    /// When several `#id`s are present, the first one wins.
    #[test]
    fn test_only_first_id_counts() {
        let (attrs, _) = try_parse_trailing_attributes("Text {#id1 #id2}")
            .expect("a block with two ids should still parse");
        assert_eq!(attrs.identifier, Some("id1".to_string()));
    }

    /// Extra whitespace around and between items is tolerated.
    #[test]
    fn test_whitespace_handling() {
        let (attrs, _) = try_parse_trailing_attributes("Text {  #id   .class   key=val  }")
            .expect("padded attribute block should parse");
        assert_eq!(attrs.identifier, Some("id".to_string()));
        assert_eq!(attrs.classes, vec!["class"]);
        assert_eq!(
            attrs.key_values,
            vec![("key".to_string(), "val".to_string())]
        );
    }

    /// An HTML open tag carrying only `id` yields just the identifier.
    #[test]
    fn test_parse_html_tag_attributes_id_only() {
        let attrs = parse_html_tag_attributes(r#"<div id="anchor-c">"#).unwrap();
        assert_eq!(attrs.identifier.as_deref(), Some("anchor-c"));
        assert!(attrs.classes.is_empty());
        assert!(attrs.key_values.is_empty());
    }

    #[test]
    fn test_parse_html_tag_attributes_inline_content_after_open() {
        // For a same-line block `<div id="x">Content</div>`, the entire
        // line is in the HTML_BLOCK_TAG. The parser must terminate at the
        // first unquoted `>` and ignore the trailing content + close tag.
        let attrs = parse_html_tag_attributes(r#"<div id="anchor-c">Content.</div>"#).unwrap();
        assert_eq!(attrs.identifier.as_deref(), Some("anchor-c"));
    }

    /// `id` maps to the identifier, `class` is split on whitespace, and any
    /// other attribute becomes a key/value pair.
    #[test]
    fn test_parse_html_tag_attributes_class_and_kv() {
        let attrs = parse_html_tag_attributes(r#"<div id="x" class="a b" data-key="v">"#).unwrap();
        assert_eq!(attrs.identifier.as_deref(), Some("x"));
        assert_eq!(attrs.classes, vec!["a", "b"]);
        assert_eq!(
            attrs.key_values,
            vec![("data-key".to_string(), "v".to_string())]
        );
    }

    /// A tag without attributes yields `None`.
    #[test]
    fn test_parse_html_tag_attributes_no_attrs() {
        assert!(parse_html_tag_attributes("<div>").is_none());
    }

    /// Whitespace between the text and the attribute block is trimmed from
    /// the returned prefix.
    #[test]
    fn test_trailing_whitespace_before_attrs() {
        let (_, before) = try_parse_trailing_attributes("Heading   {#id}")
            .expect("`{#id}` after extra spaces should parse");
        assert_eq!(before, "Heading");
    }

    /// Regression: the inline-code attribute path used to reconstruct a
    /// normalized `{...}` string (reordering id-first, force-quoting values),
    /// which inflated the CST past the input and broke losslessness. The
    /// structured emitter must wrap the original bytes verbatim.
    #[test]
    fn inline_code_attribute_is_lossless() {
        let input = "`code`{.r #x key=v}\n";
        let tree = crate::parse(input, None);
        assert_eq!(tree.text().to_string(), input);
    }

    /// Build a standalone tree from `emit_attribute_node` for `raw`.
    fn structured_attr(raw: &str) -> crate::syntax::SyntaxNode {
        let mut builder = GreenNodeBuilder::new();
        emit_attribute_node(&mut builder, raw);
        crate::syntax::SyntaxNode::new_root(builder.finish())
    }

    #[test]
    fn emit_attribute_node_is_lossless_over_shapes() {
        // Interior whitespace, duplicate id, malformed/empty bodies, mixed
        // quotes, and `=format` must all round-trip byte-for-byte.
        for raw in [
            "{#id}",
            "{.a .b}",
            "{key=\"v w\"}",
            "{ #id  .c }",
            "{#id1 #id2}",
            "{key}",
            "{=html}",
            "{#id .a key=v key2='x'}",
            "{key=}",
            "{}",
            "{   }",
        ] {
            let node = structured_attr(raw);
            assert_eq!(node.text().to_string(), raw, "lossless emit for {raw:?}");
            assert_eq!(node.kind(), SyntaxKind::ATTRIBUTE);
        }
    }

    /// One id, two classes, and one key/value pair each surface as their own
    /// direct child of the attribute node.
    #[test]
    fn emit_attribute_node_structures_children() {
        let node = structured_attr("{#x .a .b k=v}");
        assert_eq!(count_children_of_kind(&node, SyntaxKind::ATTR_ID), 1);
        assert_eq!(count_children_of_kind(&node, SyntaxKind::ATTR_CLASS), 2);
        assert_eq!(count_children_of_kind(&node, SyntaxKind::ATTR_KEY_VALUE), 1);
    }

    /// Build a standalone tree from `emit_html_attrs_node` for `raw`.
    fn structured_html_attrs(raw: &str) -> crate::syntax::SyntaxNode {
        let mut builder = GreenNodeBuilder::new();
        emit_html_attrs_node(&mut builder, raw);
        crate::syntax::SyntaxNode::new_root(builder.finish())
    }

    /// Quoted, bare, flag-only, empty-valued, and blank attribute lists must
    /// all round-trip byte-for-byte under an `HTML_ATTRS` node.
    #[test]
    fn emit_html_attrs_node_is_lossless_over_shapes() {
        for raw in [
            r#"id="x""#,
            r#"id="x" class="a b" data-key="v""#,
            r#"class='a  b'"#,
            r#"id=bare class=one"#,
            "hidden",
            r#"id="x" hidden data-n="1""#,
            r#"  id="x"  /"#,
            r#"id="""#,
            "",
            "   ",
        ] {
            let node = structured_html_attrs(raw);
            assert_eq!(node.text().to_string(), raw, "lossless emit for {raw:?}");
            assert_eq!(node.kind(), SyntaxKind::HTML_ATTRS);
        }
    }

    #[test]
    fn emit_html_attrs_node_structures_children() {
        let node = structured_html_attrs(r#"id="x" class="a b" data-key="v" hidden"#);
        assert_eq!(count_children_of_kind(&node, SyntaxKind::ATTR_ID), 1);
        assert_eq!(
            count_children_of_kind(&node, SyntaxKind::ATTR_CLASS),
            2,
            "class=\"a b\" splits into two ATTR_CLASS tokens"
        );
        // `data-key="v"` and the `hidden` flag are both ATTR_KEY_VALUE nodes.
        assert_eq!(
            node.children()
                .filter(|n| n.kind() == SyntaxKind::ATTR_KEY_VALUE)
                .count(),
            2
        );
    }

    /// The structured walker and the string-deriving parser must agree.
    #[test]
    fn html_attribute_list_parse_parity() {
        let attrs =
            parse_html_attribute_list(r#"id="x" class="a b" data-key='v w' hidden"#).unwrap();
        assert_eq!(attrs.identifier.as_deref(), Some("x"));
        assert_eq!(attrs.classes, vec!["a", "b"]);
        assert_eq!(
            attrs.key_values,
            vec![
                ("data-key".to_string(), "v w".to_string()),
                ("hidden".to_string(), String::new()),
            ]
        );
        assert!(parse_html_attribute_list("   ").is_none());
        assert!(parse_html_attribute_list(r#"id="""#).is_none());
    }

    /// Build a standalone tree from `emit_div_info_node` for `raw`.
    fn structured_div_info(raw: &str) -> crate::syntax::SyntaxNode {
        let mut builder = GreenNodeBuilder::new();
        emit_div_info_node(&mut builder, raw);
        crate::syntax::SyntaxNode::new_root(builder.finish())
    }

    #[test]
    fn emit_div_info_node_is_lossless_and_structures_brace_body() {
        // `{...}` bodies structure into ATTR_* children; bare-word shorthand
        // and malformed/empty bodies stay one opaque TEXT token. All round-trip.
        for raw in ["{#id .a .b key=val key2=\"v w\"}", "Warning", "{}", "{   }"] {
            let node = structured_div_info(raw);
            assert_eq!(node.text().to_string(), raw, "lossless emit for {raw:?}");
            assert_eq!(node.kind(), SyntaxKind::DIV_INFO);
        }

        let structured = structured_div_info("{#id .a .b key=val key2=\"v w\"}");
        assert_eq!(count_children_of_kind(&structured, SyntaxKind::ATTR_ID), 1);
        assert_eq!(
            count_children_of_kind(&structured, SyntaxKind::ATTR_CLASS),
            2
        );
        assert_eq!(
            count_children_of_kind(&structured, SyntaxKind::ATTR_KEY_VALUE),
            2
        );

        // Bare-word fallback: a single opaque TEXT token, no ATTR_* children.
        let bare = structured_div_info("Warning");
        let bare_kinds: Vec<_> = bare.children_with_tokens().map(|c| c.kind()).collect();
        assert_eq!(bare_kinds, vec![SyntaxKind::TEXT]);
    }
}