Skip to main content

panache_parser/parser/utils/
attributes.rs

1//! Parsing for Pandoc-style attributes: {#id .class key=value}
2//!
3//! Attributes can appear after headings, fenced code blocks, fenced divs, etc.
4//! Syntax: {#identifier .class1 .class2 key1=val1 key2="val2"}
5//!
6//! Rules:
7//! - Surrounded by { }
8//! - Identifier: #id (optional, only first one counts)
9//! - Classes: .class (can have multiple)
10//! - Key-value pairs: key=value or key="value" or key='value' (can have multiple)
11//! - Whitespace flexible between items
12
13use crate::syntax::SyntaxKind;
14use rowan::GreenNodeBuilder;
15
/// Owned, semantic view of one attribute block such as
/// `{#id .class key=value}` (or the HTML equivalent).
///
/// Markers (`#`, `.`) and value quotes are already stripped by the functions
/// in this file that build it. `Default` yields a block with no identifier,
/// no classes and no key/value pairs.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct AttributeBlock {
    /// The identifier, when one was recognized.
    pub identifier: Option<String>,
    /// Class names in source order.
    pub classes: Vec<String>,
    /// `(key, value)` pairs in source order.
    pub key_values: Vec<(String, String)>,
}
22
23/// Try to parse an attribute block from the end of a string
24/// Returns: (attribute_block, text_before_attributes)
25pub fn try_parse_trailing_attributes(text: &str) -> Option<(AttributeBlock, &str)> {
26    let (attrs, before, _) = try_parse_trailing_attributes_with_pos(text)?;
27    Some((attrs, before))
28}
29
30/// Try to parse an attribute block from the end of a string.
31/// Returns: (attribute_block, text_before_attributes, open_brace_position_in_trimmed_text)
32pub fn try_parse_trailing_attributes_with_pos(text: &str) -> Option<(AttributeBlock, &str, usize)> {
33    let trimmed = text.trim_end();
34
35    // Must end with }
36    if !trimmed.ends_with('}') {
37        return None;
38    }
39
40    // Find matching opening brace for the trailing attribute block, accounting
41    // for braces inside quoted attribute values.
42    let open_brace = find_matching_open_brace_for_trailing_block(trimmed)?;
43
44    // Check if this is a bracketed span like [text]{.class} rather than a heading attribute
45    // If the { is immediately after ] (with optional whitespace), this should be parsed as a span
46    let before_brace = &trimmed[..open_brace];
47    if before_brace.trim_end().ends_with(']') {
48        log::trace!("Skipping attribute parsing for bracketed span: {}", text);
49        return None;
50    }
51
52    // Parse the content between { and }
53    let attr_content = &trimmed[open_brace + 1..trimmed.len() - 1];
54    let attr_block = parse_attribute_content(attr_content)?;
55
56    // Get text before attributes (trim trailing whitespace)
57    let before_attrs = trimmed[..open_brace].trim_end();
58
59    Some((attr_block, before_attrs, open_brace))
60}
61
/// Find the byte offset of the `{` that pairs with the final `}` of `text`.
///
/// Braces inside quoted attribute values (`'…'` / `"…"`, with `\` escaping the
/// next character) are ignored. Returns `None` when `text` does not end with
/// `}`, when braces are unbalanced, or when a quote is left open.
///
/// Two passes are tried. The first treats quotes as significant everywhere in
/// `text`. If that fails, the second treats quotes as significant only inside
/// braces, so a stray apostrophe in the preceding prose (`Don't Panic {#id}`)
/// no longer hides the attribute block. Inputs the first pass accepts keep
/// their result unchanged.
fn find_matching_open_brace_for_trailing_block(text: &str) -> Option<usize> {
    if !text.ends_with('}') {
        return None;
    }

    scan_trailing_block_open_brace(text, true)
        .or_else(|| scan_trailing_block_open_brace(text, false))
}

/// One left-to-right brace-matching pass over `text`.
///
/// When `quotes_outside_braces` is false, a quote character only opens a
/// quoted run while at least one `{` is open.
fn scan_trailing_block_open_brace(text: &str, quotes_outside_braces: bool) -> Option<usize> {
    let last = text.len().checked_sub(1)?;
    let mut stack: Vec<usize> = Vec::new();
    let mut in_quote: Option<char> = None;
    let mut escaped = false;
    let mut end_brace_open = None;

    for (idx, ch) in text.char_indices() {
        if let Some(q) = in_quote {
            if escaped {
                // The character after a backslash is taken literally.
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == q {
                in_quote = None;
            }
            continue;
        }

        match ch {
            '\'' | '"' if quotes_outside_braces || !stack.is_empty() => in_quote = Some(ch),
            '{' => stack.push(idx),
            '}' => {
                // A `}` with nothing open means the braces are unbalanced.
                let open = stack.pop()?;
                if idx == last {
                    end_brace_open = Some(open);
                }
            }
            _ => {}
        }
    }

    if in_quote.is_some() || !stack.is_empty() {
        return None;
    }

    end_brace_open
}
107
/// One recognized component inside an attribute `{...}` body, as byte ranges
/// relative to the `content` slice passed to [`attribute_content_spans`] (the
/// bytes strictly between `{` and `}`). Marker bytes (`#`/`.`/`=`) and value
/// quotes are kept INSIDE the ranges so the emitter can wrap the exact source
/// bytes; the string-deriving helpers strip them.
#[derive(Debug, Clone, PartialEq)]
pub(crate) enum AttrComponent {
    /// `#id` — range includes the leading `#`. The scanner records at most one
    /// of these per body (the first non-empty one).
    Id(std::ops::Range<usize>),
    /// `.class` or `=format` — range includes the leading `.`/`=` marker.
    Class(std::ops::Range<usize>),
    /// `key=value`: key range, `=` byte index, value range (the value range
    /// includes surrounding quotes when present).
    KeyValue {
        /// Bytes of the key, ending just before the `=`.
        key: std::ops::Range<usize>,
        /// Byte index of the `=` separator.
        eq: usize,
        /// Bytes of the value, starting right after the `=`; may be empty.
        value: std::ops::Range<usize>,
    },
}
127
/// Recognized components of an attribute `{...}` body, in source order. The
/// single source of truth shared by detection ([`parse_attribute_content`],
/// which derives owned strings) and emission (`emit_attribute_node`, which
/// wraps these byte ranges in ATTR_* CST nodes) — one walk, no detect/emit
/// drift. Bytes the scan skips (duplicate `#id`, malformed tokens, whitespace)
/// are not components; the emitter recovers them from the gaps between ranges.
#[derive(Debug, Clone, PartialEq)]
pub(crate) struct AttributeSpans {
    /// Components in the order they were scanned; [`attribute_content_spans`]
    /// never returns an empty list (it returns `None` instead).
    pub components: Vec<AttrComponent>,
}
138
/// Turn the raw bytes of an attribute value into its semantic string.
///
/// When `raw` starts with `"` or `'`, that opening quote is removed, and a
/// trailing quote of the same kind is removed too if one is present (an
/// unterminated quoted value therefore keeps its tail). Any other input is
/// returned unchanged.
fn attr_value_string(raw: &str) -> String {
    let unquoted = match raw.chars().next() {
        Some(quote @ ('"' | '\'')) => {
            // Quote characters are one byte wide, so slicing at 1 is safe.
            let tail = &raw[1..];
            tail.strip_suffix(quote).unwrap_or(tail)
        }
        _ => raw,
    };
    unquoted.to_owned()
}
154
155/// Scan an attribute `{...}` body into [`AttributeSpans`]. Returns `None` when
156/// no component is recognized (empty/whitespace-only/`{}` is not a valid
157/// attribute block). Offsets are relative to `content`.
158pub(crate) fn attribute_content_spans(content: &str) -> Option<AttributeSpans> {
159    let bytes = content.as_bytes();
160    let mut pos = 0;
161    let mut components: Vec<AttrComponent> = Vec::new();
162    let mut have_id = false;
163
164    while pos < bytes.len() {
165        // Skip whitespace.
166        while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
167            pos += 1;
168        }
169        if pos >= bytes.len() {
170            break;
171        }
172
173        if bytes[pos] == b'=' {
174            // {=format} raw-attribute marker — recorded as a class whose range
175            // includes the `=` (the string derivation keeps the `=`).
176            let start = pos;
177            pos += 1; // skip '='
178            while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() && bytes[pos] != b'}' {
179                pos += 1;
180            }
181            if pos > start + 1 {
182                components.push(AttrComponent::Class(start..pos));
183            }
184        } else if bytes[pos] == b'#' {
185            let start = pos;
186            pos += 1; // skip '#'
187            while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() && bytes[pos] != b'}' {
188                pos += 1;
189            }
190            // Only the first non-empty identifier counts; later `#…` runs and a
191            // bare `#` are scanned but not recorded (recovered from the gap).
192            if !have_id && pos > start + 1 {
193                components.push(AttrComponent::Id(start..pos));
194                have_id = true;
195            }
196        } else if bytes[pos] == b'.' {
197            let start = pos;
198            pos += 1; // skip '.'
199            while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() && bytes[pos] != b'}' {
200                pos += 1;
201            }
202            if pos > start + 1 {
203                components.push(AttrComponent::Class(start..pos));
204            }
205        } else {
206            // key=value
207            let key_start = pos;
208            while pos < bytes.len() && bytes[pos] != b'=' && !bytes[pos].is_ascii_whitespace() {
209                pos += 1;
210            }
211            if pos >= bytes.len() || bytes[pos] != b'=' {
212                // Not a valid key=value: skip the token (recovered from the gap).
213                while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() {
214                    pos += 1;
215                }
216                continue;
217            }
218            let key_end = pos;
219            let eq = pos;
220            pos += 1; // skip '='
221
222            let value_start = pos;
223            if pos < bytes.len() && (bytes[pos] == b'"' || bytes[pos] == b'\'') {
224                let quote = bytes[pos];
225                pos += 1; // opening quote
226                while pos < bytes.len() && bytes[pos] != quote {
227                    pos += 1;
228                }
229                if pos < bytes.len() {
230                    pos += 1; // closing quote
231                }
232            } else {
233                while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() && bytes[pos] != b'}' {
234                    pos += 1;
235                }
236            }
237            if key_end > key_start {
238                components.push(AttrComponent::KeyValue {
239                    key: key_start..key_end,
240                    eq,
241                    value: value_start..pos,
242                });
243            }
244        }
245    }
246
247    if components.is_empty() {
248        return None;
249    }
250    Some(AttributeSpans { components })
251}
252
253/// Parse the content inside the attribute braces into owned strings. Thin
254/// wrapper over [`attribute_content_spans`] so detection and emission share one
255/// walk.
256pub fn parse_attribute_content(content: &str) -> Option<AttributeBlock> {
257    let spans = attribute_content_spans(content)?;
258    let mut identifier = None;
259    let mut classes = Vec::new();
260    let mut key_values = Vec::new();
261
262    for comp in &spans.components {
263        match comp {
264            AttrComponent::Id(r) => {
265                // Range includes '#'; the scanner guarantees a non-empty tail.
266                identifier = Some(content[r.start + 1..r.end].to_string());
267            }
268            AttrComponent::Class(r) => {
269                let raw = &content[r.clone()];
270                // `.class` → `class`; `=format` keeps its `=` prefix.
271                match raw.strip_prefix('.') {
272                    Some(class) => classes.push(class.to_string()),
273                    None => classes.push(raw.to_string()),
274                }
275            }
276            AttrComponent::KeyValue { key, value, .. } => {
277                key_values.push((
278                    content[key.clone()].to_string(),
279                    attr_value_string(&content[value.clone()]),
280                ));
281            }
282        }
283    }
284
285    Some(AttributeBlock {
286        identifier,
287        classes,
288        key_values,
289    })
290}
291
292/// Parse HTML-style attributes from a raw HTML opening tag text such as
293/// `<div id="x" class="a b" data-key="v">`, returning the same
294/// `AttributeBlock` shape as Pandoc-style brace attributes. Whitespace-
295/// separated `class="..."` is split into individual classes; `id="..."`
296/// becomes the identifier; everything else becomes a key/value pair.
297/// Returns `None` if the tag has no recognized attributes.
298///
299/// Self-closing slashes (`<div .../>`) and trailing whitespace are tolerated.
300/// The leading `<TAG` and trailing `>` are stripped; this routine does not
301/// validate the tag name.
302pub fn parse_html_tag_attributes(tag_text: &str) -> Option<AttributeBlock> {
303    let trimmed = tag_text.trim_start();
304    let after_lt = trimmed.strip_prefix('<')?;
305    // Find the end of the opening tag at the first `>` not inside a quoted
306    // attribute value. Anything after that `>` (e.g. inline content + close
307    // tag for a same-line `<div id="x">Content</div>`) is irrelevant.
308    let bytes = after_lt.as_bytes();
309    let mut tag_end = None;
310    let mut quote: Option<u8> = None;
311    for (i, &b) in bytes.iter().enumerate() {
312        match (quote, b) {
313            (None, b'"') | (None, b'\'') => quote = Some(b),
314            (Some(q), b2) if b2 == q => quote = None,
315            (None, b'>') => {
316                tag_end = Some(i);
317                break;
318            }
319            _ => {}
320        }
321    }
322    let tag_end = tag_end?;
323    let inner = &after_lt[..tag_end];
324    // Drop any trailing self-closing slash.
325    let inner = inner.trim_end().trim_end_matches('/').trim_end();
326    // Drop the tag name (alphanumeric run after `<`).
327    let bytes = inner.as_bytes();
328    let mut name_end = 0usize;
329    while name_end < bytes.len()
330        && !bytes[name_end].is_ascii_whitespace()
331        && bytes[name_end] != b'/'
332    {
333        name_end += 1;
334    }
335    let attrs_text = &inner[name_end..];
336    parse_html_attribute_list(attrs_text)
337}
338
339/// Parse a raw HTML attribute list (the bytes between a tag name and the
340/// closing `>`, exclusive). Accepts inputs like `id="x" class="a b"
341/// data-key=v` and produces an [`AttributeBlock`]. Returns `None` if no
342/// recognized attributes are present.
343///
344/// Used by [`parse_html_tag_attributes`] (which strips `<TAG ...>`
345/// surrounding chrome before delegating here) and by
346/// `AttributeNode::id` for the structural `HTML_ATTRS` CST node, whose
347/// text holds JUST the attribute region.
348pub fn parse_html_attribute_list(attrs_text: &str) -> Option<AttributeBlock> {
349    let comps = html_attribute_spans(attrs_text);
350    if comps.is_empty() {
351        return None;
352    }
353    let mut identifier: Option<String> = None;
354    let mut classes: Vec<String> = Vec::new();
355    let mut key_values: Vec<(String, String)> = Vec::new();
356    for comp in &comps {
357        match comp {
358            HtmlAttrComponent::Id(r) => {
359                if identifier.is_none() {
360                    identifier = Some(attrs_text[r.clone()].to_string());
361                }
362            }
363            HtmlAttrComponent::Class(r) => classes.push(attrs_text[r.clone()].to_string()),
364            HtmlAttrComponent::KeyValue { key, value, .. } => {
365                key_values.push((
366                    attrs_text[key.clone()].to_string(),
367                    attr_value_string(&attrs_text[value.clone()]),
368                ));
369            }
370            HtmlAttrComponent::Flag(r) => {
371                key_values.push((attrs_text[r.clone()].to_string(), String::new()));
372            }
373        }
374    }
375    if identifier.is_none() && classes.is_empty() && key_values.is_empty() {
376        return None;
377    }
378    Some(AttributeBlock {
379        identifier,
380        classes,
381        key_values,
382    })
383}
384
/// One recognized HTML attribute, as byte ranges relative to the attribute
/// body passed to [`html_attribute_spans`] (the bytes between a tag name and
/// the closing `>`, exclusive). Range semantics match the `ATTR_*` token each
/// becomes: `Id`/`Class` wrap the bare value (quotes excluded — the reader uses
/// the text verbatim, since HTML has no `#`/`.` marker), while `KeyValue` keeps
/// the value's quotes (the reader strips them), mirroring the Pandoc
/// convention. The single source of truth shared by [`parse_html_attribute_list`]
/// (string derivation) and [`emit_html_attrs_node`] (CST emission).
#[derive(Debug, Clone, PartialEq)]
enum HtmlAttrComponent {
    /// `id="x"` → range covers the bare id value (`x`); only the first counts.
    Id(std::ops::Range<usize>),
    /// One whitespace-separated word of a `class="a b"` value.
    Class(std::ops::Range<usize>),
    /// `key="v"` / `key=v` → key range, `=` byte index, value range (value
    /// includes surrounding quotes when present).
    KeyValue {
        /// Bytes of the attribute name, ending just before the `=`.
        key: std::ops::Range<usize>,
        /// Byte index of the `=` separator.
        eq: usize,
        /// Bytes of the value, starting right after the `=`; may be empty.
        value: std::ops::Range<usize>,
    },
    /// A valueless attribute (`hidden`) → key range only (projects to `(key,"")`).
    Flag(std::ops::Range<usize>),
}
409
/// Narrow the byte range `[start, end)` of `content` to exclude surrounding
/// quotes.
///
/// When the range begins with `"` or `'`, the opening quote is excluded, and
/// the final byte is excluded as well if it is the same quote character. An
/// unterminated quote therefore loses only its opening quote. Empty or
/// unquoted ranges come back unchanged.
fn html_value_inner_range(content: &str, start: usize, end: usize) -> std::ops::Range<usize> {
    if start >= end {
        return start..end;
    }

    let bytes = content.as_bytes();
    match bytes[start] {
        quote @ (b'"' | b'\'') => {
            // A lone quote character cannot be its own closing quote.
            let closed = end - start >= 2 && bytes[end - 1] == quote;
            let inner_end = if closed { end - 1 } else { end };
            (start + 1)..inner_end
        }
        _ => start..end,
    }
}
425
/// Split `[start, end)` of `content` on ASCII whitespace and return the byte
/// range of every non-empty word, in order.
fn html_word_ranges(content: &str, start: usize, end: usize) -> Vec<std::ops::Range<usize>> {
    let bytes = content.as_bytes();
    let mut words = Vec::new();
    // Start offset of the word currently being read, if any.
    let mut word_start: Option<usize> = None;

    for idx in start..end {
        let is_space = bytes[idx].is_ascii_whitespace();
        match (word_start, is_space) {
            (None, false) => word_start = Some(idx),
            (Some(begin), true) => {
                words.push(begin..idx);
                word_start = None;
            }
            _ => {}
        }
    }

    // A word that runs up to `end` is closed there.
    if let Some(begin) = word_start {
        words.push(begin..end);
    }
    words
}
446
447/// Scan an HTML attribute body into [`HtmlAttrComponent`]s in source order.
448/// Recognizes `id="x"`, `class="a b"` (split per word), `key="v"`/`key=v`, and
449/// valueless flags. Bytes that aren't part of a component (attribute names,
450/// `=`, quotes, whitespace, `/`) are recovered by the emitter from the gaps.
451fn html_attribute_spans(content: &str) -> Vec<HtmlAttrComponent> {
452    let bytes = content.as_bytes();
453    let mut i = 0usize;
454    let mut comps: Vec<HtmlAttrComponent> = Vec::new();
455    let mut have_id = false;
456
457    while i < bytes.len() {
458        match bytes[i] {
459            b' ' | b'\t' | b'\n' | b'\r' | b'/' => {
460                i += 1;
461            }
462            _ => {
463                let key_start = i;
464                while i < bytes.len()
465                    && !matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r' | b'=' | b'/')
466                {
467                    i += 1;
468                }
469                let key_end = i;
470                let key = &content[key_start..key_end];
471
472                if i < bytes.len() && bytes[i] == b'=' {
473                    let eq = i;
474                    i += 1; // skip '='
475                    let value_start = i;
476                    if i < bytes.len() && (bytes[i] == b'"' || bytes[i] == b'\'') {
477                        let quote = bytes[i];
478                        i += 1; // opening quote
479                        while i < bytes.len() && bytes[i] != quote {
480                            i += 1;
481                        }
482                        if i < bytes.len() {
483                            i += 1; // closing quote
484                        }
485                    } else {
486                        while i < bytes.len()
487                            && !matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r' | b'/')
488                        {
489                            i += 1;
490                        }
491                    }
492                    let value_end = i;
493                    match key {
494                        "id" => {
495                            if !have_id {
496                                let inner = html_value_inner_range(content, value_start, value_end);
497                                if inner.end > inner.start {
498                                    comps.push(HtmlAttrComponent::Id(inner));
499                                    have_id = true;
500                                }
501                            }
502                        }
503                        "class" => {
504                            let inner = html_value_inner_range(content, value_start, value_end);
505                            for w in html_word_ranges(content, inner.start, inner.end) {
506                                comps.push(HtmlAttrComponent::Class(w));
507                            }
508                        }
509                        _ => comps.push(HtmlAttrComponent::KeyValue {
510                            key: key_start..key_end,
511                            eq,
512                            value: value_start..value_end,
513                        }),
514                    }
515                } else if key_end > key_start {
516                    comps.push(HtmlAttrComponent::Flag(key_start..key_end));
517                }
518            }
519        }
520    }
521
522    comps
523}
524
525/// Emit a structural `HTML_ATTRS` node, wrapping the source bytes of each
526/// recognized HTML attribute in `ATTR_ID` / `ATTR_CLASS` / `ATTR_KEY_VALUE`
527/// children (bare values — HTML has no `#`/`.` marker). Bytes between/around
528/// components (names, `=`, quotes, whitespace, `/`) become gap tokens, so
529/// `node.text()` is exactly `attrs_text`. An unrecognized/empty body falls back
530/// to a single opaque `TEXT` token.
531pub fn emit_html_attrs_node(builder: &mut GreenNodeBuilder, attrs_text: &str) {
532    emit_html_attrs_with_kind(builder, SyntaxKind::HTML_ATTRS, attrs_text);
533}
534
535/// As [`emit_html_attrs_node`] but for the legacy native-span `SPAN_ATTRIBUTES`
536/// node, which carries HTML `class="..."` syntax (not Pandoc `{...}`).
537pub fn emit_html_span_attributes_node(builder: &mut GreenNodeBuilder, attrs_text: &str) {
538    emit_html_attrs_with_kind(builder, SyntaxKind::SPAN_ATTRIBUTES, attrs_text);
539}
540
541fn emit_html_attrs_with_kind(
542    builder: &mut GreenNodeBuilder,
543    node_kind: SyntaxKind,
544    attrs_text: &str,
545) {
546    builder.start_node(node_kind.into());
547    let comps = html_attribute_spans(attrs_text);
548    if comps.is_empty() {
549        builder.token(SyntaxKind::TEXT.into(), attrs_text);
550    } else {
551        let mut cursor = 0usize;
552        for comp in &comps {
553            let (start, end) = match comp {
554                HtmlAttrComponent::Id(r)
555                | HtmlAttrComponent::Class(r)
556                | HtmlAttrComponent::Flag(r) => (r.start, r.end),
557                HtmlAttrComponent::KeyValue { key, value, .. } => (key.start, value.end),
558            };
559            emit_attribute_gap(builder, &attrs_text[cursor..start]);
560            match comp {
561                HtmlAttrComponent::Id(r) => {
562                    builder.token(SyntaxKind::ATTR_ID.into(), &attrs_text[r.clone()]);
563                }
564                HtmlAttrComponent::Class(r) => {
565                    builder.token(SyntaxKind::ATTR_CLASS.into(), &attrs_text[r.clone()]);
566                }
567                HtmlAttrComponent::Flag(r) => {
568                    builder.start_node(SyntaxKind::ATTR_KEY_VALUE.into());
569                    builder.token(SyntaxKind::ATTR_KEY.into(), &attrs_text[r.clone()]);
570                    builder.finish_node();
571                }
572                HtmlAttrComponent::KeyValue { key, eq, value } => {
573                    builder.start_node(SyntaxKind::ATTR_KEY_VALUE.into());
574                    builder.token(SyntaxKind::ATTR_KEY.into(), &attrs_text[key.clone()]);
575                    builder.token(SyntaxKind::TEXT.into(), &attrs_text[*eq..value.start]);
576                    if value.end > value.start {
577                        builder.token(SyntaxKind::ATTR_VALUE.into(), &attrs_text[value.clone()]);
578                    }
579                    builder.finish_node();
580                }
581            }
582            cursor = end;
583        }
584        emit_attribute_gap(builder, &attrs_text[cursor..]);
585    }
586    builder.finish_node();
587}
588
589/// Emit a Pandoc `{...}` ATTRIBUTE node by STRUCTURING the raw source slice
590/// into ATTR_* children that wrap the original bytes (no synthesis). Markers
591/// and quotes stay inside their tokens; whitespace/newlines between components,
592/// and any bytes the scanner skips (duplicate `#id`, malformed tokens), become
593/// standalone WHITESPACE/NEWLINE/TEXT tokens — so `node.text()` is exactly the
594/// source slice. Non-`{...}`-shaped or unrecognized input (MMD `[#id]` header
595/// brackets, raw-inline `{=format}`, empty `{}`) falls back to a single opaque
596/// ATTRIBUTE token, preserving the prior shape.
597pub fn emit_attribute_node(builder: &mut GreenNodeBuilder, raw_attr_text: &str) {
598    emit_attribute_node_with_kinds(
599        builder,
600        SyntaxKind::ATTRIBUTE,
601        SyntaxKind::ATTRIBUTE,
602        raw_attr_text,
603    );
604}
605
606/// Emit a fenced-div `DIV_INFO` node, structuring the Pandoc `{...}` body the
607/// same way [`emit_attribute_node`] does. Bare-word shorthand (`::: Warning`)
608/// and malformed/empty bodies fall back to a single opaque `TEXT` token,
609/// preserving the prior `DIV_INFO { TEXT(...) }` shape (and the bare-word
610/// class semantics the projector reads via `parse_div_info`).
611pub fn emit_div_info_node(builder: &mut GreenNodeBuilder, raw_attr_text: &str) {
612    emit_attribute_node_with_kinds(
613        builder,
614        SyntaxKind::DIV_INFO,
615        SyntaxKind::TEXT,
616        raw_attr_text,
617    );
618}
619
620/// Emit a bracketed-span `SPAN_ATTRIBUTES` node, structuring the Pandoc `{...}`
621/// body the same way [`emit_attribute_node`] does. Malformed/empty bodies fall
622/// back to a single opaque `TEXT` token, preserving the prior
623/// `SPAN_ATTRIBUTES { TEXT(...) }` shape.
624pub fn emit_span_attributes_node(builder: &mut GreenNodeBuilder, raw_attr_text: &str) {
625    emit_attribute_node_with_kinds(
626        builder,
627        SyntaxKind::SPAN_ATTRIBUTES,
628        SyntaxKind::TEXT,
629        raw_attr_text,
630    );
631}
632
633/// Shared structuring core for attribute-bearing nodes. `node_kind` is the outer
634/// wrapper (`ATTRIBUTE`, `DIV_INFO`, …); `opaque_token_kind` is the single token
635/// the non-`{...}`/unrecognized fallback emits (so each caller keeps its prior
636/// opaque shape). The structured `{...}` path is identical across callers.
637fn emit_attribute_node_with_kinds(
638    builder: &mut GreenNodeBuilder,
639    node_kind: SyntaxKind,
640    opaque_token_kind: SyntaxKind,
641    raw_attr_text: &str,
642) {
643    builder.start_node(node_kind.into());
644
645    let body = raw_attr_text
646        .strip_prefix('{')
647        .and_then(|s| s.strip_suffix('}'));
648    let spans = body.and_then(attribute_content_spans);
649
650    match (body, spans) {
651        (Some(body), Some(spans)) => {
652            builder.token(SyntaxKind::TEXT.into(), "{");
653            let mut cursor = 0usize;
654            for comp in &spans.components {
655                let (start, end) = match comp {
656                    AttrComponent::Id(r) | AttrComponent::Class(r) => (r.start, r.end),
657                    AttrComponent::KeyValue { key, value, .. } => (key.start, value.end),
658                };
659                emit_attribute_gap(builder, &body[cursor..start]);
660                match comp {
661                    AttrComponent::Id(r) => {
662                        builder.token(SyntaxKind::ATTR_ID.into(), &body[r.clone()]);
663                    }
664                    AttrComponent::Class(r) => {
665                        builder.token(SyntaxKind::ATTR_CLASS.into(), &body[r.clone()]);
666                    }
667                    AttrComponent::KeyValue { key, eq, value } => {
668                        builder.start_node(SyntaxKind::ATTR_KEY_VALUE.into());
669                        builder.token(SyntaxKind::ATTR_KEY.into(), &body[key.clone()]);
670                        builder.token(SyntaxKind::TEXT.into(), &body[*eq..*eq + 1]);
671                        if value.end > value.start {
672                            builder.token(SyntaxKind::ATTR_VALUE.into(), &body[value.clone()]);
673                        }
674                        builder.finish_node();
675                    }
676                }
677                cursor = end;
678            }
679            emit_attribute_gap(builder, &body[cursor..]);
680            builder.token(SyntaxKind::TEXT.into(), "}");
681        }
682        _ => {
683            // Opaque fallback: keep the whole slice as one token of the
684            // caller's chosen kind, preserving the prior shape.
685            builder.token(opaque_token_kind.into(), raw_attr_text);
686        }
687    }
688
689    builder.finish_node();
690}
691
692/// Emit the bytes between/around structured attribute components, splitting on
693/// newline boundaries: `\n`/`\r\n`/`\r` → NEWLINE, other whitespace runs →
694/// WHITESPACE, non-whitespace runs → TEXT. Every byte is preserved.
695fn emit_attribute_gap(builder: &mut GreenNodeBuilder, gap: &str) {
696    let bytes = gap.as_bytes();
697    let mut i = 0;
698    while i < bytes.len() {
699        match bytes[i] {
700            b'\n' => {
701                builder.token(SyntaxKind::NEWLINE.into(), "\n");
702                i += 1;
703            }
704            b'\r' => {
705                if i + 1 < bytes.len() && bytes[i + 1] == b'\n' {
706                    builder.token(SyntaxKind::NEWLINE.into(), "\r\n");
707                    i += 2;
708                } else {
709                    builder.token(SyntaxKind::NEWLINE.into(), "\r");
710                    i += 1;
711                }
712            }
713            b if b.is_ascii_whitespace() => {
714                let start = i;
715                while i < bytes.len()
716                    && bytes[i].is_ascii_whitespace()
717                    && bytes[i] != b'\n'
718                    && bytes[i] != b'\r'
719                {
720                    i += 1;
721                }
722                builder.token(SyntaxKind::WHITESPACE.into(), &gap[start..i]);
723            }
724            _ => {
725                let start = i;
726                while i < bytes.len() && !bytes[i].is_ascii_whitespace() {
727                    i += 1;
728                }
729                builder.token(SyntaxKind::TEXT.into(), &gap[start..i]);
730            }
731        }
732    }
733}
734
#[cfg(test)]
mod tests {
    //! Unit tests for trailing `{...}` attribute parsing, HTML tag attribute
    //! parsing, and the structured attribute emitters (which must reproduce
    //! their input byte-for-byte).

    use super::*;

    /// Parses `text` with [`try_parse_trailing_attributes`] and returns the
    /// `(attributes, text_before)` pair.
    ///
    /// Panics, naming the offending input, when no attribute block is
    /// recognized. `#[track_caller]` makes the panic point at the calling test
    /// rather than at this helper.
    #[track_caller]
    fn parse_trailing(text: &str) -> (AttributeBlock, &str) {
        match try_parse_trailing_attributes(text) {
            Some(parsed) => parsed,
            None => panic!("expected a trailing attribute block in {text:?}"),
        }
    }

    /// Builds an owned `(key, value)` pair in the shape stored by
    /// `AttributeBlock::key_values`.
    fn kv(key: &str, value: &str) -> (String, String) {
        (key.to_owned(), value.to_owned())
    }

    /// Counts the direct children of `node` (child nodes and tokens alike)
    /// whose kind is `kind`.
    fn count_elements(node: &crate::syntax::SyntaxNode, kind: SyntaxKind) -> usize {
        node.children_with_tokens()
            .filter(|element| element.kind() == kind)
            .count()
    }

    /// Counts the direct child *nodes* of `node` (tokens excluded) whose kind
    /// is `kind`.
    fn count_child_nodes(node: &crate::syntax::SyntaxNode, kind: SyntaxKind) -> usize {
        node.children()
            .filter(|child| child.kind() == kind)
            .count()
    }

    /// A lone `#id` yields the identifier and nothing else.
    #[test]
    fn test_simple_id() {
        let (attrs, before) = parse_trailing("Heading {#my-id}");
        assert_eq!(before, "Heading");
        assert_eq!(attrs.identifier.as_deref(), Some("my-id"));
        assert!(attrs.classes.is_empty());
        assert!(attrs.key_values.is_empty());
    }

    /// A lone `.class` is collected into `classes`.
    #[test]
    fn test_single_class() {
        let (attrs, _) = parse_trailing("Text {.myclass}");
        assert_eq!(attrs.classes, ["myclass"]);
    }

    /// Several classes are collected in source order.
    #[test]
    fn test_multiple_classes() {
        let (attrs, _) = parse_trailing("Text {.class1 .class2 .class3}");
        assert_eq!(attrs.classes, ["class1", "class2", "class3"]);
    }

    /// An unquoted `key=value` pair is parsed.
    #[test]
    fn test_key_value_unquoted() {
        let (attrs, _) = parse_trailing("Text {key=value}");
        assert_eq!(attrs.key_values, vec![kv("key", "value")]);
    }

    /// A double-quoted value keeps its inner spaces and loses the quotes.
    #[test]
    fn test_key_value_quoted() {
        let (attrs, _) = parse_trailing("Text {key=\"value with spaces\"}");
        assert_eq!(attrs.key_values, vec![kv("key", "value with spaces")]);
    }

    /// Identifier, classes and key/value pairs can all appear in one block.
    #[test]
    fn test_full_attributes() {
        let (attrs, before) =
            parse_trailing("Heading {#id .class1 .class2 key1=val1 key2=\"val 2\"}");
        assert_eq!(before, "Heading");
        assert_eq!(attrs.identifier.as_deref(), Some("id"));
        assert_eq!(attrs.classes, ["class1", "class2"]);
        // Whole-vector equality checks both the length and each pair.
        assert_eq!(
            attrs.key_values,
            vec![kv("key1", "val1"), kv("key2", "val 2")]
        );
    }

    /// Braces inside a quoted value (here a `{{< ... >}}` shortcode) must not
    /// confuse the search for the attribute block's opening brace.
    #[test]
    fn test_trailing_attributes_with_shortcode_in_quoted_value() {
        let text = "Slide Title {background-image='{{< placeholder 100 100 >}}' background-size=\"100px\"}";
        let (attrs, before) = parse_trailing(text);
        assert_eq!(before, "Slide Title");
        assert_eq!(
            attrs.key_values,
            vec![
                kv("background-image", "{{< placeholder 100 100 >}}"),
                kv("background-size", "100px"),
            ]
        );
    }

    /// Text without a trailing `{...}` produces no attributes.
    #[test]
    fn test_no_attributes() {
        assert!(try_parse_trailing_attributes("Heading with no attributes").is_none());
    }

    /// An empty `{}` is not accepted as an attribute block.
    #[test]
    fn test_empty_braces() {
        assert!(try_parse_trailing_attributes("Heading {}").is_none());
    }

    /// When several `#id`s are given, the first one wins.
    #[test]
    fn test_only_first_id_counts() {
        let (attrs, _) = parse_trailing("Text {#id1 #id2}");
        assert_eq!(attrs.identifier.as_deref(), Some("id1"));
    }

    /// Extra whitespace between and around items is tolerated.
    #[test]
    fn test_whitespace_handling() {
        let (attrs, _) = parse_trailing("Text {  #id   .class   key=val  }");
        assert_eq!(attrs.identifier.as_deref(), Some("id"));
        assert_eq!(attrs.classes, ["class"]);
        assert_eq!(attrs.key_values, vec![kv("key", "val")]);
    }

    /// An HTML open tag carrying only `id` yields just the identifier.
    #[test]
    fn test_parse_html_tag_attributes_id_only() {
        let attrs = parse_html_tag_attributes(r#"<div id="anchor-c">"#)
            .expect("open tag with an id attribute should parse");
        assert_eq!(attrs.identifier.as_deref(), Some("anchor-c"));
        assert!(attrs.classes.is_empty());
        assert!(attrs.key_values.is_empty());
    }

    #[test]
    fn test_parse_html_tag_attributes_inline_content_after_open() {
        // For a same-line block `<div id="x">Content</div>`, the entire
        // line is in the HTML_BLOCK_TAG. The parser must terminate at the
        // first unquoted `>` and ignore the trailing content + close tag.
        let attrs = parse_html_tag_attributes(r#"<div id="anchor-c">Content.</div>"#)
            .expect("open tag followed by inline content should parse");
        assert_eq!(attrs.identifier.as_deref(), Some("anchor-c"));
    }

    /// `class` is split on whitespace into classes; other attributes become
    /// key/value pairs.
    #[test]
    fn test_parse_html_tag_attributes_class_and_kv() {
        let attrs = parse_html_tag_attributes(r#"<div id="x" class="a b" data-key="v">"#)
            .expect("open tag with id, class and data attribute should parse");
        assert_eq!(attrs.identifier.as_deref(), Some("x"));
        assert_eq!(attrs.classes, ["a", "b"]);
        assert_eq!(attrs.key_values, vec![kv("data-key", "v")]);
    }

    /// A tag without any attributes yields `None`.
    #[test]
    fn test_parse_html_tag_attributes_no_attrs() {
        assert!(parse_html_tag_attributes("<div>").is_none());
    }

    /// Whitespace between the text and the attribute block is trimmed from
    /// the returned prefix.
    #[test]
    fn test_trailing_whitespace_before_attrs() {
        let (_, before) = parse_trailing("Heading   {#id}");
        assert_eq!(before, "Heading");
    }

    /// Regression: the inline-code attribute path used to reconstruct a
    /// normalized `{...}` string (reordering id-first, force-quoting values),
    /// which inflated the CST past the input and broke losslessness. The
    /// structured emitter must wrap the original bytes verbatim.
    #[test]
    fn inline_code_attribute_is_lossless() {
        let input = "`code`{.r #x key=v}\n";
        let tree = crate::parse(input, None);
        assert_eq!(tree.text().to_string(), input);
    }

    /// Runs `emit_attribute_node` over `raw` and returns the finished tree's
    /// root node.
    fn structured_attr(raw: &str) -> crate::syntax::SyntaxNode {
        let mut builder = GreenNodeBuilder::new();
        emit_attribute_node(&mut builder, raw);
        crate::syntax::SyntaxNode::new_root(builder.finish())
    }

    #[test]
    fn emit_attribute_node_is_lossless_over_shapes() {
        // Interior whitespace, duplicate id, malformed/empty bodies, mixed
        // quotes, and `=format` must all round-trip byte-for-byte.
        let shapes = [
            "{#id}",
            "{.a .b}",
            "{key=\"v w\"}",
            "{ #id  .c }",
            "{#id1 #id2}",
            "{key}",
            "{=html}",
            "{#id .a key=v key2='x'}",
            "{key=}",
            "{}",
            "{   }",
        ];
        for raw in shapes {
            let node = structured_attr(raw);
            assert_eq!(node.text().to_string(), raw, "lossless emit for {raw:?}");
            assert_eq!(node.kind(), SyntaxKind::ATTRIBUTE);
        }
    }

    /// The emitted ATTRIBUTE node exposes one child per id, class and
    /// key/value item.
    #[test]
    fn emit_attribute_node_structures_children() {
        let node = structured_attr("{#x .a .b k=v}");
        assert_eq!(count_elements(&node, SyntaxKind::ATTR_ID), 1);
        assert_eq!(count_elements(&node, SyntaxKind::ATTR_CLASS), 2);
        assert_eq!(count_elements(&node, SyntaxKind::ATTR_KEY_VALUE), 1);
    }

    /// Runs `emit_html_attrs_node` over `raw` and returns the finished tree's
    /// root node.
    fn structured_html_attrs(raw: &str) -> crate::syntax::SyntaxNode {
        let mut builder = GreenNodeBuilder::new();
        emit_html_attrs_node(&mut builder, raw);
        crate::syntax::SyntaxNode::new_root(builder.finish())
    }

    /// Every HTML attribute-list shape below must round-trip byte-for-byte
    /// under an HTML_ATTRS root.
    #[test]
    fn emit_html_attrs_node_is_lossless_over_shapes() {
        let shapes = [
            r#"id="x""#,
            r#"id="x" class="a b" data-key="v""#,
            r#"class='a  b'"#,
            r#"id=bare class=one"#,
            "hidden",
            r#"id="x" hidden data-n="1""#,
            r#"  id="x"  /"#,
            r#"id="""#,
            "",
            "   ",
        ];
        for raw in shapes {
            let node = structured_html_attrs(raw);
            assert_eq!(node.text().to_string(), raw, "lossless emit for {raw:?}");
            assert_eq!(node.kind(), SyntaxKind::HTML_ATTRS);
        }
    }

    /// The emitted HTML_ATTRS node exposes id, class and key/value children.
    #[test]
    fn emit_html_attrs_node_structures_children() {
        let node = structured_html_attrs(r#"id="x" class="a b" data-key="v" hidden"#);
        assert_eq!(count_elements(&node, SyntaxKind::ATTR_ID), 1);
        assert_eq!(
            count_elements(&node, SyntaxKind::ATTR_CLASS),
            2,
            "class=\"a b\" splits into two ATTR_CLASS tokens"
        );
        // `data-key="v"` and the `hidden` flag are both ATTR_KEY_VALUE nodes,
        // so only child nodes (not tokens) are counted here.
        assert_eq!(count_child_nodes(&node, SyntaxKind::ATTR_KEY_VALUE), 2);
    }

    /// The structured walker and the string-deriving parser must agree.
    ///
    /// Pins the values `parse_html_attribute_list` derives from a mixed list
    /// (quoted values, a multi-class `class`, a bare flag), and that a
    /// whitespace-only list or a lone empty `id` yields `None`.
    #[test]
    fn html_attribute_list_parse_parity() {
        let attrs = parse_html_attribute_list(r#"id="x" class="a b" data-key='v w' hidden"#)
            .expect("mixed attribute list should parse");
        assert_eq!(attrs.identifier.as_deref(), Some("x"));
        assert_eq!(attrs.classes, ["a", "b"]);
        // A bare flag such as `hidden` is recorded with an empty value.
        assert_eq!(
            attrs.key_values,
            vec![kv("data-key", "v w"), kv("hidden", "")]
        );
        assert!(parse_html_attribute_list("   ").is_none());
        assert!(parse_html_attribute_list(r#"id="""#).is_none());
    }

    /// Runs `emit_div_info_node` over `raw` and returns the finished tree's
    /// root node.
    fn structured_div_info(raw: &str) -> crate::syntax::SyntaxNode {
        let mut builder = GreenNodeBuilder::new();
        emit_div_info_node(&mut builder, raw);
        crate::syntax::SyntaxNode::new_root(builder.finish())
    }

    #[test]
    fn emit_div_info_node_is_lossless_and_structures_brace_body() {
        // `{...}` bodies structure into ATTR_* children; bare-word shorthand
        // and malformed/empty bodies stay one opaque TEXT token. All round-trip.
        let braced = "{#id .a .b key=val key2=\"v w\"}";
        for raw in [braced, "Warning", "{}", "{   }"] {
            let node = structured_div_info(raw);
            assert_eq!(node.text().to_string(), raw, "lossless emit for {raw:?}");
            assert_eq!(node.kind(), SyntaxKind::DIV_INFO);
        }

        let structured = structured_div_info(braced);
        assert_eq!(count_elements(&structured, SyntaxKind::ATTR_ID), 1);
        assert_eq!(count_elements(&structured, SyntaxKind::ATTR_CLASS), 2);
        assert_eq!(count_elements(&structured, SyntaxKind::ATTR_KEY_VALUE), 2);

        // Bare-word fallback: a single opaque TEXT token, no ATTR_* children.
        let bare = structured_div_info("Warning");
        let bare_kinds: Vec<_> = bare.children_with_tokens().map(|c| c.kind()).collect();
        assert_eq!(bare_kinds, vec![SyntaxKind::TEXT]);
    }
}