Skip to main content

panache_parser/parser/utils/
attributes.rs

1//! Parsing for Pandoc-style attributes: {#id .class key=value}
2//!
3//! Attributes can appear after headings, fenced code blocks, fenced divs, etc.
4//! Syntax: {#identifier .class1 .class2 key1=val1 key2="val2"}
5//!
6//! Rules:
7//! - Surrounded by { }
8//! - Identifier: #id (optional, only first one counts)
9//! - Classes: .class (can have multiple)
10//! - Key-value pairs: key=value or key="value" or key='value' (can have multiple)
11//! - Whitespace flexible between items
12
13use crate::syntax::SyntaxKind;
14use rowan::GreenNodeBuilder;
15
/// The parsed contents of one attribute block.
///
/// Produced both from Pandoc-style brace attributes (`{#id .class key=value}`)
/// and from raw HTML opening tags, so consumers can treat the two sources
/// uniformly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct AttributeBlock {
    /// The identifier (`#id` in brace syntax, `id="..."` in HTML), if any.
    /// When several identifiers are written, only the first one is kept.
    pub identifier: Option<String>,
    /// Class names in source order, without the leading `.`. A raw-format
    /// specifier such as `{=html}` is stored here with its `=` prefix kept
    /// (i.e. as `"=html"`).
    pub classes: Vec<String>,
    /// `key=value` pairs in source order, with surrounding quotes removed
    /// from the value.
    pub key_values: Vec<(String, String)>,
}
22
23/// Try to parse an attribute block from the end of a string
24/// Returns: (attribute_block, text_before_attributes)
25pub fn try_parse_trailing_attributes(text: &str) -> Option<(AttributeBlock, &str)> {
26    let (attrs, before, _) = try_parse_trailing_attributes_with_pos(text)?;
27    Some((attrs, before))
28}
29
30/// Try to parse an attribute block from the end of a string.
31/// Returns: (attribute_block, text_before_attributes, open_brace_position_in_trimmed_text)
32pub fn try_parse_trailing_attributes_with_pos(text: &str) -> Option<(AttributeBlock, &str, usize)> {
33    let trimmed = text.trim_end();
34
35    // Must end with }
36    if !trimmed.ends_with('}') {
37        return None;
38    }
39
40    // Find matching opening brace for the trailing attribute block, accounting
41    // for braces inside quoted attribute values.
42    let open_brace = find_matching_open_brace_for_trailing_block(trimmed)?;
43
44    // Check if this is a bracketed span like [text]{.class} rather than a heading attribute
45    // If the { is immediately after ] (with optional whitespace), this should be parsed as a span
46    let before_brace = &trimmed[..open_brace];
47    if before_brace.trim_end().ends_with(']') {
48        log::trace!("Skipping attribute parsing for bracketed span: {}", text);
49        return None;
50    }
51
52    // Parse the content between { and }
53    let attr_content = &trimmed[open_brace + 1..trimmed.len() - 1];
54    let attr_block = parse_attribute_content(attr_content)?;
55
56    // Get text before attributes (trim trailing whitespace)
57    let before_attrs = trimmed[..open_brace].trim_end();
58
59    Some((attr_block, before_attrs, open_brace))
60}
61
/// Find the byte offset of the `{` that pairs with the final `}` of `text`.
///
/// The whole string is scanned left to right while tracking brace nesting.
/// Inside a brace group, `'...'` and `"..."` are treated as quoted attribute
/// values: braces within them are ignored and a backslash escapes the next
/// character. Quote characters *outside* any brace group are ordinary text,
/// so an apostrophe or inch mark in a heading (`Don't panic {#id}`) does not
/// prevent the trailing block from being found.
///
/// Returns `None` when:
/// - `text` does not end with `}`;
/// - a `}` appears with no open `{` to pair with;
/// - a `{` or a quoted value is still open at the end of the text.
fn find_matching_open_brace_for_trailing_block(text: &str) -> Option<usize> {
    if !text.ends_with('}') {
        return None;
    }

    // `}` is a single byte, so the final brace starts at `len - 1`.
    let last_idx = text.len() - 1;

    let mut stack: Vec<usize> = Vec::new();
    let mut in_quote: Option<char> = None;
    let mut escaped = false;
    let mut end_brace_open = None;

    for (idx, ch) in text.char_indices() {
        if let Some(q) = in_quote {
            if escaped {
                // The character after a backslash is taken literally.
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == q {
                in_quote = None;
            }
            continue;
        }

        match ch {
            // Quotes only delimit attribute values, which live inside
            // braces; elsewhere they are plain punctuation.
            '\'' | '"' if !stack.is_empty() => in_quote = Some(ch),
            '{' => stack.push(idx),
            '}' => {
                let open = stack.pop()?;
                if idx == last_idx {
                    end_brace_open = Some(open);
                }
            }
            _ => {}
        }
    }

    if in_quote.is_some() || !stack.is_empty() {
        return None;
    }

    end_brace_open
}
107
108/// Parse the content inside the attribute braces
109pub fn parse_attribute_content(content: &str) -> Option<AttributeBlock> {
110    let mut identifier = None;
111    let mut classes = Vec::new();
112    let mut key_values = Vec::new();
113
114    let content = content.trim();
115    if content.is_empty() {
116        return None; // Empty {} is not valid
117    }
118
119    let mut pos = 0;
120    let bytes = content.as_bytes();
121
122    while pos < bytes.len() {
123        // Skip whitespace
124        while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
125            pos += 1;
126        }
127
128        if pos >= bytes.len() {
129            break;
130        }
131
132        // Check what kind of attribute this is
133        if bytes[pos] == b'=' {
134            // Special case: {=format} for raw attributes
135            // This is treated as a class ".=format" for compatibility
136            pos += 1; // Skip =
137            let start = pos;
138            while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() && bytes[pos] != b'}' {
139                pos += 1;
140            }
141            if pos > start {
142                // Store as "=format" class (with the = prefix)
143                classes.push(format!("={}", &content[start..pos]));
144            }
145        } else if bytes[pos] == b'#' {
146            // Identifier (only take first one)
147            if identifier.is_none() {
148                pos += 1; // Skip #
149                let start = pos;
150                while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() && bytes[pos] != b'}' {
151                    pos += 1;
152                }
153                if pos > start {
154                    identifier = Some(content[start..pos].to_string());
155                }
156            } else {
157                // Skip duplicate identifiers
158                pos += 1;
159                while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() && bytes[pos] != b'}' {
160                    pos += 1;
161                }
162            }
163        } else if bytes[pos] == b'.' {
164            // Class
165            pos += 1; // Skip .
166            let start = pos;
167            while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() && bytes[pos] != b'}' {
168                pos += 1;
169            }
170            if pos > start {
171                classes.push(content[start..pos].to_string());
172            }
173        } else {
174            // Key-value pair
175            let key_start = pos;
176            while pos < bytes.len() && bytes[pos] != b'=' && !bytes[pos].is_ascii_whitespace() {
177                pos += 1;
178            }
179
180            if pos >= bytes.len() || bytes[pos] != b'=' {
181                // Not a valid key=value, skip this token
182                while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() {
183                    pos += 1;
184                }
185                continue;
186            }
187
188            let key = content[key_start..pos].to_string();
189            pos += 1; // Skip =
190
191            // Parse value (may be quoted)
192            let value = if pos < bytes.len() && (bytes[pos] == b'"' || bytes[pos] == b'\'') {
193                let quote = bytes[pos];
194                pos += 1; // Skip opening quote
195                let val_start = pos;
196                while pos < bytes.len() && bytes[pos] != quote {
197                    pos += 1;
198                }
199                let val = content[val_start..pos].to_string();
200                if pos < bytes.len() {
201                    pos += 1; // Skip closing quote
202                }
203                val
204            } else {
205                // Unquoted value
206                let val_start = pos;
207                while pos < bytes.len() && !bytes[pos].is_ascii_whitespace() && bytes[pos] != b'}' {
208                    pos += 1;
209                }
210                content[val_start..pos].to_string()
211            };
212
213            if !key.is_empty() {
214                key_values.push((key, value));
215            }
216        }
217    }
218
219    // At least one attribute must be present
220    if identifier.is_none() && classes.is_empty() && key_values.is_empty() {
221        return None;
222    }
223
224    Some(AttributeBlock {
225        identifier,
226        classes,
227        key_values,
228    })
229}
230
231/// Parse HTML-style attributes from a raw HTML opening tag text such as
232/// `<div id="x" class="a b" data-key="v">`, returning the same
233/// `AttributeBlock` shape as Pandoc-style brace attributes. Whitespace-
234/// separated `class="..."` is split into individual classes; `id="..."`
235/// becomes the identifier; everything else becomes a key/value pair.
236/// Returns `None` if the tag has no recognized attributes.
237///
238/// Self-closing slashes (`<div .../>`) and trailing whitespace are tolerated.
239/// The leading `<TAG` and trailing `>` are stripped; this routine does not
240/// validate the tag name.
241pub fn parse_html_tag_attributes(tag_text: &str) -> Option<AttributeBlock> {
242    let trimmed = tag_text.trim_start();
243    let after_lt = trimmed.strip_prefix('<')?;
244    // Find the end of the opening tag at the first `>` not inside a quoted
245    // attribute value. Anything after that `>` (e.g. inline content + close
246    // tag for a same-line `<div id="x">Content</div>`) is irrelevant.
247    let bytes = after_lt.as_bytes();
248    let mut tag_end = None;
249    let mut quote: Option<u8> = None;
250    for (i, &b) in bytes.iter().enumerate() {
251        match (quote, b) {
252            (None, b'"') | (None, b'\'') => quote = Some(b),
253            (Some(q), b2) if b2 == q => quote = None,
254            (None, b'>') => {
255                tag_end = Some(i);
256                break;
257            }
258            _ => {}
259        }
260    }
261    let tag_end = tag_end?;
262    let inner = &after_lt[..tag_end];
263    // Drop any trailing self-closing slash.
264    let inner = inner.trim_end().trim_end_matches('/').trim_end();
265    // Drop the tag name (alphanumeric run after `<`).
266    let bytes = inner.as_bytes();
267    let mut name_end = 0usize;
268    while name_end < bytes.len()
269        && !bytes[name_end].is_ascii_whitespace()
270        && bytes[name_end] != b'/'
271    {
272        name_end += 1;
273    }
274    let attrs_text = &inner[name_end..];
275    parse_html_attribute_list(attrs_text)
276}
277
278/// Parse a raw HTML attribute list (the bytes between a tag name and the
279/// closing `>`, exclusive). Accepts inputs like `id="x" class="a b"
280/// data-key=v` and produces an [`AttributeBlock`]. Returns `None` if no
281/// recognized attributes are present.
282///
283/// Used by [`parse_html_tag_attributes`] (which strips `<TAG ...>`
284/// surrounding chrome before delegating here) and by
285/// `AttributeNode::id` for the structural `HTML_ATTRS` CST node, whose
286/// text holds JUST the attribute region.
287pub fn parse_html_attribute_list(attrs_text: &str) -> Option<AttributeBlock> {
288    let mut identifier: Option<String> = None;
289    let mut classes: Vec<String> = Vec::new();
290    let mut key_values: Vec<(String, String)> = Vec::new();
291
292    let bytes = attrs_text.as_bytes();
293    let mut i = 0usize;
294    while i < bytes.len() {
295        match bytes[i] {
296            b' ' | b'\t' | b'\n' | b'\r' | b'/' => {
297                i += 1;
298            }
299            _ => {
300                let key_start = i;
301                while i < bytes.len()
302                    && !matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r' | b'=' | b'/')
303                {
304                    i += 1;
305                }
306                let key = &attrs_text[key_start..i];
307                let value = if i < bytes.len() && bytes[i] == b'=' {
308                    i += 1;
309                    if i < bytes.len() && (bytes[i] == b'"' || bytes[i] == b'\'') {
310                        let quote = bytes[i];
311                        i += 1;
312                        let v_start = i;
313                        while i < bytes.len() && bytes[i] != quote {
314                            i += 1;
315                        }
316                        let v = attrs_text[v_start..i].to_string();
317                        if i < bytes.len() {
318                            i += 1;
319                        }
320                        v
321                    } else {
322                        let v_start = i;
323                        while i < bytes.len()
324                            && !matches!(bytes[i], b' ' | b'\t' | b'\n' | b'\r' | b'/')
325                        {
326                            i += 1;
327                        }
328                        attrs_text[v_start..i].to_string()
329                    }
330                } else {
331                    String::new()
332                };
333                if key.is_empty() {
334                    continue;
335                }
336                match key {
337                    "id" => {
338                        if identifier.is_none() && !value.is_empty() {
339                            identifier = Some(value);
340                        }
341                    }
342                    "class" => {
343                        for c in value.split_ascii_whitespace() {
344                            classes.push(c.to_string());
345                        }
346                    }
347                    _ => key_values.push((key.to_string(), value)),
348                }
349            }
350        }
351    }
352
353    if identifier.is_none() && classes.is_empty() && key_values.is_empty() {
354        return None;
355    }
356    Some(AttributeBlock {
357        identifier,
358        classes,
359        key_values,
360    })
361}
362
363/// Emit attribute block as AST nodes
364pub fn emit_attributes(builder: &mut GreenNodeBuilder, attrs: &AttributeBlock) {
365    builder.start_node(SyntaxKind::ATTRIBUTE.into());
366
367    // Build the attribute string to emit
368    let mut attr_str = String::from("{");
369
370    if let Some(ref id) = attrs.identifier {
371        attr_str.push('#');
372        attr_str.push_str(id);
373    }
374
375    for class in &attrs.classes {
376        if attr_str.len() > 1 {
377            attr_str.push(' ');
378        }
379        // Special case: if class starts with =, it's a raw format specifier
380        // Emit as {=format} not {.=format}
381        if class.starts_with('=') {
382            attr_str.push_str(class);
383        } else {
384            attr_str.push('.');
385            attr_str.push_str(class);
386        }
387    }
388
389    for (key, value) in &attrs.key_values {
390        if attr_str.len() > 1 {
391            attr_str.push(' ');
392        }
393        attr_str.push_str(key);
394        attr_str.push('=');
395
396        // Always quote attribute values to match Pandoc's behavior
397        attr_str.push('"');
398        attr_str.push_str(&value.replace('"', "\\\""));
399        attr_str.push('"');
400    }
401
402    attr_str.push('}');
403
404    builder.token(SyntaxKind::ATTRIBUTE.into(), &attr_str);
405    builder.finish_node();
406}
407
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse trailing attributes from `text`, failing the test if there are none.
    fn parse_trailing(text: &str) -> (AttributeBlock, &str) {
        try_parse_trailing_attributes(text).expect("expected a trailing attribute block")
    }

    /// Build an owned key/value pair from string slices.
    fn pair(key: &str, value: &str) -> (String, String) {
        (key.to_string(), value.to_string())
    }

    #[test]
    fn test_simple_id() {
        let (attrs, before) = parse_trailing("Heading {#my-id}");
        assert_eq!(before, "Heading");
        assert_eq!(attrs.identifier, Some("my-id".to_string()));
        assert!(attrs.classes.is_empty());
        assert!(attrs.key_values.is_empty());
    }

    #[test]
    fn test_single_class() {
        let (attrs, _) = parse_trailing("Text {.myclass}");
        assert_eq!(attrs.classes, vec!["myclass"]);
    }

    #[test]
    fn test_multiple_classes() {
        let (attrs, _) = parse_trailing("Text {.class1 .class2 .class3}");
        assert_eq!(attrs.classes, vec!["class1", "class2", "class3"]);
    }

    #[test]
    fn test_key_value_unquoted() {
        let (attrs, _) = parse_trailing("Text {key=value}");
        assert_eq!(attrs.key_values, vec![pair("key", "value")]);
    }

    #[test]
    fn test_key_value_quoted() {
        let (attrs, _) = parse_trailing("Text {key=\"value with spaces\"}");
        assert_eq!(attrs.key_values, vec![pair("key", "value with spaces")]);
    }

    #[test]
    fn test_full_attributes() {
        let (attrs, before) =
            parse_trailing("Heading {#id .class1 .class2 key1=val1 key2=\"val 2\"}");
        assert_eq!(before, "Heading");
        assert_eq!(attrs.identifier, Some("id".to_string()));
        assert_eq!(attrs.classes, vec!["class1", "class2"]);
        assert_eq!(attrs.key_values.len(), 2);
        assert_eq!(attrs.key_values[0], pair("key1", "val1"));
        assert_eq!(attrs.key_values[1], pair("key2", "val 2"));
    }

    #[test]
    fn test_trailing_attributes_with_shortcode_in_quoted_value() {
        // Braces inside the quoted shortcode must not be mistaken for the
        // attribute block's own braces.
        let text = "Slide Title {background-image='{{< placeholder 100 100 >}}' background-size=\"100px\"}";
        let (attrs, before) = parse_trailing(text);
        assert_eq!(before, "Slide Title");
        assert_eq!(attrs.key_values.len(), 2);
        assert_eq!(
            attrs.key_values[0],
            pair("background-image", "{{< placeholder 100 100 >}}")
        );
        assert_eq!(attrs.key_values[1], pair("background-size", "100px"));
    }

    #[test]
    fn test_no_attributes() {
        assert!(try_parse_trailing_attributes("Heading with no attributes").is_none());
    }

    #[test]
    fn test_empty_braces() {
        assert!(try_parse_trailing_attributes("Heading {}").is_none());
    }

    #[test]
    fn test_only_first_id_counts() {
        let (attrs, _) = parse_trailing("Text {#id1 #id2}");
        assert_eq!(attrs.identifier, Some("id1".to_string()));
    }

    #[test]
    fn test_whitespace_handling() {
        let (attrs, _) = parse_trailing("Text {  #id   .class   key=val  }");
        assert_eq!(attrs.identifier, Some("id".to_string()));
        assert_eq!(attrs.classes, vec!["class"]);
        assert_eq!(attrs.key_values, vec![pair("key", "val")]);
    }

    #[test]
    fn test_parse_html_tag_attributes_id_only() {
        let attrs = parse_html_tag_attributes(r#"<div id="anchor-c">"#)
            .expect("tag with an id should yield attributes");
        assert_eq!(attrs.identifier.as_deref(), Some("anchor-c"));
        assert!(attrs.classes.is_empty());
        assert!(attrs.key_values.is_empty());
    }

    #[test]
    fn test_parse_html_tag_attributes_inline_content_after_open() {
        // For a same-line block `<div id="x">Content</div>`, the entire
        // line is in the HTML_BLOCK_TAG. The parser must terminate at the
        // first unquoted `>` and ignore the trailing content + close tag.
        let attrs = parse_html_tag_attributes(r#"<div id="anchor-c">Content.</div>"#)
            .expect("tag with an id should yield attributes");
        assert_eq!(attrs.identifier.as_deref(), Some("anchor-c"));
    }

    #[test]
    fn test_parse_html_tag_attributes_class_and_kv() {
        let attrs = parse_html_tag_attributes(r#"<div id="x" class="a b" data-key="v">"#)
            .expect("tag with attributes should yield attributes");
        assert_eq!(attrs.identifier.as_deref(), Some("x"));
        assert_eq!(attrs.classes, vec!["a", "b"]);
        assert_eq!(attrs.key_values, vec![pair("data-key", "v")]);
    }

    #[test]
    fn test_parse_html_tag_attributes_no_attrs() {
        assert!(parse_html_tag_attributes("<div>").is_none());
    }

    #[test]
    fn test_trailing_whitespace_before_attrs() {
        let (_, before) = parse_trailing("Heading   {#id}");
        assert_eq!(before, "Heading");
    }
}