Skip to main content

rumdl_lib/utils/
mkdocs_attr_list.rs

1/// MkDocs attr_list extension support
2///
3/// This module provides support for the Python-Markdown attr_list extension,
4/// which allows adding custom attributes to Markdown elements including:
5/// - Custom IDs: `{#custom-id}`
6/// - Classes: `{.my-class}`
7/// - Key-value pairs: `{key="value"}`
8///
9/// ## Syntax
10///
11/// ### Headings with custom anchors
12/// ```markdown
13/// # Heading {#custom-anchor}
14/// # Heading {.class-name}
15/// # Heading {#id .class key=value}
16/// ```
17///
18/// ### Block attributes (on separate line)
19/// ```markdown
20/// Paragraph text here.
21/// {: #id .class }
22/// ```
23///
24/// ### Inline attributes
25/// ```markdown
26/// [link text](url){: .external target="_blank" }
27/// *emphasis*{: .special }
28/// ```
29///
30/// ## References
31///
32/// - [Python-Markdown attr_list](https://python-markdown.github.io/extensions/attr_list/)
33/// - [MkDocs Material - Anchor Links](https://squidfunk.github.io/mkdocs-material/reference/annotations/#anchor-links)
34use regex::Regex;
35use std::sync::LazyLock;
36
37/// Pattern to match attr_list syntax: `{: #id .class key="value" }`
38/// The `:` prefix is optional (kramdown style uses it, but attr_list accepts both)
39/// Requirements for valid attr_list:
40/// - Must start with `{` and optional `:` with optional whitespace
41/// - Must contain at least one of: #id, .class, or key="value"
42/// - Must end with `}`
43static ATTR_LIST_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
44    // Pattern requires at least one attribute (id, class, or key=value)
45    // to avoid matching plain text in braces like {word}
46    Regex::new(r#"\{:?\s*(?:(?:[#.][a-zA-Z_][a-zA-Z0-9_-]*|[a-zA-Z_][a-zA-Z0-9_-]*=["'][^"']*["'])\s*)+\}"#).unwrap()
47});
48
49/// Pattern to extract custom ID from attr_list: `#id`
50static CUSTOM_ID_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"#([a-zA-Z_][a-zA-Z0-9_-]*)").unwrap());
51
52/// Pattern to extract classes from attr_list: `.class`
53static CLASS_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\.([a-zA-Z_][a-zA-Z0-9_-]*)").unwrap());
54
55/// Pattern to extract key-value pairs: `key="value"` or `key='value'`
56static KEY_VALUE_PATTERN: LazyLock<Regex> =
57    LazyLock::new(|| Regex::new(r#"([a-zA-Z_][a-zA-Z0-9_-]*)=["']([^"']*)["']"#).unwrap());
58
/// One parsed attr_list occurrence: its optional ID, its classes, its key-value
/// attributes, and where it sits in the line it was found on.
///
/// All fields are plain owned data, so the type is `Eq` as well as `PartialEq`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct AttrList {
    /// Custom ID without the leading `#` (e.g. `custom-id` from `{#custom-id}`)
    pub id: Option<String>,
    /// CSS class names without the leading `.`, in source order
    /// (e.g. `["class1", "class2"]` from `{.class1 .class2}`)
    pub classes: Vec<String>,
    /// Key-value attributes in source order, values without their quotes
    /// (e.g. `[("target", "_blank")]`)
    pub attributes: Vec<(String, String)>,
    /// Offset of the opening `{` in the line (0-indexed)
    pub start: usize,
    /// Offset just past the closing `}` in the line (0-indexed, exclusive)
    pub end: usize,
}

impl AttrList {
    /// Create an `AttrList` with no ID, no classes, no attributes and a `0..0` span.
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns `true` when a custom ID is present.
    #[inline]
    pub fn has_id(&self) -> bool {
        self.id.is_some()
    }

    /// Returns `true` when at least one class is present.
    #[inline]
    pub fn has_classes(&self) -> bool {
        !self.classes.is_empty()
    }

    /// Returns `true` when at least one key-value attribute is present.
    #[inline]
    pub fn has_attributes(&self) -> bool {
        !self.attributes.is_empty()
    }

    /// Returns `true` when there is no ID, no class and no key-value attribute.
    ///
    /// The `start`/`end` span is not considered.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.id.is_none() && self.classes.is_empty() && self.attributes.is_empty()
    }
}
104
105/// Check if a line contains attr_list syntax
106#[inline]
107pub fn contains_attr_list(line: &str) -> bool {
108    // Fast path: check for opening brace first
109    if !line.contains('{') {
110        return false;
111    }
112    ATTR_LIST_PATTERN.is_match(line)
113}
114
115/// Check if a line is a standalone block attr_list (on its own line)
116/// This is used for block-level attributes like:
117/// ```markdown
118/// Paragraph text.
119/// { .class-name }
120/// ```
121/// or with colon:
122/// ```markdown
123/// Paragraph text.
124/// {: .class-name }
125/// ```
126#[inline]
127pub fn is_standalone_attr_list(line: &str) -> bool {
128    let trimmed = line.trim();
129    // Must start with { and end with }
130    if !trimmed.starts_with('{') || !trimmed.ends_with('}') {
131        return false;
132    }
133    // Must be a valid attr_list (not just random braces)
134    ATTR_LIST_PATTERN.is_match(trimmed)
135}
136
137/// Extract all attr_lists from a line
138pub fn find_attr_lists(line: &str) -> Vec<AttrList> {
139    if !line.contains('{') {
140        return Vec::new();
141    }
142
143    let mut results = Vec::new();
144
145    for m in ATTR_LIST_PATTERN.find_iter(line) {
146        let attr_text = m.as_str();
147        let mut attr_list = AttrList {
148            start: m.start(),
149            end: m.end(),
150            ..Default::default()
151        };
152
153        // Extract custom ID (first one wins per HTML spec)
154        if let Some(caps) = CUSTOM_ID_PATTERN.captures(attr_text)
155            && let Some(id_match) = caps.get(1)
156        {
157            attr_list.id = Some(id_match.as_str().to_string());
158        }
159
160        // Extract all classes
161        for caps in CLASS_PATTERN.captures_iter(attr_text) {
162            if let Some(class_match) = caps.get(1) {
163                attr_list.classes.push(class_match.as_str().to_string());
164            }
165        }
166
167        // Extract key-value pairs
168        for caps in KEY_VALUE_PATTERN.captures_iter(attr_text) {
169            if let Some(key) = caps.get(1)
170                && let Some(value) = caps.get(2)
171            {
172                attr_list
173                    .attributes
174                    .push((key.as_str().to_string(), value.as_str().to_string()));
175            }
176        }
177
178        if !attr_list.is_empty() {
179            results.push(attr_list);
180        }
181    }
182
183    results
184}
185
186/// Extract custom ID from a heading line with attr_list syntax
187///
188/// Returns the custom ID if found, or None if no custom ID is present.
189///
190/// # Examples
191/// ```
192/// use rumdl_lib::utils::mkdocs_attr_list::extract_heading_custom_id;
193///
194/// assert_eq!(extract_heading_custom_id("# Heading {#my-id}"), Some("my-id".to_string()));
195/// assert_eq!(extract_heading_custom_id("## Title {#custom .class}"), Some("custom".to_string()));
196/// assert_eq!(extract_heading_custom_id("# No ID here"), None);
197/// ```
198pub fn extract_heading_custom_id(line: &str) -> Option<String> {
199    let attrs = find_attr_lists(line);
200    attrs.into_iter().find_map(|a| a.id)
201}
202
203/// Strip attr_list syntax from a heading text
204///
205/// Returns the heading text without the trailing attr_list.
206///
207/// # Examples
208/// ```
209/// use rumdl_lib::utils::mkdocs_attr_list::strip_attr_list_from_heading;
210///
211/// assert_eq!(strip_attr_list_from_heading("Heading {#my-id}"), "Heading");
212/// assert_eq!(strip_attr_list_from_heading("Title {#id .class}"), "Title");
213/// assert_eq!(strip_attr_list_from_heading("No attributes"), "No attributes");
214/// ```
215pub fn strip_attr_list_from_heading(text: &str) -> String {
216    if let Some(m) = ATTR_LIST_PATTERN.find(text) {
217        // Only strip if at the end of the text (with optional whitespace)
218        let after = &text[m.end()..];
219        if after.trim().is_empty() {
220            return text[..m.start()].trim_end().to_string();
221        }
222    }
223    text.to_string()
224}
225
226/// Check if a position in a line is within an attr_list
227pub fn is_in_attr_list(line: &str, position: usize) -> bool {
228    for m in ATTR_LIST_PATTERN.find_iter(line) {
229        if m.start() <= position && position < m.end() {
230            return true;
231        }
232    }
233    false
234}
235
236/// Extract all custom anchor IDs from a document
237///
238/// This function finds all custom IDs defined using attr_list syntax throughout
239/// the document. These IDs can be used as fragment link targets.
240///
241/// # Arguments
242/// * `content` - The full document content
243///
244/// # Returns
245/// A vector of (custom_id, line_number) tuples, where line_number is 1-indexed
246pub fn extract_all_custom_anchors(content: &str) -> Vec<(String, usize)> {
247    let mut anchors = Vec::new();
248
249    for (line_idx, line) in content.lines().enumerate() {
250        let line_num = line_idx + 1;
251
252        for attr_list in find_attr_lists(line) {
253            if let Some(id) = attr_list.id {
254                anchors.push((id, line_num));
255            }
256        }
257    }
258
259    anchors
260}
261
#[cfg(test)]
mod tests {
    use super::*;

    /// Detection of attr_list syntax anywhere in a line.
    #[test]
    fn test_contains_attr_list() {
        // Valid attr_list syntax
        assert!(contains_attr_list("# Heading {#custom-id}"));
        assert!(contains_attr_list("# Heading {.my-class}"));
        assert!(contains_attr_list("# Heading {#id .class}"));
        assert!(contains_attr_list("Text {: #id}"));
        assert!(contains_attr_list("Link {target=\"_blank\"}"));

        // Not attr_list
        assert!(!contains_attr_list("# Regular heading"));
        assert!(!contains_attr_list("Code with {braces}"));
        assert!(!contains_attr_list("Empty {}"));
        assert!(!contains_attr_list("Just text"));
    }

    /// A lone `#id` is parsed into `id` and nothing else.
    #[test]
    fn test_find_attr_lists_basic() {
        let attrs = find_attr_lists("# Heading {#custom-id}");
        assert_eq!(attrs.len(), 1);
        assert_eq!(attrs[0].id, Some("custom-id".to_string()));
        assert!(attrs[0].classes.is_empty());
    }

    /// A lone `.class` is parsed into `classes` and leaves `id` unset.
    #[test]
    fn test_find_attr_lists_with_class() {
        let attrs = find_attr_lists("# Heading {.highlight}");
        assert_eq!(attrs.len(), 1);
        assert!(attrs[0].id.is_none());
        assert_eq!(attrs[0].classes, vec!["highlight"]);
    }

    /// ID, several classes and a key-value pair in one attr_list.
    #[test]
    fn test_find_attr_lists_complex() {
        let attrs = find_attr_lists("# Heading {#my-id .class1 .class2 data-value=\"test\"}");
        assert_eq!(attrs.len(), 1);
        assert_eq!(attrs[0].id, Some("my-id".to_string()));
        assert_eq!(attrs[0].classes, vec!["class1", "class2"]);
        assert_eq!(
            attrs[0].attributes,
            vec![("data-value".to_string(), "test".to_string())]
        );
    }

    #[test]
    fn test_find_attr_lists_kramdown_style() {
        // With colon prefix (kramdown style)
        let attrs = find_attr_lists("Paragraph {: #para-id .special }");
        assert_eq!(attrs.len(), 1);
        assert_eq!(attrs[0].id, Some("para-id".to_string()));
        assert_eq!(attrs[0].classes, vec!["special"]);
    }

    #[test]
    fn test_extract_heading_custom_id() {
        assert_eq!(
            extract_heading_custom_id("# Heading {#my-anchor}"),
            Some("my-anchor".to_string())
        );
        assert_eq!(
            extract_heading_custom_id("## Title {#title .class}"),
            Some("title".to_string())
        );
        assert_eq!(extract_heading_custom_id("# No ID {.class-only}"), None);
        assert_eq!(extract_heading_custom_id("# Plain heading"), None);
    }

    #[test]
    fn test_strip_attr_list_from_heading() {
        assert_eq!(strip_attr_list_from_heading("Heading {#my-id}"), "Heading");
        assert_eq!(strip_attr_list_from_heading("Title {#id .class}"), "Title");
        assert_eq!(
            strip_attr_list_from_heading("Multi Word Title {#anchor}"),
            "Multi Word Title"
        );
        assert_eq!(strip_attr_list_from_heading("No attributes"), "No attributes");
        // Attr list in middle should not be stripped
        assert_eq!(strip_attr_list_from_heading("Before {#id} after"), "Before {#id} after");
    }

    #[test]
    fn test_is_in_attr_list() {
        // The attr_list `{#my-id}` occupies offsets 10..18 of this line.
        let line = "Some text {#my-id} more text";
        assert!(!is_in_attr_list(line, 0)); // "S"
        assert!(!is_in_attr_list(line, 8)); // "t" (last letter of "text")
        assert!(!is_in_attr_list(line, 9)); // " " just before the opening brace
        assert!(is_in_attr_list(line, 10)); // "{"
        assert!(is_in_attr_list(line, 15)); // "i"
        assert!(is_in_attr_list(line, 17)); // "}" (last offset inside)
        assert!(!is_in_attr_list(line, 18)); // " " just after the closing brace
        assert!(!is_in_attr_list(line, 19)); // "m"
        assert!(!is_in_attr_list(line, 1000)); // far past the end of the line
    }

    #[test]
    fn test_extract_all_custom_anchors() {
        let content = r#"# First Heading {#first}

Some paragraph {: #para-id}

## Second {#second .class}

No ID here.

### Third {.class-only}

{#standalone-id}
"#;
        let anchors = extract_all_custom_anchors(content);

        assert_eq!(anchors.len(), 4);
        assert_eq!(anchors[0], ("first".to_string(), 1));
        assert_eq!(anchors[1], ("para-id".to_string(), 3));
        assert_eq!(anchors[2], ("second".to_string(), 5));
        assert_eq!(anchors[3], ("standalone-id".to_string(), 11));
    }

    #[test]
    fn test_multiple_attr_lists_same_line() {
        let attrs = find_attr_lists("[link]{#link-id} and [other]{#other-id}");
        assert_eq!(attrs.len(), 2);
        assert_eq!(attrs[0].id, Some("link-id".to_string()));
        assert_eq!(attrs[1].id, Some("other-id".to_string()));
    }

    /// `start`/`end` delimit exactly the attr_list text within the line.
    #[test]
    fn test_attr_list_positions() {
        let line = "Text {#my-id} more";
        let attrs = find_attr_lists(line);
        assert_eq!(attrs.len(), 1);
        assert_eq!(attrs[0].start, 5);
        assert_eq!(attrs[0].end, 13);
        assert_eq!(&line[attrs[0].start..attrs[0].end], "{#my-id}");
    }

    #[test]
    fn test_underscore_in_identifiers() {
        let attrs = find_attr_lists("# Heading {#my_custom_id .my_class}");
        assert_eq!(attrs.len(), 1);
        assert_eq!(attrs[0].id, Some("my_custom_id".to_string()));
        assert_eq!(attrs[0].classes, vec!["my_class"]);
    }

    /// Test for issue #337: Standalone attr_lists should be detected
    /// These should be treated as paragraph boundaries in reflow
    #[test]
    fn test_is_standalone_attr_list() {
        // Valid standalone attr_lists (on their own line)
        assert!(is_standalone_attr_list("{ .class-name }"));
        assert!(is_standalone_attr_list("{: .class-name }"));
        assert!(is_standalone_attr_list("{#custom-id}"));
        assert!(is_standalone_attr_list("{: #custom-id .class }"));
        assert!(is_standalone_attr_list("  { .indented }  ")); // With whitespace

        // Not standalone (part of other content)
        assert!(!is_standalone_attr_list("Some text {#id}"));
        assert!(!is_standalone_attr_list("{#id} more text"));
        assert!(!is_standalone_attr_list("# Heading {#id}"));

        // Not valid attr_lists (just braces)
        assert!(!is_standalone_attr_list("{ }"));
        assert!(!is_standalone_attr_list("{}"));
        assert!(!is_standalone_attr_list("{ random text }"));

        // Empty line
        assert!(!is_standalone_attr_list(""));
        assert!(!is_standalone_attr_list("   "));
    }
}