Skip to main content

rumdl_lib/utils/
mkdocs_attr_list.rs

1/// MkDocs attr_list extension support
2///
3/// This module provides support for the Python-Markdown attr_list extension,
4/// which allows adding custom attributes to Markdown elements including:
5/// - Custom IDs: `{#custom-id}`
6/// - Classes: `{.my-class}`
7/// - Key-value pairs: `{key="value"}`
8///
9/// ## Syntax
10///
11/// ### Headings with custom anchors
12/// ```markdown
13/// # Heading {#custom-anchor}
14/// # Heading {.class-name}
15/// # Heading {#id .class key=value}
16/// ```
17///
18/// ### Block attributes (on separate line)
19/// ```markdown
20/// Paragraph text here.
21/// {: #id .class }
22/// ```
23///
24/// ### Inline attributes
25/// ```markdown
26/// [link text](url){: .external target="_blank" }
27/// *emphasis*{: .special }
28/// ```
29///
30/// ## References
31///
32/// - [Python-Markdown attr_list](https://python-markdown.github.io/extensions/attr_list/)
33/// - [MkDocs Material - Anchor Links](https://squidfunk.github.io/mkdocs-material/reference/annotations/#anchor-links)
34use regex::Regex;
35use std::sync::LazyLock;
36
37/// Pattern to match attr_list syntax: `{: #id .class key="value" }`
38/// The `:` prefix is optional (kramdown style uses it, but attr_list accepts both)
39/// Requirements for valid attr_list:
40/// - Must start with `{` and optional `:` with optional whitespace
41/// - Must contain at least one of: #id, .class, or key="value"
42/// - Must end with `}`
43pub static ATTR_LIST_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
44    // Pattern requires at least one attribute (id, class, or key=value)
45    // to avoid matching plain text in braces like {word}
46    Regex::new(r#"\{:?\s*(?:(?:#[a-zA-Z0-9_][a-zA-Z0-9_-]*|\.[a-zA-Z_][a-zA-Z0-9_-]*|[a-zA-Z_][a-zA-Z0-9_-]*=["'][^"']*["'])\s*)+\}"#).unwrap()
47});
48
49/// Pattern to extract custom ID from attr_list: `#id`
50static CUSTOM_ID_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"#([a-zA-Z0-9_][a-zA-Z0-9_-]*)").unwrap());
51
52/// Pattern to extract classes from attr_list: `.class`
53static CLASS_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\.([a-zA-Z_][a-zA-Z0-9_-]*)").unwrap());
54
55/// Pattern to extract key-value pairs: `key="value"` or `key='value'`
56static KEY_VALUE_PATTERN: LazyLock<Regex> =
57    LazyLock::new(|| Regex::new(r#"([a-zA-Z_][a-zA-Z0-9_-]*)=["']([^"']*)["']"#).unwrap());
58
/// Parsed attribute list containing an ID, classes, and key-value pairs,
/// together with the span it occupies in the line it was found in.
///
/// All fields are plain owned data, so the type is `Eq` as well as `PartialEq`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct AttrList {
    /// Custom ID (e.g., `custom-id` from `{#custom-id}`)
    pub id: Option<String>,
    /// CSS classes (e.g., `["class1", "class2"]` from `{.class1 .class2}`)
    pub classes: Vec<String>,
    /// Key-value attributes (e.g., `[("target", "_blank")]`)
    pub attributes: Vec<(String, String)>,
    /// Start of the attr_list in the line: 0-indexed byte offset of the `{`
    pub start: usize,
    /// End of the attr_list in the line: 0-indexed byte offset, exclusive
    pub end: usize,
}

impl AttrList {
    /// Create a new empty `AttrList` (no ID, classes, or attributes; span `0..0`).
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns `true` if a custom ID is present.
    #[inline]
    pub fn has_id(&self) -> bool {
        self.id.is_some()
    }

    /// Returns `true` if at least one class is present.
    #[inline]
    pub fn has_classes(&self) -> bool {
        !self.classes.is_empty()
    }

    /// Returns `true` if at least one key-value attribute is present.
    #[inline]
    pub fn has_attributes(&self) -> bool {
        !self.attributes.is_empty()
    }

    /// Returns `true` if there is no ID, no class, and no key-value attribute.
    ///
    /// The `start`/`end` span is not considered: a value that only carries a
    /// position is still empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        !(self.has_id() || self.has_classes() || self.has_attributes())
    }
}
104
105/// Check if a line contains attr_list syntax
106#[inline]
107pub fn contains_attr_list(line: &str) -> bool {
108    // Fast path: check for opening brace first
109    if !line.contains('{') {
110        return false;
111    }
112    ATTR_LIST_PATTERN.is_match(line)
113}
114
115/// Check if a line is a standalone block attr_list (on its own line)
116/// This is used for block-level attributes like:
117/// ```markdown
118/// Paragraph text.
119/// { .class-name }
120/// ```
121/// or with colon:
122/// ```markdown
123/// Paragraph text.
124/// {: .class-name }
125/// ```
126#[inline]
127pub fn is_standalone_attr_list(line: &str) -> bool {
128    let trimmed = line.trim();
129    // Must start with { and end with }
130    if !trimmed.starts_with('{') || !trimmed.ends_with('}') {
131        return false;
132    }
133    // Must be a valid attr_list (not just random braces)
134    ATTR_LIST_PATTERN.is_match(trimmed)
135}
136
137/// Check if a line is a MkDocs anchor line (empty link with attr_list)
138///
139/// MkDocs anchor lines are used to create invisible anchor points in documentation.
140/// They consist of an empty link `[]()` followed by an attr_list containing an ID
141/// or class. These are rendered as `<a id="anchor"></a>` in the HTML output.
142///
143/// # Syntax
144///
145/// ```markdown
146/// [](){ #anchor-id }              <!-- Basic anchor -->
147/// [](){#anchor-id}                <!-- No spaces -->
148/// [](){ #id .class }              <!-- Anchor with class -->
149/// [](){: #id }                    <!-- Kramdown-style with colon -->
150/// [](){ .highlight }              <!-- Class-only (styling hook) -->
151/// ```
152///
153/// # Use Cases
154///
155/// 1. **Deep linking**: Create anchor points for linking to specific paragraphs
156/// 2. **Cross-references**: Target for mkdocs-autorefs links
157/// 3. **Styling hooks**: Apply CSS classes to following content
158///
159/// # Examples
160///
161/// ```
162/// use rumdl_lib::utils::mkdocs_attr_list::is_mkdocs_anchor_line;
163///
164/// // Valid anchor lines
165/// assert!(is_mkdocs_anchor_line("[](){ #example }"));
166/// assert!(is_mkdocs_anchor_line("[](){#example}"));
167/// assert!(is_mkdocs_anchor_line("[](){ #id .class }"));
168/// assert!(is_mkdocs_anchor_line("[](){: #anchor }"));
169///
170/// // NOT anchor lines
171/// assert!(!is_mkdocs_anchor_line("[link](url)"));           // Has URL
172/// assert!(!is_mkdocs_anchor_line("[](){ #id } text"));      // Has trailing content
173/// assert!(!is_mkdocs_anchor_line("[]()"));                  // No attr_list
174/// assert!(!is_mkdocs_anchor_line("[](){ }"));               // Empty attr_list
175/// ```
176///
177/// # References
178///
179/// - [Python-Markdown attr_list](https://python-markdown.github.io/extensions/attr_list/)
180/// - [MkDocs Material - Anchor Links](https://squidfunk.github.io/mkdocs-material/reference/annotations/#anchor-links)
181/// - [MkDocs discussions on paragraph anchors](https://github.com/mkdocs/mkdocs/discussions/3754)
182#[inline]
183pub fn is_mkdocs_anchor_line(line: &str) -> bool {
184    let trimmed = line.trim();
185
186    // Fast path: must contain the empty link pattern
187    if !trimmed.starts_with("[]()") {
188        return false;
189    }
190
191    // Extract the part after []()
192    let after_link = &trimmed[4..];
193
194    // Fast path: must contain opening brace for attr_list
195    if !after_link.contains('{') {
196        return false;
197    }
198
199    // Skip optional whitespace between []() and {
200    let attr_start = after_link.trim_start();
201
202    // Must start with { or {:
203    if !attr_start.starts_with('{') {
204        return false;
205    }
206
207    // Find the closing brace
208    let Some(close_idx) = attr_start.find('}') else {
209        return false;
210    };
211
212    // Nothing meaningful should follow the closing brace
213    if !attr_start[close_idx + 1..].trim().is_empty() {
214        return false;
215    }
216
217    // Extract and validate the attr_list content
218    let attr_content = &attr_start[..=close_idx];
219
220    // Use the existing attr_list validation - must be a valid attr_list
221    if !ATTR_LIST_PATTERN.is_match(attr_content) {
222        return false;
223    }
224
225    // Parse the attr_list to ensure it has meaningful content (ID or class)
226    let attrs = find_attr_lists(attr_content);
227    attrs.iter().any(|a| a.has_id() || a.has_classes())
228}
229
230/// Extract all attr_lists from a line
231pub fn find_attr_lists(line: &str) -> Vec<AttrList> {
232    if !line.contains('{') {
233        return Vec::new();
234    }
235
236    let mut results = Vec::new();
237
238    for m in ATTR_LIST_PATTERN.find_iter(line) {
239        let attr_text = m.as_str();
240        let mut attr_list = AttrList {
241            start: m.start(),
242            end: m.end(),
243            ..Default::default()
244        };
245
246        // Extract custom ID (first one wins per HTML spec)
247        if let Some(caps) = CUSTOM_ID_PATTERN.captures(attr_text)
248            && let Some(id_match) = caps.get(1)
249        {
250            attr_list.id = Some(id_match.as_str().to_string());
251        }
252
253        // Extract all classes
254        for caps in CLASS_PATTERN.captures_iter(attr_text) {
255            if let Some(class_match) = caps.get(1) {
256                attr_list.classes.push(class_match.as_str().to_string());
257            }
258        }
259
260        // Extract key-value pairs
261        for caps in KEY_VALUE_PATTERN.captures_iter(attr_text) {
262            if let Some(key) = caps.get(1)
263                && let Some(value) = caps.get(2)
264            {
265                attr_list
266                    .attributes
267                    .push((key.as_str().to_string(), value.as_str().to_string()));
268            }
269        }
270
271        if !attr_list.is_empty() {
272            results.push(attr_list);
273        }
274    }
275
276    results
277}
278
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `predicate` returns `expected` for every line in `lines`.
    fn assert_each(predicate: fn(&str) -> bool, expected: bool, lines: &[&str]) {
        for line in lines {
            assert_eq!(predicate(line), expected, "unexpected result for {line:?}");
        }
    }

    /// Parse `line`, require exactly one attr_list, and hand it back.
    fn sole_attr_list(line: &str) -> AttrList {
        let mut found = find_attr_lists(line);
        assert_eq!(found.len(), 1, "expected exactly one attr_list in {line:?}");
        found.remove(0)
    }

    #[test]
    fn test_contains_attr_list() {
        // Valid attr_list syntax
        assert_each(
            contains_attr_list,
            true,
            &[
                "# Heading {#custom-id}",
                "# Heading {.my-class}",
                "# Heading {#id .class}",
                "Text {: #id}",
                "Link {target=\"_blank\"}",
            ],
        );

        // Not attr_list
        assert_each(
            contains_attr_list,
            false,
            &["# Regular heading", "Code with {braces}", "Empty {}", "Just text"],
        );
    }

    #[test]
    fn test_find_attr_lists_basic() {
        let parsed = sole_attr_list("# Heading {#custom-id}");
        assert_eq!(parsed.id.as_deref(), Some("custom-id"));
        assert!(parsed.classes.is_empty());
    }

    #[test]
    fn test_find_attr_lists_with_class() {
        let parsed = sole_attr_list("# Heading {.highlight}");
        assert!(parsed.id.is_none());
        assert_eq!(parsed.classes, vec!["highlight"]);
    }

    #[test]
    fn test_find_attr_lists_complex() {
        let parsed = sole_attr_list("# Heading {#my-id .class1 .class2 data-value=\"test\"}");
        assert_eq!(parsed.id.as_deref(), Some("my-id"));
        assert_eq!(parsed.classes, vec!["class1", "class2"]);
        assert_eq!(
            parsed.attributes,
            vec![("data-value".to_string(), "test".to_string())]
        );
    }

    #[test]
    fn test_find_attr_lists_kramdown_style() {
        // With colon prefix (kramdown style)
        let parsed = sole_attr_list("Paragraph {: #para-id .special }");
        assert_eq!(parsed.id.as_deref(), Some("para-id"));
        assert_eq!(parsed.classes, vec!["special"]);
    }

    #[test]
    fn test_multiple_attr_lists_same_line() {
        let ids: Vec<Option<String>> = find_attr_lists("[link]{#link-id} and [other]{#other-id}")
            .into_iter()
            .map(|parsed| parsed.id)
            .collect();
        assert_eq!(
            ids,
            vec![Some("link-id".to_string()), Some("other-id".to_string())]
        );
    }

    #[test]
    fn test_attr_list_positions() {
        let line = "Text {#my-id} more";
        let parsed = sole_attr_list(line);
        assert_eq!((parsed.start, parsed.end), (5, 13));
        assert_eq!(&line[parsed.start..parsed.end], "{#my-id}");
    }

    #[test]
    fn test_underscore_in_identifiers() {
        let parsed = sole_attr_list("# Heading {#my_custom_id .my_class}");
        assert_eq!(parsed.id.as_deref(), Some("my_custom_id"));
        assert_eq!(parsed.classes, vec!["my_class"]);
    }

    /// Test for issue #337: Standalone attr_lists should be detected
    /// These should be treated as paragraph boundaries in reflow
    #[test]
    fn test_is_standalone_attr_list() {
        // Valid standalone attr_lists (on their own line)
        assert_each(
            is_standalone_attr_list,
            true,
            &[
                "{ .class-name }",
                "{: .class-name }",
                "{#custom-id}",
                "{: #custom-id .class }",
                "  { .indented }  ", // With whitespace
            ],
        );

        assert_each(
            is_standalone_attr_list,
            false,
            &[
                // Not standalone (part of other content)
                "Some text {#id}",
                "{#id} more text",
                "# Heading {#id}",
                // Not valid attr_lists (just braces)
                "{ }",
                "{}",
                "{ random text }",
                // Empty line
                "",
                "   ",
            ],
        );
    }

    /// Test for issue #365: MkDocs anchor lines should be detected
    /// Pattern: `[](){ #anchor }` creates invisible anchor points
    #[test]
    fn test_is_mkdocs_anchor_line_basic() {
        assert_each(
            is_mkdocs_anchor_line,
            true,
            &[
                // Valid anchor lines with ID
                "[](){ #example }",
                "[](){#example}",
                "[](){ #my-anchor }",
                "[](){ #anchor_with_underscore }",
                // Valid anchor lines with class
                "[](){ .highlight }",
                "[](){.my-class}",
                // Valid anchor lines with both ID and class
                "[](){ #anchor .class }",
                "[](){ .class #anchor }",
                "[](){ #id .class1 .class2 }",
            ],
        );
    }

    #[test]
    fn test_is_mkdocs_anchor_line_kramdown_style() {
        // Kramdown-style with colon prefix
        assert_each(
            is_mkdocs_anchor_line,
            true,
            &[
                "[](){: #anchor }",
                "[](){:#anchor}",
                "[](){: .class }",
                "[](){: #id .class }",
            ],
        );
    }

    #[test]
    fn test_is_mkdocs_anchor_line_whitespace_variations() {
        assert_each(
            is_mkdocs_anchor_line,
            true,
            &[
                // Leading/trailing whitespace on line
                "  [](){ #example }",
                "[](){ #example }  ",
                "  [](){ #example }  ",
                "\t[](){ #example }\t",
                // Whitespace between []() and {
                "[]()  { #example }",
                "[]()\t{ #example }",
                // No whitespace (compact form)
                "[](){#example}",
            ],
        );
    }

    #[test]
    fn test_is_mkdocs_anchor_line_not_anchor_lines() {
        assert_each(
            is_mkdocs_anchor_line,
            false,
            &[
                // Empty link without attr_list
                "[]()",
                // Empty attr_list (no ID or class)
                "[](){ }",
                "[](){}",
                // Regular link with URL
                "[](url)",
                "[text](url)",
                "[text](url){ #id }",
                // Trailing content after attr_list
                "[](){ #anchor } extra text",
                "[](){ #anchor } <!-- comment -->",
                // Leading content before link
                "text [](){ #anchor }",
                "# Heading [](){ #anchor }",
                // Not a link at all
                "# Heading",
                "Some paragraph text",
                "{ #standalone-attr }",
                // Malformed patterns
                "[]{#anchor}",   // Missing ()
                "[](#anchor)",   // ID in URL position
                "[](){ #anchor", // Unclosed brace
            ],
        );
    }

    #[test]
    fn test_is_mkdocs_anchor_line_edge_cases() {
        assert_each(
            is_mkdocs_anchor_line,
            false,
            &[
                // Empty line
                "",
                "   ",
                "\t",
                // Only braces
                "{}",
                "{ }",
            ],
        );

        assert_each(
            is_mkdocs_anchor_line,
            true,
            &[
                // Key-value attributes (valid in MkDocs but unusual for anchors)
                "[](){ #id data-value=\"test\" }",
                // Multiple IDs (first one wins per HTML spec, but pattern is valid)
                "[](){ #first #second }",
            ],
        );

        // Unicode in IDs is intentionally not asserted here: the outcome depends
        // on whether the regex patterns accept non-ASCII identifiers.
    }

    #[test]
    fn test_is_mkdocs_anchor_line_real_world_examples() {
        assert_each(
            is_mkdocs_anchor_line,
            true,
            &[
                // Examples from MkDocs Material documentation
                "[](){ #installation }",
                "[](){ #getting-started }",
                "[](){ #api-reference }",
                // Examples with styling classes
                "[](){ .annotate }",
                "[](){ #note .warning }",
            ],
        );
    }

    #[test]
    fn test_attr_list_pattern_digit_starting_ids() {
        assert_each(
            contains_attr_list,
            true,
            &[
                // HTML5 allows IDs starting with digits
                "{#3rd-party}",
                "{ #3rd-party }",
                "{#1}",
                "{#123-foo}",
                "{#1st-section}",
                "{#2nd_item}",
                // Digit-starting ID combined with class
                "{#3rd-party .glossary}",
                // Kramdown style with colon
                "{: #3rd-party}",
            ],
        );
    }

    #[test]
    fn test_custom_id_extraction_digit_starting() {
        // find_attr_lists should extract IDs starting with digits
        let cases = [
            ("{#3rd-party}", "3rd-party"),
            ("{#1}", "1"),
            ("{#123-foo}", "123-foo"),
            ("{#1st-section}", "1st-section"),
            ("{#2nd_item}", "2nd_item"),
        ];
        for (line, expected_id) in cases {
            let parsed = sole_attr_list(line);
            assert_eq!(parsed.id, Some(expected_id.to_string()));
        }
    }

    #[test]
    fn test_class_pattern_still_rejects_digit_starting() {
        // CSS class names starting with digits are invalid, should not match
        let found = find_attr_lists("{.3invalid}");
        assert_eq!(found.len(), 0, "Digit-starting class names should not be matched");
    }

    #[test]
    fn test_mkdocs_anchor_line_digit_starting_id() {
        // Anchor lines with digit-starting IDs
        assert_each(
            is_mkdocs_anchor_line,
            true,
            &["[](){ #3rd-party }", "[](){ #1 }", "[](){ #123-section }"],
        );
    }
}