rumdl_lib/utils/
header_id_utils.rs

1//! Utilities for extracting custom header IDs from various Markdown flavors
2//!
3//! This module supports multiple syntax formats for custom header IDs:
4//!
5//! ## Kramdown Format
6//! - `{#custom-id}` - Simple ID without colon
7//! - Example: `# Header {#my-id}`
8//!
9//! ## Python-markdown attr-list Format
10//! - `{:#custom-id}` - ID with colon, no spaces
11//! - `{: #custom-id}` - ID with colon and spaces
12//! - `{: #custom-id .class}` - ID with classes
13//! - `{: #custom-id .class data="value"}` - ID with full attributes
14//! - Example: `# Header {: #my-id .highlight}`
15//!
16//! ## Position Support
17//! - Inline: `# Header {#id}` (all formats)
18//! - Next-line: Jekyll/kramdown style where attr-list appears on the line after the header
19//!   ```markdown
20//!   # Header
21//!   {#next-line-id}
22//!   ```
23//!
24//! The module provides functions to detect and extract IDs from both inline
25//! and standalone (next-line) attr-list syntax.
26
27use regex::Regex;
28use std::sync::LazyLock;
29
30/// Pattern for custom header IDs supporting both kramdown and python-markdown attr-list formats
31/// Supports: {#id}, { #id }, {:#id}, {: #id } and full attr-list with classes/attributes
32/// Must contain #id but can have other attributes: {: #id .class data="value" }
33/// More conservative: only matches when there's actually a hash followed by valid ID characters
34static HEADER_ID_PATTERN: LazyLock<Regex> =
35    LazyLock::new(|| Regex::new(r"\s*\{\s*:?\s*([^}]*?#[^}]*?)\s*\}\s*$").unwrap());
36
37/// Pattern to validate that an ID contains only valid characters
38static ID_VALIDATE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[a-zA-Z0-9_\-:]+$").unwrap());
39
40/// Pattern for standalone attr-list lines (Jekyll/kramdown style on line after heading)
41/// Matches lines that are just attr-list syntax: {#id}, {: #id .class }, etc.
42static STANDALONE_ATTR_LIST_PATTERN: LazyLock<Regex> =
43    LazyLock::new(|| Regex::new(r"^\s*\{\s*:?\s*([^}]*#[a-zA-Z0-9_\-:]+[^}]*)\s*\}\s*$").unwrap());
44
45/// Extract custom header ID from a line if present, returning clean text and ID
46///
47/// Supports multiple formats:
48/// - Kramdown: `{#id}`
49/// - Python-markdown: `{:#id}`, `{: #id}`, `{: #id .class}`
50///
51/// # Examples
52/// ```
53/// use rumdl_lib::utils::header_id_utils::extract_header_id;
54///
55/// // Kramdown format
56/// let (text, id) = extract_header_id("# Header {#custom-id}");
57/// assert_eq!(text, "# Header");
58/// assert_eq!(id, Some("custom-id".to_string()));
59///
60/// // Python-markdown attr-list format
61/// let (text, id) = extract_header_id("# Header {: #my-id .highlight}");
62/// assert_eq!(text, "# Header");
63/// assert_eq!(id, Some("my-id".to_string()));
64/// ```
65pub fn extract_header_id(line: &str) -> (String, Option<String>) {
66    if let Some(captures) = HEADER_ID_PATTERN.captures(line)
67        && let Some(full_match) = captures.get(0)
68        && let Some(attr_content) = captures.get(1)
69    {
70        let attr_str = attr_content.as_str().trim();
71
72        // First, find all potential ID matches in the attr-list
73        if let Some(hash_pos) = attr_str.find('#') {
74            // Extract everything after the hash
75            let after_hash = &attr_str[hash_pos + 1..];
76
77            // For simple cases like {#id}, the ID goes to the end
78            // For complex cases like {: #id .class}, we need to find where the ID ends
79
80            // First check if this looks like a simple kramdown ID: {#id} with no spaces or attributes
81            let is_simple_format = !attr_str.contains(' ') && !attr_str.contains('=') && attr_str.starts_with('#');
82
83            if is_simple_format {
84                // Simple format: entire content after # should be the ID
85                let potential_id = after_hash;
86                if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
87                    let clean_text = line[..full_match.start()].trim_end().to_string();
88                    return (clean_text, Some(potential_id.to_string()));
89                }
90                // If validation fails, reject the entire attr-list
91            } else {
92                // Complex format: find proper delimiters (space for next attribute, dot for class)
93                if let Some(delimiter_pos) = after_hash.find(|c: char| c.is_whitespace() || c == '.' || c == '=') {
94                    let potential_id = &after_hash[..delimiter_pos];
95                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
96                        let clean_text = line[..full_match.start()].trim_end().to_string();
97                        return (clean_text, Some(potential_id.to_string()));
98                    }
99                } else {
100                    // No delimiter found in complex format, ID goes to end
101                    let potential_id = after_hash;
102                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
103                        let clean_text = line[..full_match.start()].trim_end().to_string();
104                        return (clean_text, Some(potential_id.to_string()));
105                    }
106                }
107            }
108        }
109    }
110    (line.to_string(), None)
111}
112
113/// Check if a line is a standalone attr-list (Jekyll/kramdown style)
114///
115/// This detects attr-list syntax that appears on its own line, typically
116/// the line after a header to provide additional attributes.
117///
118/// # Examples
119/// ```
120/// use rumdl_lib::utils::header_id_utils::is_standalone_attr_list;
121///
122/// assert!(is_standalone_attr_list("{#custom-id}"));
123/// assert!(is_standalone_attr_list("{: #spaced .class }"));
124/// assert!(!is_standalone_attr_list("Some text {#not-standalone}"));
125/// assert!(!is_standalone_attr_list(""));
126/// ```
127pub fn is_standalone_attr_list(line: &str) -> bool {
128    STANDALONE_ATTR_LIST_PATTERN.is_match(line)
129}
130
131/// Extract ID from a standalone attr-list line
132///
133/// Returns the ID if the line is a valid standalone attr-list with an ID.
134///
135/// # Examples
136/// ```
137/// use rumdl_lib::utils::header_id_utils::extract_standalone_attr_list_id;
138///
139/// assert_eq!(extract_standalone_attr_list_id("{#custom-id}"), Some("custom-id".to_string()));
140/// assert_eq!(extract_standalone_attr_list_id("{: #spaced .class }"), Some("spaced".to_string()));
141/// assert_eq!(extract_standalone_attr_list_id("not an attr-list"), None);
142/// ```
143pub fn extract_standalone_attr_list_id(line: &str) -> Option<String> {
144    if let Some(captures) = STANDALONE_ATTR_LIST_PATTERN.captures(line)
145        && let Some(attr_content) = captures.get(1)
146    {
147        let attr_str = attr_content.as_str().trim();
148
149        // Use the same logic as extract_header_id for consistency
150        if let Some(hash_pos) = attr_str.find('#') {
151            let after_hash = &attr_str[hash_pos + 1..];
152
153            // Check if this looks like a simple kramdown ID: {#id} with no spaces or attributes
154            let is_simple_format = !attr_str.contains(' ') && !attr_str.contains('=') && attr_str.starts_with('#');
155
156            if is_simple_format {
157                // Simple format: entire content after # should be the ID
158                let potential_id = after_hash;
159                if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
160                    return Some(potential_id.to_string());
161                }
162            } else {
163                // Complex format: find proper delimiters (space for next attribute, dot for class)
164                if let Some(delimiter_pos) = after_hash.find(|c: char| c.is_whitespace() || c == '.' || c == '=') {
165                    let potential_id = &after_hash[..delimiter_pos];
166                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
167                        return Some(potential_id.to_string());
168                    }
169                } else {
170                    // No delimiter found in complex format, ID goes to end
171                    let potential_id = after_hash;
172                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
173                        return Some(potential_id.to_string());
174                    }
175                }
176            }
177        }
178    }
179    None
180}
181
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `extract_header_id(input)` returns exactly `expected_text` and `expected_id`.
    fn assert_extraction(input: &str, expected_text: &str, expected_id: Option<&str>) {
        let (text, id) = extract_header_id(input);
        assert_eq!(text, expected_text, "Text mismatch for input: {input}");
        assert_eq!(id.as_deref(), expected_id, "ID mismatch for input: {input}");
    }

    /// Assert that `input` is returned untouched and without an ID.
    fn assert_not_extracted(input: &str) {
        assert_extraction(input, input, None);
    }

    #[test]
    fn test_kramdown_format_extraction() {
        // Simple kramdown format
        assert_extraction("# Header {#simple}", "# Header", Some("simple"));
        assert_extraction("## Section {#section-id}", "## Section", Some("section-id"));
    }

    #[test]
    fn test_python_markdown_attr_list_extraction() {
        // Python-markdown formats
        assert_extraction("# Header {:#colon-id}", "# Header", Some("colon-id"));
        assert_extraction("# Header {: #spaced-id }", "# Header", Some("spaced-id"));
    }

    #[test]
    fn test_extended_attr_list_extraction() {
        let cases = [
            // ID with single class
            ("# Header {: #with-class .highlight }", "# Header", "with-class"),
            // ID with multiple classes
            ("## Section {: #multi .class1 .class2 }", "## Section", "multi"),
            // ID with key-value attributes
            (
                "### Subsection {: #with-attrs data-test=\"value\" style=\"color: red\" }",
                "### Subsection",
                "with-attrs",
            ),
            // Complex combination
            (
                "#### Complex {: #complex .highlight data-role=\"button\" title=\"Test\" }",
                "#### Complex",
                "complex",
            ),
            // ID with quotes in attributes
            (
                "##### Quotes {: #quotes title=\"Has \\\"nested\\\" quotes\" }",
                "##### Quotes",
                "quotes",
            ),
        ];

        for (input, expected_text, expected_id) in cases {
            assert_extraction(input, expected_text, Some(expected_id));
        }
    }

    #[test]
    fn test_attr_list_detection_edge_cases() {
        let rejected = [
            // Attr-list without ID should not match
            "# Header {: .class-only }",
            // Malformed attr-list should not match
            "# Header { no-hash }",
            // Empty ID should not match
            "# Header {: # }",
            // ID in middle (not at end) should not match
            "# Header {: #middle } with more text",
        ];

        for input in rejected {
            assert_not_extracted(input);
        }
    }

    #[test]
    fn test_standalone_attr_list_detection() {
        let standalone = [
            // Simple ID formats
            "{#custom-id}",
            "{ #spaced-id }",
            "{:#colon-id}",
            "{: #full-format }",
            // With classes and attributes
            "{: #with-class .highlight }",
            "{: #multi .class1 .class2 }",
            "{: #complex .highlight data-test=\"value\" }",
        ];
        for line in standalone {
            assert!(is_standalone_attr_list(line), "expected a standalone attr-list: {line:?}");
        }

        // Should not match
        let not_standalone = [
            "Some text {#not-standalone}",
            "Text before {#id}",
            "{#id} text after",
            "",
            "   ",              // just spaces
            "{: .class-only }", // no ID
        ];
        for line in not_standalone {
            assert!(!is_standalone_attr_list(line), "expected no standalone attr-list: {line:?}");
        }
    }

    #[test]
    fn test_standalone_attr_list_id_extraction() {
        let cases = [
            // Basic formats
            ("{#simple}", Some("simple")),
            ("{ #spaced }", Some("spaced")),
            ("{:#colon}", Some("colon")),
            ("{: #full }", Some("full")),
            // With additional attributes
            ("{: #with-class .highlight }", Some("with-class")),
            ("{: #complex .class1 .class2 data=\"value\" }", Some("complex")),
            // Should return None
            ("Not an attr-list", None),
            ("Text {#not-standalone}", None),
            ("{: .class-only }", None),
            ("", None),
        ];

        for (line, expected_id) in cases {
            assert_eq!(
                extract_standalone_attr_list_id(line).as_deref(),
                expected_id,
                "ID mismatch for line: {line:?}"
            );
        }
    }

    #[test]
    fn test_backward_compatibility() {
        // Ensure all original kramdown formats still work
        let cases = [
            ("# Header {#a}", "# Header", "a"),
            ("# Header {#simple-id}", "# Header", "simple-id"),
            ("## Heading {#heading-2}", "## Heading", "heading-2"),
            ("### With-Hyphens {#with-hyphens}", "### With-Hyphens", "with-hyphens"),
        ];

        for (input, expected_text, expected_id) in cases {
            assert_extraction(input, expected_text, Some(expected_id));
        }
    }

    #[test]
    fn test_invalid_id_with_dots() {
        // Dots are not valid ID characters, so in the simple `{#...}` form the whole
        // attr-list is rejected: nothing is stripped from the line and no ID is returned,
        // not even the part before the first dot.
        assert_not_extracted("## Another. {#id.with.dots}");
        assert_not_extracted("## Another. {#id.more.dots}");
    }
}
345}