rumdl_lib/utils/
header_id_utils.rs

1//! Utilities for extracting custom header IDs from various Markdown flavors
2//!
3//! This module supports multiple syntax formats for custom header IDs:
4//!
5//! ## Kramdown Format
6//! - `{#custom-id}` - Simple ID without colon
7//! - Example: `# Header {#my-id}`
8//!
9//! ## Python-markdown attr-list Format
10//! - `{:#custom-id}` - ID with colon, no spaces
11//! - `{: #custom-id}` - ID with colon and spaces
12//! - `{: #custom-id .class}` - ID with classes
13//! - `{: #custom-id .class data="value"}` - ID with full attributes
14//! - Example: `# Header {: #my-id .highlight}`
15//!
16//! ## Position Support
17//! - Inline: `# Header {#id}` (all formats)
18//! - Next-line: Jekyll/kramdown style where attr-list appears on the line after the header
19//!   ```markdown
20//!   # Header
21//!   {#next-line-id}
22//!   ```
23//!
24//! The module provides functions to detect and extract IDs from both inline
25//! and standalone (next-line) attr-list syntax.
26
27use lazy_static::lazy_static;
28use regex::Regex;
29
lazy_static! {
    /// Matches an attr-list containing a `#` at the end of the input, in kramdown or
    /// python-markdown attr-list form.
    ///
    /// Accepts `{#id}`, `{ #id }`, `{:#id}`, `{: #id }` and longer attr-lists such as
    /// `{: #id .class data="value" }`. The pattern has no start anchor, so other text
    /// may precede the opening brace; only whitespace may follow the closing brace.
    /// Whitespace directly before the opening brace is part of the overall match.
    ///
    /// Capture group 1 is the brace content after the optional colon. The pattern only
    /// requires a `#` somewhere in that content and does not check the characters that
    /// follow it; `extract_header_id` validates the ID with `ID_VALIDATE_PATTERN`.
    static ref HEADER_ID_PATTERN: Regex = Regex::new(r"\s*\{\s*:?\s*([^}]*?#[^}]*?)\s*\}\s*$").unwrap();

    /// Finds a `#` followed by one or more ID characters (ASCII letters, digits, `_`,
    /// `-`, `:`) and captures those characters in group 1.
    ///
    /// The run of ID characters must be followed by whitespace, the end of the input,
    /// or any non-ID character, so for `#id.more` this captures `id` rather than
    /// rejecting the dotted form.
    ///
    /// NOTE(review): no function visible in this module references this pattern -
    /// confirm whether it is still needed.
    static ref ID_EXTRACT_PATTERN: Regex = Regex::new(r"#([a-zA-Z0-9_\-:]+)(?:\s|$|[^a-zA-Z0-9_\-:])").unwrap();

    /// Matches a string that consists entirely of one or more ID characters:
    /// ASCII letters, digits, `_`, `-` and `:`. The empty string does not match.
    static ref ID_VALIDATE_PATTERN: Regex = Regex::new(r"^[a-zA-Z0-9_\-:]+$").unwrap();

    /// Matches a line that is nothing but an attr-list (Jekyll/kramdown style, placed
    /// on the line after a heading): `{#id}`, `{: #id .class }`, etc.
    ///
    /// Anchored at both ends, so only whitespace may surround the braces. The brace
    /// content must contain a `#` immediately followed by at least one ID character.
    /// Capture group 1 is the brace content after the optional colon.
    static ref STANDALONE_ATTR_LIST_PATTERN: Regex = Regex::new(r"^\s*\{\s*:?\s*([^}]*#[a-zA-Z0-9_\-:]+[^}]*)\s*\}\s*$").unwrap();
}
48
49/// Extract custom header ID from a line if present, returning clean text and ID
50///
51/// Supports multiple formats:
52/// - Kramdown: `{#id}`
53/// - Python-markdown: `{:#id}`, `{: #id}`, `{: #id .class}`
54///
55/// # Examples
56/// ```
57/// use rumdl_lib::utils::header_id_utils::extract_header_id;
58///
59/// // Kramdown format
60/// let (text, id) = extract_header_id("# Header {#custom-id}");
61/// assert_eq!(text, "# Header");
62/// assert_eq!(id, Some("custom-id".to_string()));
63///
64/// // Python-markdown attr-list format
65/// let (text, id) = extract_header_id("# Header {: #my-id .highlight}");
66/// assert_eq!(text, "# Header");
67/// assert_eq!(id, Some("my-id".to_string()));
68/// ```
69pub fn extract_header_id(line: &str) -> (String, Option<String>) {
70    if let Some(captures) = HEADER_ID_PATTERN.captures(line)
71        && let Some(full_match) = captures.get(0)
72        && let Some(attr_content) = captures.get(1)
73    {
74        let attr_str = attr_content.as_str().trim();
75
76        // First, find all potential ID matches in the attr-list
77        if let Some(hash_pos) = attr_str.find('#') {
78            // Extract everything after the hash
79            let after_hash = &attr_str[hash_pos + 1..];
80
81            // For simple cases like {#id}, the ID goes to the end
82            // For complex cases like {: #id .class}, we need to find where the ID ends
83
84            // First check if this looks like a simple kramdown ID: {#id} with no spaces or attributes
85            let is_simple_format = !attr_str.contains(' ') && !attr_str.contains('=') && attr_str.starts_with('#');
86
87            if is_simple_format {
88                // Simple format: entire content after # should be the ID
89                let potential_id = after_hash;
90                if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
91                    let clean_text = line[..full_match.start()].trim_end().to_string();
92                    return (clean_text, Some(potential_id.to_string()));
93                }
94                // If validation fails, reject the entire attr-list
95            } else {
96                // Complex format: find proper delimiters (space for next attribute, dot for class)
97                if let Some(delimiter_pos) = after_hash.find(|c: char| c.is_whitespace() || c == '.' || c == '=') {
98                    let potential_id = &after_hash[..delimiter_pos];
99                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
100                        let clean_text = line[..full_match.start()].trim_end().to_string();
101                        return (clean_text, Some(potential_id.to_string()));
102                    }
103                } else {
104                    // No delimiter found in complex format, ID goes to end
105                    let potential_id = after_hash;
106                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
107                        let clean_text = line[..full_match.start()].trim_end().to_string();
108                        return (clean_text, Some(potential_id.to_string()));
109                    }
110                }
111            }
112        }
113    }
114    (line.to_string(), None)
115}
116
117/// Check if a line is a standalone attr-list (Jekyll/kramdown style)
118///
119/// This detects attr-list syntax that appears on its own line, typically
120/// the line after a header to provide additional attributes.
121///
122/// # Examples
123/// ```
124/// use rumdl_lib::utils::header_id_utils::is_standalone_attr_list;
125///
126/// assert!(is_standalone_attr_list("{#custom-id}"));
127/// assert!(is_standalone_attr_list("{: #spaced .class }"));
128/// assert!(!is_standalone_attr_list("Some text {#not-standalone}"));
129/// assert!(!is_standalone_attr_list(""));
130/// ```
131pub fn is_standalone_attr_list(line: &str) -> bool {
132    STANDALONE_ATTR_LIST_PATTERN.is_match(line)
133}
134
135/// Extract ID from a standalone attr-list line
136///
137/// Returns the ID if the line is a valid standalone attr-list with an ID.
138///
139/// # Examples
140/// ```
141/// use rumdl_lib::utils::header_id_utils::extract_standalone_attr_list_id;
142///
143/// assert_eq!(extract_standalone_attr_list_id("{#custom-id}"), Some("custom-id".to_string()));
144/// assert_eq!(extract_standalone_attr_list_id("{: #spaced .class }"), Some("spaced".to_string()));
145/// assert_eq!(extract_standalone_attr_list_id("not an attr-list"), None);
146/// ```
147pub fn extract_standalone_attr_list_id(line: &str) -> Option<String> {
148    if let Some(captures) = STANDALONE_ATTR_LIST_PATTERN.captures(line)
149        && let Some(attr_content) = captures.get(1)
150    {
151        let attr_str = attr_content.as_str().trim();
152
153        // Use the same logic as extract_header_id for consistency
154        if let Some(hash_pos) = attr_str.find('#') {
155            let after_hash = &attr_str[hash_pos + 1..];
156
157            // Check if this looks like a simple kramdown ID: {#id} with no spaces or attributes
158            let is_simple_format = !attr_str.contains(' ') && !attr_str.contains('=') && attr_str.starts_with('#');
159
160            if is_simple_format {
161                // Simple format: entire content after # should be the ID
162                let potential_id = after_hash;
163                if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
164                    return Some(potential_id.to_string());
165                }
166            } else {
167                // Complex format: find proper delimiters (space for next attribute, dot for class)
168                if let Some(delimiter_pos) = after_hash.find(|c: char| c.is_whitespace() || c == '.' || c == '=') {
169                    let potential_id = &after_hash[..delimiter_pos];
170                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
171                        return Some(potential_id.to_string());
172                    }
173                } else {
174                    // No delimiter found in complex format, ID goes to end
175                    let potential_id = after_hash;
176                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
177                        return Some(potential_id.to_string());
178                    }
179                }
180            }
181        }
182    }
183    None
184}
185
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `extract_header_id` on each `(input, expected text, expected ID)` row
    /// and checks both halves of the result.
    fn assert_extracts(rows: &[(&str, &str, Option<&str>)]) {
        for &(input, expected_text, expected_id) in rows {
            let (text, id) = extract_header_id(input);
            assert_eq!(text, expected_text, "Text mismatch for input: {input}");
            assert_eq!(id.as_deref(), expected_id, "ID mismatch for input: {input}");
        }
    }

    /// Checks that `extract_header_id` returns each input unchanged with no ID.
    fn assert_left_untouched(inputs: &[&str]) {
        for &input in inputs {
            assert_extracts(&[(input, input, None)]);
        }
    }

    #[test]
    fn test_kramdown_format_extraction() {
        // Plain kramdown `{#id}` suffixes
        assert_extracts(&[
            ("# Header {#simple}", "# Header", Some("simple")),
            ("## Section {#section-id}", "## Section", Some("section-id")),
        ]);
    }

    #[test]
    fn test_python_markdown_attr_list_extraction() {
        // Colon-prefixed forms, with and without inner spacing
        assert_extracts(&[
            ("# Header {:#colon-id}", "# Header", Some("colon-id")),
            ("# Header {: #spaced-id }", "# Header", Some("spaced-id")),
        ]);
    }

    #[test]
    fn test_extended_attr_list_extraction() {
        assert_extracts(&[
            // One class after the ID
            ("# Header {: #with-class .highlight }", "# Header", Some("with-class")),
            // Several classes after the ID
            ("## Section {: #multi .class1 .class2 }", "## Section", Some("multi")),
            // Key-value attributes after the ID
            (
                "### Subsection {: #with-attrs data-test=\"value\" style=\"color: red\" }",
                "### Subsection",
                Some("with-attrs"),
            ),
            // Class and key-value attributes together
            (
                "#### Complex {: #complex .highlight data-role=\"button\" title=\"Test\" }",
                "#### Complex",
                Some("complex"),
            ),
            // Escaped quotes inside an attribute value
            (
                "##### Quotes {: #quotes title=\"Has \\\"nested\\\" quotes\" }",
                "##### Quotes",
                Some("quotes"),
            ),
        ]);
    }

    #[test]
    fn test_attr_list_detection_edge_cases() {
        assert_left_untouched(&[
            // Attr-list that carries no ID
            "# Header {: .class-only }",
            // Braces without any hash
            "# Header { no-hash }",
            // Hash with nothing after it
            "# Header {: # }",
            // Attr-list that is not at the end of the line
            "# Header {: #middle } with more text",
        ]);
    }

    #[test]
    fn test_standalone_attr_list_detection() {
        let accepted = [
            // Bare ID in each supported spelling
            "{#custom-id}",
            "{ #spaced-id }",
            "{:#colon-id}",
            "{: #full-format }",
            // ID followed by classes and attributes
            "{: #with-class .highlight }",
            "{: #multi .class1 .class2 }",
            "{: #complex .highlight data-test=\"value\" }",
        ];
        for line in accepted {
            assert!(is_standalone_attr_list(line), "expected a standalone attr-list: {line:?}");
        }

        let rejected = [
            "Some text {#not-standalone}",
            "Text before {#id}",
            "{#id} text after",
            "",
            "   ",              // just spaces
            "{: .class-only }", // no ID
        ];
        for line in rejected {
            assert!(!is_standalone_attr_list(line), "expected no standalone attr-list: {line:?}");
        }
    }

    #[test]
    fn test_standalone_attr_list_id_extraction() {
        let rows: [(&str, Option<&str>); 10] = [
            // Bare ID in each supported spelling
            ("{#simple}", Some("simple")),
            ("{ #spaced }", Some("spaced")),
            ("{:#colon}", Some("colon")),
            ("{: #full }", Some("full")),
            // ID followed by further attributes
            ("{: #with-class .highlight }", Some("with-class")),
            ("{: #complex .class1 .class2 data=\"value\" }", Some("complex")),
            // Lines that yield no ID
            ("Not an attr-list", None),
            ("Text {#not-standalone}", None),
            ("{: .class-only }", None),
            ("", None),
        ];

        for (line, expected) in rows {
            let found = extract_standalone_attr_list_id(line);
            assert_eq!(found.as_deref(), expected, "ID mismatch for input: {line}");
        }
    }

    #[test]
    fn test_backward_compatibility() {
        // The original kramdown spellings must keep working
        assert_extracts(&[
            ("# Header {#a}", "# Header", Some("a")),
            ("# Header {#simple-id}", "# Header", Some("simple-id")),
            ("## Heading {#heading-2}", "## Heading", Some("heading-2")),
            ("### With-Hyphens {#with-hyphens}", "### With-Hyphens", Some("with-hyphens")),
        ]);
    }

    #[test]
    fn test_invalid_id_with_dots() {
        // Dots are not valid ID characters, so a dotted simple-form ID is rejected
        // outright: nothing is stripped from the line and no partial ID is returned.
        assert_left_untouched(&[
            "## Another. {#id.with.dots}",
            "## Another. {#id.more.dots}",
        ]);
    }
}