Skip to main content

rumdl_lib/utils/
header_id_utils.rs

1//! Utilities for extracting custom header IDs from various Markdown flavors
2//!
3//! This module supports multiple syntax formats for custom header IDs:
4//!
5//! ## Kramdown Format
6//! - `{#custom-id}` - Simple ID without colon
7//! - Example: `# Header {#my-id}`
8//!
9//! ## Python-markdown attr-list Format
10//! - `{:#custom-id}` - ID with colon, no spaces
11//! - `{: #custom-id}` - ID with colon and spaces
12//! - `{: #custom-id .class}` - ID with classes
13//! - `{: #custom-id .class data="value"}` - ID with full attributes
14//! - Example: `# Header {: #my-id .highlight}`
15//!
16//! ## Position Support
17//! - Inline: `# Header {#id}` (all formats)
18//! - Next-line: Jekyll/kramdown style where attr-list appears on the line after the header
19//!   ```markdown
20//!   # Header
21//!   {#next-line-id}
22//!   ```
23//!
24//! The module provides functions to detect and extract IDs from both inline
25//! and standalone (next-line) attr-list syntax.
26
27use regex::Regex;
28use std::sync::LazyLock;
29
30/// Pattern for HTML anchor elements used for custom anchors in headings
31/// Matches: `<a name="..."></a>`, `<a id="..."></a>`, `<a name="..." id="..."></a>`
32/// These are commonly used by some authors to create custom anchors for headings
33static HTML_ANCHOR_ELEMENT: LazyLock<Regex> =
34    LazyLock::new(|| Regex::new(r#"<a\s+(?:name|id)="[^"]*"(?:\s+(?:name|id)="[^"]*")?>\s*</a>\s*"#).unwrap());
35
36/// Pattern for custom header IDs supporting both kramdown and python-markdown attr-list formats
37/// Supports: {#id}, { #id }, {:#id}, {: #id } and full attr-list with classes/attributes
38/// Must contain #id but can have other attributes: {: #id .class data="value" }
39/// More conservative: only matches when there's actually a hash followed by valid ID characters
40static HEADER_ID_PATTERN: LazyLock<Regex> =
41    LazyLock::new(|| Regex::new(r"\s*\{\s*:?\s*([^}]*?#[^}]*?)\s*\}\s*$").unwrap());
42
43/// Pattern to validate that an ID contains only valid characters
44static ID_VALIDATE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[a-zA-Z0-9_\-:]+$").unwrap());
45
46/// Pattern for standalone attr-list lines (Jekyll/kramdown style on line after heading)
47/// Matches lines that are just attr-list syntax: {#id}, {: #id .class }, etc.
48static STANDALONE_ATTR_LIST_PATTERN: LazyLock<Regex> =
49    LazyLock::new(|| Regex::new(r"^\s*\{\s*:?\s*([^}]*#[a-zA-Z0-9_\-:]+[^}]*)\s*\}\s*$").unwrap());
50
51/// Extract custom header ID from a line if present, returning clean text and ID
52///
53/// Supports multiple formats:
54/// - Kramdown: `{#id}`
55/// - Python-markdown: `{:#id}`, `{: #id}`, `{: #id .class}`
56///
57/// # Examples
58/// ```
59/// use rumdl_lib::utils::header_id_utils::extract_header_id;
60///
61/// // Kramdown format
62/// let (text, id) = extract_header_id("# Header {#custom-id}");
63/// assert_eq!(text, "# Header");
64/// assert_eq!(id, Some("custom-id".to_string()));
65///
66/// // Python-markdown attr-list format
67/// let (text, id) = extract_header_id("# Header {: #my-id .highlight}");
68/// assert_eq!(text, "# Header");
69/// assert_eq!(id, Some("my-id".to_string()));
70/// ```
71pub fn extract_header_id(line: &str) -> (String, Option<String>) {
72    // First, strip HTML anchor elements (e.g., <a name="..."></a>) from the line
73    // These are used by some authors for custom anchors: `## <a name="foo"></a>Heading`
74    let line = HTML_ANCHOR_ELEMENT.replace_all(line, "");
75    let line = line.as_ref();
76
77    if let Some(captures) = HEADER_ID_PATTERN.captures(line)
78        && let Some(full_match) = captures.get(0)
79        && let Some(attr_content) = captures.get(1)
80    {
81        let attr_str = attr_content.as_str().trim();
82
83        // First, find all potential ID matches in the attr-list
84        if let Some(hash_pos) = attr_str.find('#') {
85            // Extract everything after the hash
86            let after_hash = &attr_str[hash_pos + 1..];
87
88            // For simple cases like {#id}, the ID goes to the end
89            // For complex cases like {: #id .class}, we need to find where the ID ends
90
91            // First check if this looks like a simple kramdown ID: {#id} with no spaces or attributes
92            let is_simple_format = !attr_str.contains(' ') && !attr_str.contains('=') && attr_str.starts_with('#');
93
94            if is_simple_format {
95                // Simple format: entire content after # should be the ID
96                let potential_id = after_hash;
97                if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
98                    let clean_text = line[..full_match.start()].trim_end().to_string();
99                    return (clean_text, Some(potential_id.to_string()));
100                }
101                // If validation fails, reject the entire attr-list
102            } else {
103                // Complex format: find proper delimiters (space for next attribute, dot for class)
104                if let Some(delimiter_pos) = after_hash.find(|c: char| c.is_whitespace() || c == '.' || c == '=') {
105                    let potential_id = &after_hash[..delimiter_pos];
106                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
107                        let clean_text = line[..full_match.start()].trim_end().to_string();
108                        return (clean_text, Some(potential_id.to_string()));
109                    }
110                } else {
111                    // No delimiter found in complex format, ID goes to end
112                    let potential_id = after_hash;
113                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
114                        let clean_text = line[..full_match.start()].trim_end().to_string();
115                        return (clean_text, Some(potential_id.to_string()));
116                    }
117                }
118            }
119        }
120    }
121    (line.to_string(), None)
122}
123
124/// Check if a line is a standalone attr-list (Jekyll/kramdown style)
125///
126/// This detects attr-list syntax that appears on its own line, typically
127/// the line after a header to provide additional attributes.
128///
129/// # Examples
130/// ```
131/// use rumdl_lib::utils::header_id_utils::is_standalone_attr_list;
132///
133/// assert!(is_standalone_attr_list("{#custom-id}"));
134/// assert!(is_standalone_attr_list("{: #spaced .class }"));
135/// assert!(!is_standalone_attr_list("Some text {#not-standalone}"));
136/// assert!(!is_standalone_attr_list(""));
137/// ```
138pub fn is_standalone_attr_list(line: &str) -> bool {
139    STANDALONE_ATTR_LIST_PATTERN.is_match(line)
140}
141
142/// Extract ID from a standalone attr-list line
143///
144/// Returns the ID if the line is a valid standalone attr-list with an ID.
145///
146/// # Examples
147/// ```
148/// use rumdl_lib::utils::header_id_utils::extract_standalone_attr_list_id;
149///
150/// assert_eq!(extract_standalone_attr_list_id("{#custom-id}"), Some("custom-id".to_string()));
151/// assert_eq!(extract_standalone_attr_list_id("{: #spaced .class }"), Some("spaced".to_string()));
152/// assert_eq!(extract_standalone_attr_list_id("not an attr-list"), None);
153/// ```
154pub fn extract_standalone_attr_list_id(line: &str) -> Option<String> {
155    if let Some(captures) = STANDALONE_ATTR_LIST_PATTERN.captures(line)
156        && let Some(attr_content) = captures.get(1)
157    {
158        let attr_str = attr_content.as_str().trim();
159
160        // Use the same logic as extract_header_id for consistency
161        if let Some(hash_pos) = attr_str.find('#') {
162            let after_hash = &attr_str[hash_pos + 1..];
163
164            // Check if this looks like a simple kramdown ID: {#id} with no spaces or attributes
165            let is_simple_format = !attr_str.contains(' ') && !attr_str.contains('=') && attr_str.starts_with('#');
166
167            if is_simple_format {
168                // Simple format: entire content after # should be the ID
169                let potential_id = after_hash;
170                if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
171                    return Some(potential_id.to_string());
172                }
173            } else {
174                // Complex format: find proper delimiters (space for next attribute, dot for class)
175                if let Some(delimiter_pos) = after_hash.find(|c: char| c.is_whitespace() || c == '.' || c == '=') {
176                    let potential_id = &after_hash[..delimiter_pos];
177                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
178                        return Some(potential_id.to_string());
179                    }
180                } else {
181                    // No delimiter found in complex format, ID goes to end
182                    let potential_id = after_hash;
183                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
184                        return Some(potential_id.to_string());
185                    }
186                }
187            }
188        }
189    }
190    None
191}
192
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `extract_header_id(input)` returns exactly `(expected_text, expected_id)`.
    /// `#[track_caller]` makes a failure point at the calling line rather than this helper.
    #[track_caller]
    fn check_header(input: &str, expected_text: &str, expected_id: Option<&str>) {
        let (text, id) = extract_header_id(input);
        assert_eq!(text, expected_text);
        assert_eq!(id, expected_id.map(str::to_string));
    }

    /// Assert that `extract_header_id` finds no ID and hands the input back untouched.
    #[track_caller]
    fn check_header_untouched(input: &str) {
        check_header(input, input, None);
    }

    /// Assert the result of `extract_standalone_attr_list_id(input)`.
    #[track_caller]
    fn check_standalone_id(input: &str, expected_id: Option<&str>) {
        assert_eq!(extract_standalone_attr_list_id(input), expected_id.map(str::to_string));
    }

    #[test]
    fn test_kramdown_format_extraction() {
        // Plain kramdown `{#id}` suffixes
        check_header("# Header {#simple}", "# Header", Some("simple"));
        check_header("## Section {#section-id}", "## Section", Some("section-id"));
    }

    #[test]
    fn test_python_markdown_attr_list_extraction() {
        // Colon-prefixed python-markdown spellings, with and without padding
        check_header("# Header {:#colon-id}", "# Header", Some("colon-id"));
        check_header("# Header {: #spaced-id }", "# Header", Some("spaced-id"));
    }

    #[test]
    fn test_extended_attr_list_extraction() {
        // One class after the ID
        check_header("# Header {: #with-class .highlight }", "# Header", Some("with-class"));

        // Several classes after the ID
        check_header("## Section {: #multi .class1 .class2 }", "## Section", Some("multi"));

        // Key-value attributes after the ID
        check_header(
            "### Subsection {: #with-attrs data-test=\"value\" style=\"color: red\" }",
            "### Subsection",
            Some("with-attrs"),
        );

        // Classes and key-value attributes together
        check_header(
            "#### Complex {: #complex .highlight data-role=\"button\" title=\"Test\" }",
            "#### Complex",
            Some("complex"),
        );

        // Escaped quotes inside an attribute value
        check_header(
            "##### Quotes {: #quotes title=\"Has \\\"nested\\\" quotes\" }",
            "##### Quotes",
            Some("quotes"),
        );
    }

    #[test]
    fn test_attr_list_detection_edge_cases() {
        // No ID in the attr-list
        check_header_untouched("# Header {: .class-only }");

        // Braces without any hash
        check_header_untouched("# Header { no-hash }");

        // Hash with nothing after it
        check_header_untouched("# Header {: # }");

        // Attr-list that is not at the end of the line
        check_header_untouched("# Header {: #middle } with more text");
    }

    #[test]
    fn test_standalone_attr_list_detection() {
        let standalone = [
            // Simple ID formats
            "{#custom-id}",
            "{ #spaced-id }",
            "{:#colon-id}",
            "{: #full-format }",
            // With classes and attributes
            "{: #with-class .highlight }",
            "{: #multi .class1 .class2 }",
            "{: #complex .highlight data-test=\"value\" }",
        ];
        for line in standalone {
            assert!(is_standalone_attr_list(line), "expected standalone attr-list: {line:?}");
        }

        let not_standalone = [
            "Some text {#not-standalone}",
            "Text before {#id}",
            "{#id} text after",
            "",
            "   ",              // just spaces
            "{: .class-only }", // no ID
        ];
        for line in not_standalone {
            assert!(!is_standalone_attr_list(line), "unexpected standalone attr-list: {line:?}");
        }
    }

    #[test]
    fn test_standalone_attr_list_id_extraction() {
        // Basic formats
        check_standalone_id("{#simple}", Some("simple"));
        check_standalone_id("{ #spaced }", Some("spaced"));
        check_standalone_id("{:#colon}", Some("colon"));
        check_standalone_id("{: #full }", Some("full"));

        // ID followed by further attributes
        check_standalone_id("{: #with-class .highlight }", Some("with-class"));
        check_standalone_id("{: #complex .class1 .class2 data=\"value\" }", Some("complex"));

        // Lines that yield no ID
        check_standalone_id("Not an attr-list", None);
        check_standalone_id("Text {#not-standalone}", None);
        check_standalone_id("{: .class-only }", None);
        check_standalone_id("", None);
    }

    #[test]
    fn test_backward_compatibility() {
        // The original kramdown formats must keep working
        let test_cases = [
            ("# Header {#a}", "# Header", "a"),
            ("# Header {#simple-id}", "# Header", "simple-id"),
            ("## Heading {#heading-2}", "## Heading", "heading-2"),
            ("### With-Hyphens {#with-hyphens}", "### With-Hyphens", "with-hyphens"),
        ];

        for (input, expected_text, expected_id) in test_cases {
            let (text, id) = extract_header_id(input);
            assert_eq!(text, expected_text, "Text mismatch for input: {input}");
            assert_eq!(id, Some(expected_id.to_string()), "ID mismatch for input: {input}");
        }
    }

    #[test]
    fn test_invalid_id_with_dots() {
        // Dots are not valid ID characters, so a simple-format ID containing them is
        // rejected as a whole and the attr-list stays in the text.
        check_header_untouched("## Another. {#id.with.dots}");

        // The part before the first dot is not extracted on its own either.
        check_header_untouched("## Another. {#id.more.dots}");
    }

    #[test]
    fn test_html_anchor_stripping() {
        // HTML anchor elements, used by some authors for custom anchors, are removed
        // from the heading text.

        // <a name="..."></a>
        check_header("<a name=\"cheatsheets\"></a>Cheat Sheets", "Cheat Sheets", None);

        // <a id="..."></a>
        check_header(
            "<a id=\"tools\"></a>Tools and session management",
            "Tools and session management",
            None,
        );

        // Whitespace after the anchor is removed along with it
        check_header("<a name=\"foo\"></a> Heading with space", "Heading with space", None);

        // Anchor plus a kramdown custom ID
        check_header(
            "<a name=\"old\"></a>My Section {#my-custom-id}",
            "My Section",
            Some("my-custom-id"),
        );
    }
}