rumdl_lib/utils/
header_id_utils.rs

//! Utilities for extracting custom header IDs from various Markdown flavors
//!
//! This module supports multiple syntax formats for custom header IDs:
//!
//! ## Kramdown Format
//! - `{#custom-id}` - Simple ID without colon
//! - Example: `# Header {#my-id}`
//!
//! ## Python-markdown attr-list Format
//! - `{:#custom-id}` - ID with colon, no spaces
//! - `{: #custom-id}` - ID with colon and spaces
//! - `{: #custom-id .class}` - ID with classes
//! - `{: #custom-id .class data="value"}` - ID with full attributes
//! - Example: `# Header {: #my-id .highlight}`
//!
//! ## Position Support
//! - Inline: `# Header {#id}` (all formats)
//! - Next-line: Jekyll/kramdown style where attr-list appears on the line after the header
//!   ```markdown
//!   # Header
//!   {#next-line-id}
//!   ```
//!
//! The module provides functions to detect and extract IDs from both inline
//! and standalone (next-line) attr-list syntax.
26
27use regex::Regex;
28use std::sync::LazyLock;
29
30/// Pattern for HTML anchor elements used for custom anchors in headings
31/// Matches: `<a name="..."></a>`, `<a id="..."></a>`, `<a name="..." id="..."></a>`
32/// These are commonly used by some authors to create custom anchors for headings
33static HTML_ANCHOR_ELEMENT: LazyLock<Regex> =
34    LazyLock::new(|| Regex::new(r#"<a\s+(?:name|id)="[^"]*"(?:\s+(?:name|id)="[^"]*")?>\s*</a>\s*"#).unwrap());
35
36/// Pattern for custom header IDs supporting both kramdown and python-markdown attr-list formats
37/// Supports: {#id}, { #id }, {:#id}, {: #id } and full attr-list with classes/attributes
38/// Must contain #id but can have other attributes: {: #id .class data="value" }
39/// More conservative: only matches when there's actually a hash followed by valid ID characters
40static HEADER_ID_PATTERN: LazyLock<Regex> =
41    LazyLock::new(|| Regex::new(r"\s*\{\s*:?\s*([^}]*?#[^}]*?)\s*\}\s*$").unwrap());
42
43/// Pattern to validate that an ID contains only valid characters
44static ID_VALIDATE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[a-zA-Z0-9_\-:]+$").unwrap());
45
46/// Pattern for standalone attr-list lines (Jekyll/kramdown style on line after heading)
47/// Matches lines that are just attr-list syntax: {#id}, {: #id .class }, etc.
48static STANDALONE_ATTR_LIST_PATTERN: LazyLock<Regex> =
49    LazyLock::new(|| Regex::new(r"^\s*\{\s*:?\s*([^}]*#[a-zA-Z0-9_\-:]+[^}]*)\s*\}\s*$").unwrap());
50
51/// Extract custom header ID from a line if present, returning clean text and ID
52///
53/// Supports multiple formats:
54/// - Kramdown: `{#id}`
55/// - Python-markdown: `{:#id}`, `{: #id}`, `{: #id .class}`
56///
57/// # Examples
58/// ```
59/// use rumdl_lib::utils::header_id_utils::extract_header_id;
60///
61/// // Kramdown format
62/// let (text, id) = extract_header_id("# Header {#custom-id}");
63/// assert_eq!(text, "# Header");
64/// assert_eq!(id, Some("custom-id".to_string()));
65///
66/// // Python-markdown attr-list format
67/// let (text, id) = extract_header_id("# Header {: #my-id .highlight}");
68/// assert_eq!(text, "# Header");
69/// assert_eq!(id, Some("my-id".to_string()));
70/// ```
71pub fn extract_header_id(line: &str) -> (String, Option<String>) {
72    // First, strip HTML anchor elements (e.g., <a name="..."></a>) from the line
73    // These are used by some authors for custom anchors: `## <a name="foo"></a>Heading`
74    let line = HTML_ANCHOR_ELEMENT.replace_all(line, "");
75    let line = line.as_ref();
76
77    if let Some(captures) = HEADER_ID_PATTERN.captures(line)
78        && let Some(full_match) = captures.get(0)
79        && let Some(attr_content) = captures.get(1)
80    {
81        let attr_str = attr_content.as_str().trim();
82
83        // First, find all potential ID matches in the attr-list
84        if let Some(hash_pos) = attr_str.find('#') {
85            // Extract everything after the hash
86            let after_hash = &attr_str[hash_pos + 1..];
87
88            // For simple cases like {#id}, the ID goes to the end
89            // For complex cases like {: #id .class}, we need to find where the ID ends
90
91            // First check if this looks like a simple kramdown ID: {#id} with no spaces or attributes
92            let is_simple_format = !attr_str.contains(' ') && !attr_str.contains('=') && attr_str.starts_with('#');
93
94            if is_simple_format {
95                // Simple format: entire content after # should be the ID
96                let potential_id = after_hash;
97                if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
98                    let clean_text = line[..full_match.start()].trim_end().to_string();
99                    return (clean_text, Some(potential_id.to_string()));
100                }
101                // If validation fails, reject the entire attr-list
102            } else {
103                // Complex format: find proper delimiters (space for next attribute, dot for class)
104                if let Some(delimiter_pos) = after_hash.find(|c: char| c.is_whitespace() || c == '.' || c == '=') {
105                    let potential_id = &after_hash[..delimiter_pos];
106                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
107                        let clean_text = line[..full_match.start()].trim_end().to_string();
108                        return (clean_text, Some(potential_id.to_string()));
109                    }
110                } else {
111                    // No delimiter found in complex format, ID goes to end
112                    let potential_id = after_hash;
113                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
114                        let clean_text = line[..full_match.start()].trim_end().to_string();
115                        return (clean_text, Some(potential_id.to_string()));
116                    }
117                }
118            }
119        }
120    }
121    (line.to_string(), None)
122}
123
124/// Check if a line is a standalone attr-list (Jekyll/kramdown style)
125///
126/// This detects attr-list syntax that appears on its own line, typically
127/// the line after a header to provide additional attributes.
128///
129/// # Examples
130/// ```
131/// use rumdl_lib::utils::header_id_utils::is_standalone_attr_list;
132///
133/// assert!(is_standalone_attr_list("{#custom-id}"));
134/// assert!(is_standalone_attr_list("{: #spaced .class }"));
135/// assert!(!is_standalone_attr_list("Some text {#not-standalone}"));
136/// assert!(!is_standalone_attr_list(""));
137/// ```
138pub fn is_standalone_attr_list(line: &str) -> bool {
139    STANDALONE_ATTR_LIST_PATTERN.is_match(line)
140}
141
142/// Extract ID from a standalone attr-list line
143///
144/// Returns the ID if the line is a valid standalone attr-list with an ID.
145///
146/// # Examples
147/// ```
148/// use rumdl_lib::utils::header_id_utils::extract_standalone_attr_list_id;
149///
150/// assert_eq!(extract_standalone_attr_list_id("{#custom-id}"), Some("custom-id".to_string()));
151/// assert_eq!(extract_standalone_attr_list_id("{: #spaced .class }"), Some("spaced".to_string()));
152/// assert_eq!(extract_standalone_attr_list_id("not an attr-list"), None);
153/// ```
154pub fn extract_standalone_attr_list_id(line: &str) -> Option<String> {
155    if let Some(captures) = STANDALONE_ATTR_LIST_PATTERN.captures(line)
156        && let Some(attr_content) = captures.get(1)
157    {
158        let attr_str = attr_content.as_str().trim();
159
160        // Use the same logic as extract_header_id for consistency
161        if let Some(hash_pos) = attr_str.find('#') {
162            let after_hash = &attr_str[hash_pos + 1..];
163
164            // Check if this looks like a simple kramdown ID: {#id} with no spaces or attributes
165            let is_simple_format = !attr_str.contains(' ') && !attr_str.contains('=') && attr_str.starts_with('#');
166
167            if is_simple_format {
168                // Simple format: entire content after # should be the ID
169                let potential_id = after_hash;
170                if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
171                    return Some(potential_id.to_string());
172                }
173            } else {
174                // Complex format: find proper delimiters (space for next attribute, dot for class)
175                if let Some(delimiter_pos) = after_hash.find(|c: char| c.is_whitespace() || c == '.' || c == '=') {
176                    let potential_id = &after_hash[..delimiter_pos];
177                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
178                        return Some(potential_id.to_string());
179                    }
180                } else {
181                    // No delimiter found in complex format, ID goes to end
182                    let potential_id = after_hash;
183                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
184                        return Some(potential_id.to_string());
185                    }
186                }
187            }
188        }
189    }
190    None
191}
192
193/// Parse an ATX heading written inside a blockquote's inner text.
194///
195/// Blockquote headings (`> ## Heading`) are not detected by the main
196/// line-based heading parser, but they still produce valid fragment anchors.
197/// Strips the leading `#` marker, an optional CommonMark closing hash
198/// sequence, and any trailing `{#custom-id}`. Returns `(clean_text,
199/// custom_id)` or `None` when the blockquote content is not an ATX heading.
200pub fn parse_blockquote_atx_heading(bq_content: &str) -> Option<(String, Option<String>)> {
201    static BQ_ATX_HEADING_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(#{1,6})\s+(.*)$").unwrap());
202
203    let trimmed = bq_content.trim();
204    let caps = BQ_ATX_HEADING_RE.captures(trimmed)?;
205    let mut rest = caps.get(2).map_or("", |m| m.as_str()).to_string();
206
207    // Strip optional closing hash sequence (CommonMark: trailing `#`s preceded by a space)
208    let rest_trimmed = rest.trim_end();
209    if let Some(last_hash_pos) = rest_trimmed.rfind('#') {
210        let after_hashes = &rest_trimmed[last_hash_pos..];
211        if after_hashes.chars().all(|c| c == '#') {
212            // Find where the consecutive trailing hashes start
213            let mut hash_start = last_hash_pos;
214            while hash_start > 0 && rest_trimmed.as_bytes()[hash_start - 1] == b'#' {
215                hash_start -= 1;
216            }
217            // Must be preceded by whitespace (or be the entire content)
218            if hash_start == 0
219                || rest_trimmed
220                    .as_bytes()
221                    .get(hash_start - 1)
222                    .is_some_and(u8::is_ascii_whitespace)
223            {
224                rest = rest_trimmed[..hash_start].trim_end().to_string();
225            }
226        }
227    }
228
229    let (clean_text, custom_id) = extract_header_id(&rest);
230    Some((clean_text, custom_id))
231}
232
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `extract_header_id` on `input` and checks both halves of the result.
    /// `#[track_caller]` makes a failure point at the calling test line.
    #[track_caller]
    fn assert_extracts(input: &str, expected_text: &str, expected_id: Option<&str>) {
        let (text, id) = extract_header_id(input);
        assert_eq!(text, expected_text);
        assert_eq!(id, expected_id.map(str::to_string));
    }

    /// Checks that `extract_header_id` finds no ID and leaves `input` unchanged.
    #[track_caller]
    fn assert_untouched(input: &str) {
        assert_extracts(input, input, None);
    }

    #[test]
    fn test_kramdown_format_extraction() {
        // Plain kramdown `{#id}` at the end of the heading
        assert_extracts("# Header {#simple}", "# Header", Some("simple"));
        assert_extracts("## Section {#section-id}", "## Section", Some("section-id"));
    }

    #[test]
    fn test_python_markdown_attr_list_extraction() {
        // Colon-prefixed python-markdown forms, with and without spaces
        assert_extracts("# Header {:#colon-id}", "# Header", Some("colon-id"));
        assert_extracts("# Header {: #spaced-id }", "# Header", Some("spaced-id"));
    }

    #[test]
    fn test_extended_attr_list_extraction() {
        // One class after the ID
        assert_extracts("# Header {: #with-class .highlight }", "# Header", Some("with-class"));

        // Several classes after the ID
        assert_extracts("## Section {: #multi .class1 .class2 }", "## Section", Some("multi"));

        // Key-value attributes after the ID
        assert_extracts(
            "### Subsection {: #with-attrs data-test=\"value\" style=\"color: red\" }",
            "### Subsection",
            Some("with-attrs"),
        );

        // Classes and key-value attributes together
        assert_extracts(
            "#### Complex {: #complex .highlight data-role=\"button\" title=\"Test\" }",
            "#### Complex",
            Some("complex"),
        );

        // Escaped quotes inside an attribute value
        assert_extracts(
            "##### Quotes {: #quotes title=\"Has \\\"nested\\\" quotes\" }",
            "##### Quotes",
            Some("quotes"),
        );
    }

    #[test]
    fn test_attr_list_detection_edge_cases() {
        // Attr-list that carries no ID
        assert_untouched("# Header {: .class-only }");

        // Braces without any hash
        assert_untouched("# Header { no-hash }");

        // Hash with nothing after it
        assert_untouched("# Header {: # }");

        // Attr-list that is not at the end of the line
        assert_untouched("# Header {: #middle } with more text");
    }

    #[test]
    fn test_standalone_attr_list_detection() {
        let standalone = [
            // Simple ID formats
            "{#custom-id}",
            "{ #spaced-id }",
            "{:#colon-id}",
            "{: #full-format }",
            // With classes and attributes
            "{: #with-class .highlight }",
            "{: #multi .class1 .class2 }",
            "{: #complex .highlight data-test=\"value\" }",
        ];
        for line in standalone {
            assert!(is_standalone_attr_list(line), "expected standalone attr-list: {line:?}");
        }

        let not_standalone = [
            "Some text {#not-standalone}",
            "Text before {#id}",
            "{#id} text after",
            "",
            "   ",              // just spaces
            "{: .class-only }", // no ID
        ];
        for line in not_standalone {
            assert!(!is_standalone_attr_list(line), "expected non-standalone line: {line:?}");
        }
    }

    #[test]
    fn test_standalone_attr_list_id_extraction() {
        let cases: [(&str, Option<&str>); 10] = [
            // Basic formats
            ("{#simple}", Some("simple")),
            ("{ #spaced }", Some("spaced")),
            ("{:#colon}", Some("colon")),
            ("{: #full }", Some("full")),
            // With additional attributes
            ("{: #with-class .highlight }", Some("with-class")),
            ("{: #complex .class1 .class2 data=\"value\" }", Some("complex")),
            // No ID to extract
            ("Not an attr-list", None),
            ("Text {#not-standalone}", None),
            ("{: .class-only }", None),
            ("", None),
        ];

        for (line, expected) in cases {
            assert_eq!(
                extract_standalone_attr_list_id(line),
                expected.map(str::to_string),
                "unexpected ID for line: {line:?}"
            );
        }
    }

    #[test]
    fn test_backward_compatibility() {
        // The original kramdown-only inputs must keep producing the same results
        let test_cases = [
            ("# Header {#a}", "# Header", "a"),
            ("# Header {#simple-id}", "# Header", "simple-id"),
            ("## Heading {#heading-2}", "## Heading", "heading-2"),
            ("### With-Hyphens {#with-hyphens}", "### With-Hyphens", "with-hyphens"),
        ];

        for (input, expected_text, expected_id) in test_cases {
            let (text, id) = extract_header_id(input);
            assert_eq!(text, expected_text, "Text mismatch for input: {input}");
            assert_eq!(id, Some(expected_id.to_string()), "ID mismatch for input: {input}");
        }
    }

    #[test]
    fn test_invalid_id_with_dots() {
        // Dots are not valid ID characters, so a `{#...}` containing them is
        // rejected as a whole: no ID is returned and the text is not stripped.
        assert_untouched("## Another. {#id.with.dots}");

        // The part before the first dot is not extracted on its own either.
        assert_untouched("## Another. {#id.more.dots}");
    }

    #[test]
    fn test_html_anchor_stripping() {
        // Empty HTML anchors, used by some authors for custom anchors, are
        // removed from the heading text.

        // `<a name="..."></a>`
        assert_extracts("<a name=\"cheatsheets\"></a>Cheat Sheets", "Cheat Sheets", None);

        // `<a id="..."></a>`
        assert_extracts(
            "<a id=\"tools\"></a>Tools and session management",
            "Tools and session management",
            None,
        );

        // Whitespace following the anchor is removed with it
        assert_extracts("<a name=\"foo\"></a> Heading with space", "Heading with space", None);

        // Anchor plus a kramdown custom ID on the same line
        assert_extracts(
            "<a name=\"old\"></a>My Section {#my-custom-id}",
            "My Section",
            Some("my-custom-id"),
        );
    }
}
rumdl_lib/utils/header_id_utils.rs

rumdl_lib/utils/
header_id_utils.rs