rumdl 0.1.95

A fast Markdown linter written in Rust (Ru(st) MarkDown Linter)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
//! Utilities for extracting custom header IDs from various Markdown flavors
//!
//! This module supports multiple syntax formats for custom header IDs:
//!
//! ## Kramdown Format
//! - `{#custom-id}` - Simple ID without colon
//! - Example: `# Header {#my-id}`
//!
//! ## Python-markdown attr-list Format
//! - `{:#custom-id}` - ID with colon, no spaces
//! - `{: #custom-id}` - ID with colon and spaces
//! - `{: #custom-id .class}` - ID with classes
//! - `{: #custom-id .class data="value"}` - ID with full attributes
//! - Example: `# Header {: #my-id .highlight}`
//!
//! ## Position Support
//! - Inline: `# Header {#id}` (all formats)
//! - Next-line: Jekyll/kramdown style where attr-list appears on the line after the header
//!   ```markdown
//!   # Header
//!   {#next-line-id}
//!   ```
//!
//! The module provides functions to detect and extract IDs from both inline
//! and standalone (next-line) attr-list syntax.

use regex::Regex;
use std::sync::LazyLock;

/// Pattern for empty HTML anchor elements used as custom anchors in headings.
///
/// Matches `<a name="..."></a>`, `<a id="..."></a>` and `<a name="..." id="..."></a>`
/// (one or two `name`/`id` attributes), together with any whitespace that
/// follows the closing tag. Only double-quoted attribute values are recognized,
/// and the element may contain nothing but whitespace.
/// These are commonly used by some authors to create custom anchors for headings.
static HTML_ANCHOR_ELEMENT: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"<a\s+(?:name|id)="[^"]*"(?:\s+(?:name|id)="[^"]*")?>\s*</a>\s*"#).unwrap());

/// Pattern for a trailing attr-list in kramdown or python-markdown form.
///
/// Supports: {#id}, { #id }, {:#id}, {: #id } and full attr-lists with
/// classes/attributes such as {: #id .class data="value" }. The attr-list must
/// be the last thing on the line (`$`); whitespace before the opening brace is
/// part of the match.
///
/// Capture group 1 is the attr-list content after the optional `:`. It must
/// contain a `#` and cannot contain `}`. The pattern itself does not check the
/// characters that follow `#`; the extraction code validates the ID separately
/// with `ID_VALIDATE_PATTERN`.
static HEADER_ID_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\s*\{\s*:?\s*([^}]*?#[^}]*?)\s*\}\s*$").unwrap());

/// Pattern an ID must match in full to be accepted: one or more ASCII letters,
/// digits, `_`, `-` or `:`. The empty string and IDs containing other characters
/// (for example dots) do not match.
static ID_VALIDATE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^[a-zA-Z0-9_\-:]+$").unwrap());

/// Pattern for standalone attr-list lines (Jekyll/kramdown style on line after heading)
///
/// The whole line, apart from surrounding whitespace, must be a single
/// attr-list: {#id}, {: #id .class }, etc. The `#` must be followed by at least
/// one valid ID character (ASCII letter, digit, `_`, `-` or `:`), so a
/// class-only list such as {: .class } does not match.
/// Capture group 1 is the attr-list content after the optional `:`.
static STANDALONE_ATTR_LIST_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^\s*\{\s*:?\s*([^}]*#[a-zA-Z0-9_\-:]+[^}]*)\s*\}\s*$").unwrap());

/// Extract custom header ID from a line if present, returning clean text and ID
///
/// Supports multiple formats:
/// - Kramdown: `{#id}`
/// - Python-markdown: `{:#id}`, `{: #id}`, `{: #id .class}`
///
/// # Examples
/// ```
/// use rumdl_lib::utils::header_id_utils::extract_header_id;
///
/// // Kramdown format
/// let (text, id) = extract_header_id("# Header {#custom-id}");
/// assert_eq!(text, "# Header");
/// assert_eq!(id, Some("custom-id".to_string()));
///
/// // Python-markdown attr-list format
/// let (text, id) = extract_header_id("# Header {: #my-id .highlight}");
/// assert_eq!(text, "# Header");
/// assert_eq!(id, Some("my-id".to_string()));
/// ```
pub fn extract_header_id(line: &str) -> (String, Option<String>) {
    // First, strip HTML anchor elements (e.g., <a name="..."></a>) from the line
    // These are used by some authors for custom anchors: `## <a name="foo"></a>Heading`
    let line = HTML_ANCHOR_ELEMENT.replace_all(line, "");
    let line = line.as_ref();

    if let Some(captures) = HEADER_ID_PATTERN.captures(line)
        && let Some(full_match) = captures.get(0)
        && let Some(attr_content) = captures.get(1)
    {
        let attr_str = attr_content.as_str().trim();

        // First, find all potential ID matches in the attr-list
        if let Some(hash_pos) = attr_str.find('#') {
            // Extract everything after the hash
            let after_hash = &attr_str[hash_pos + 1..];

            // For simple cases like {#id}, the ID goes to the end
            // For complex cases like {: #id .class}, we need to find where the ID ends

            // First check if this looks like a simple kramdown ID: {#id} with no spaces or attributes
            let is_simple_format = !attr_str.contains(' ') && !attr_str.contains('=') && attr_str.starts_with('#');

            if is_simple_format {
                // Simple format: entire content after # should be the ID
                let potential_id = after_hash;
                if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
                    let clean_text = line[..full_match.start()].trim_end().to_string();
                    return (clean_text, Some(potential_id.to_string()));
                }
                // If validation fails, reject the entire attr-list
            } else {
                // Complex format: find proper delimiters (space for next attribute, dot for class)
                if let Some(delimiter_pos) = after_hash.find(|c: char| c.is_whitespace() || c == '.' || c == '=') {
                    let potential_id = &after_hash[..delimiter_pos];
                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
                        let clean_text = line[..full_match.start()].trim_end().to_string();
                        return (clean_text, Some(potential_id.to_string()));
                    }
                } else {
                    // No delimiter found in complex format, ID goes to end
                    let potential_id = after_hash;
                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
                        let clean_text = line[..full_match.start()].trim_end().to_string();
                        return (clean_text, Some(potential_id.to_string()));
                    }
                }
            }
        }
    }
    (line.to_string(), None)
}

/// Check if a line is a standalone attr-list (Jekyll/kramdown style)
///
/// This detects attr-list syntax that appears on its own line, typically
/// the line after a header to provide additional attributes.
///
/// Returns `true` only when the entire line (ignoring leading and trailing
/// whitespace) is one `{...}` attr-list that contains `#` followed by at least
/// one valid ID character. Lines with other text around the braces, empty
/// lines and attr-lists without an ID return `false`.
///
/// # Examples
/// ```
/// use rumdl_lib::utils::header_id_utils::is_standalone_attr_list;
///
/// assert!(is_standalone_attr_list("{#custom-id}"));
/// assert!(is_standalone_attr_list("{: #spaced .class }"));
/// assert!(!is_standalone_attr_list("Some text {#not-standalone}"));
/// assert!(!is_standalone_attr_list(""));
/// ```
pub fn is_standalone_attr_list(line: &str) -> bool {
    // The pattern is anchored at both ends, so a match means the whole line is an attr-list.
    STANDALONE_ATTR_LIST_PATTERN.is_match(line)
}

/// Extract ID from a standalone attr-list line
///
/// Returns the ID if the line is a valid standalone attr-list with an ID.
///
/// # Examples
/// ```
/// use rumdl_lib::utils::header_id_utils::extract_standalone_attr_list_id;
///
/// assert_eq!(extract_standalone_attr_list_id("{#custom-id}"), Some("custom-id".to_string()));
/// assert_eq!(extract_standalone_attr_list_id("{: #spaced .class }"), Some("spaced".to_string()));
/// assert_eq!(extract_standalone_attr_list_id("not an attr-list"), None);
/// ```
pub fn extract_standalone_attr_list_id(line: &str) -> Option<String> {
    if let Some(captures) = STANDALONE_ATTR_LIST_PATTERN.captures(line)
        && let Some(attr_content) = captures.get(1)
    {
        let attr_str = attr_content.as_str().trim();

        // Use the same logic as extract_header_id for consistency
        if let Some(hash_pos) = attr_str.find('#') {
            let after_hash = &attr_str[hash_pos + 1..];

            // Check if this looks like a simple kramdown ID: {#id} with no spaces or attributes
            let is_simple_format = !attr_str.contains(' ') && !attr_str.contains('=') && attr_str.starts_with('#');

            if is_simple_format {
                // Simple format: entire content after # should be the ID
                let potential_id = after_hash;
                if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
                    return Some(potential_id.to_string());
                }
            } else {
                // Complex format: find proper delimiters (space for next attribute, dot for class)
                if let Some(delimiter_pos) = after_hash.find(|c: char| c.is_whitespace() || c == '.' || c == '=') {
                    let potential_id = &after_hash[..delimiter_pos];
                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
                        return Some(potential_id.to_string());
                    }
                } else {
                    // No delimiter found in complex format, ID goes to end
                    let potential_id = after_hash;
                    if ID_VALIDATE_PATTERN.is_match(potential_id) && !potential_id.is_empty() {
                        return Some(potential_id.to_string());
                    }
                }
            }
        }
    }
    None
}

/// Parse the inner text of a blockquote line as an ATX heading.
///
/// Headings inside blockquotes (`> ## Heading`) are missed by the main
/// line-based heading parser even though they still yield fragment anchors.
/// The input is trimmed and must start with one to six `#` characters followed
/// by whitespace and heading text; otherwise `None` is returned.
///
/// From the heading text this removes an optional CommonMark closing hash
/// sequence (trailing `#`s that are preceded by whitespace or make up the whole
/// text) and then hands the remainder to [`extract_header_id`], which strips a
/// trailing `{#custom-id}`. The result is `(clean_text, custom_id)`.
pub fn parse_blockquote_atx_heading(bq_content: &str) -> Option<(String, Option<String>)> {
    static BQ_ATX_HEADING_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(#{1,6})\s+(.*)$").unwrap());

    let caps = BQ_ATX_HEADING_RE.captures(bq_content.trim())?;
    let body = caps.get(2).map_or("", |m| m.as_str());

    // Split off a run of trailing `#`s, if there is one.
    let body_trimmed = body.trim_end();
    let before_hashes = body_trimmed.trim_end_matches('#');
    let has_trailing_hashes = before_hashes.len() < body_trimmed.len();

    // The run only counts as a closing sequence when it is the entire text or
    // is separated from the text by whitespace (so `C#` keeps its hash).
    let is_closing_sequence = has_trailing_hashes
        && (before_hashes.is_empty() || before_hashes.ends_with(|c: char| c.is_ascii_whitespace()));

    let heading_text = if is_closing_sequence {
        before_hashes.trim_end()
    } else {
        body
    };

    Some(extract_header_id(heading_text))
}

// Unit tests for header ID extraction, standalone attr-list detection and
// blockquote ATX heading parsing.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_kramdown_format_extraction() {
        // Simple kramdown format
        let (text, id) = extract_header_id("# Header {#simple}");
        assert_eq!(text, "# Header");
        assert_eq!(id, Some("simple".to_string()));

        let (text, id) = extract_header_id("## Section {#section-id}");
        assert_eq!(text, "## Section");
        assert_eq!(id, Some("section-id".to_string()));
    }

    #[test]
    fn test_python_markdown_attr_list_extraction() {
        // Python-markdown formats
        let (text, id) = extract_header_id("# Header {:#colon-id}");
        assert_eq!(text, "# Header");
        assert_eq!(id, Some("colon-id".to_string()));

        let (text, id) = extract_header_id("# Header {: #spaced-id }");
        assert_eq!(text, "# Header");
        assert_eq!(id, Some("spaced-id".to_string()));
    }

    #[test]
    fn test_extended_attr_list_extraction() {
        // ID with single class
        let (text, id) = extract_header_id("# Header {: #with-class .highlight }");
        assert_eq!(text, "# Header");
        assert_eq!(id, Some("with-class".to_string()));

        // ID with multiple classes
        let (text, id) = extract_header_id("## Section {: #multi .class1 .class2 }");
        assert_eq!(text, "## Section");
        assert_eq!(id, Some("multi".to_string()));

        // ID with key-value attributes
        let (text, id) = extract_header_id("### Subsection {: #with-attrs data-test=\"value\" style=\"color: red\" }");
        assert_eq!(text, "### Subsection");
        assert_eq!(id, Some("with-attrs".to_string()));

        // Complex combination
        let (text, id) = extract_header_id("#### Complex {: #complex .highlight data-role=\"button\" title=\"Test\" }");
        assert_eq!(text, "#### Complex");
        assert_eq!(id, Some("complex".to_string()));

        // ID with quotes in attributes
        let (text, id) = extract_header_id("##### Quotes {: #quotes title=\"Has \\\"nested\\\" quotes\" }");
        assert_eq!(text, "##### Quotes");
        assert_eq!(id, Some("quotes".to_string()));
    }

    #[test]
    fn test_attr_list_detection_edge_cases() {
        // Attr-list without ID should not match
        let (text, id) = extract_header_id("# Header {: .class-only }");
        assert_eq!(text, "# Header {: .class-only }");
        assert_eq!(id, None);

        // Malformed attr-list should not match
        let (text, id) = extract_header_id("# Header { no-hash }");
        assert_eq!(text, "# Header { no-hash }");
        assert_eq!(id, None);

        // Empty ID should not match
        let (text, id) = extract_header_id("# Header {: # }");
        assert_eq!(text, "# Header {: # }");
        assert_eq!(id, None);

        // ID in middle (not at end) should not match
        let (text, id) = extract_header_id("# Header {: #middle } with more text");
        assert_eq!(text, "# Header {: #middle } with more text");
        assert_eq!(id, None);
    }

    #[test]
    fn test_standalone_attr_list_detection() {
        // Simple ID formats
        assert!(is_standalone_attr_list("{#custom-id}"));
        assert!(is_standalone_attr_list("{ #spaced-id }"));
        assert!(is_standalone_attr_list("{:#colon-id}"));
        assert!(is_standalone_attr_list("{: #full-format }"));

        // With classes and attributes
        assert!(is_standalone_attr_list("{: #with-class .highlight }"));
        assert!(is_standalone_attr_list("{: #multi .class1 .class2 }"));
        assert!(is_standalone_attr_list("{: #complex .highlight data-test=\"value\" }"));

        // Should not match
        assert!(!is_standalone_attr_list("Some text {#not-standalone}"));
        assert!(!is_standalone_attr_list("Text before {#id}"));
        assert!(!is_standalone_attr_list("{#id} text after"));
        assert!(!is_standalone_attr_list(""));
        assert!(!is_standalone_attr_list("   ")); // just spaces
        assert!(!is_standalone_attr_list("{: .class-only }")); // no ID
    }

    #[test]
    fn test_standalone_attr_list_id_extraction() {
        // Basic formats
        assert_eq!(extract_standalone_attr_list_id("{#simple}"), Some("simple".to_string()));
        assert_eq!(
            extract_standalone_attr_list_id("{ #spaced }"),
            Some("spaced".to_string())
        );
        assert_eq!(extract_standalone_attr_list_id("{:#colon}"), Some("colon".to_string()));
        assert_eq!(extract_standalone_attr_list_id("{: #full }"), Some("full".to_string()));

        // With additional attributes
        assert_eq!(
            extract_standalone_attr_list_id("{: #with-class .highlight }"),
            Some("with-class".to_string())
        );
        assert_eq!(
            extract_standalone_attr_list_id("{: #complex .class1 .class2 data=\"value\" }"),
            Some("complex".to_string())
        );

        // Should return None
        assert_eq!(extract_standalone_attr_list_id("Not an attr-list"), None);
        assert_eq!(extract_standalone_attr_list_id("Text {#not-standalone}"), None);
        assert_eq!(extract_standalone_attr_list_id("{: .class-only }"), None);
        assert_eq!(extract_standalone_attr_list_id(""), None);
    }

    #[test]
    fn test_backward_compatibility() {
        // Ensure all original kramdown formats still work
        let test_cases = vec![
            ("# Header {#a}", "# Header", Some("a".to_string())),
            ("# Header {#simple-id}", "# Header", Some("simple-id".to_string())),
            ("## Heading {#heading-2}", "## Heading", Some("heading-2".to_string())),
            (
                "### With-Hyphens {#with-hyphens}",
                "### With-Hyphens",
                Some("with-hyphens".to_string()),
            ),
        ];

        for (input, expected_text, expected_id) in test_cases {
            let (text, id) = extract_header_id(input);
            assert_eq!(text, expected_text, "Text mismatch for input: {input}");
            assert_eq!(id, expected_id, "ID mismatch for input: {input}");
        }
    }

    #[test]
    fn test_invalid_id_with_dots() {
        // IDs with dots should not be extracted (dots are not valid ID characters)
        let (text, id) = extract_header_id("## Another. {#id.with.dots}");
        assert_eq!(text, "## Another. {#id.with.dots}"); // Should not strip invalid ID
        assert_eq!(id, None); // Should not extract invalid ID

        // A bare `{#...}` is validated as a whole, so the part before the first
        // dot is not extracted on its own; the entire attr-list is rejected.
        let (text, id) = extract_header_id("## Another. {#id.more.dots}");
        assert_eq!(text, "## Another. {#id.more.dots}");
        assert_eq!(id, None);
    }

    #[test]
    fn test_html_anchor_stripping() {
        // HTML anchor elements should be stripped from heading text
        // This is used by some authors for custom anchors

        // Basic <a name="..."></a> pattern
        let (text, id) = extract_header_id("<a name=\"cheatsheets\"></a>Cheat Sheets");
        assert_eq!(text, "Cheat Sheets");
        assert_eq!(id, None);

        // <a id="..."></a> pattern
        let (text, id) = extract_header_id("<a id=\"tools\"></a>Tools and session management");
        assert_eq!(text, "Tools and session management");
        assert_eq!(id, None);

        // With spaces around the anchor
        let (text, id) = extract_header_id("<a name=\"foo\"></a> Heading with space");
        assert_eq!(text, "Heading with space");
        assert_eq!(id, None);

        // Combined with kramdown custom ID
        let (text, id) = extract_header_id("<a name=\"old\"></a>My Section {#my-custom-id}");
        assert_eq!(text, "My Section");
        assert_eq!(id, Some("my-custom-id".to_string()));
    }

    #[test]
    fn test_blockquote_atx_heading_basic() {
        // Plain heading: marker is stripped, no custom ID
        assert_eq!(
            parse_blockquote_atx_heading("## Heading"),
            Some(("Heading".to_string(), None))
        );

        // Surrounding whitespace in the blockquote content is ignored
        assert_eq!(
            parse_blockquote_atx_heading("  ## Padded  "),
            Some(("Padded".to_string(), None))
        );
    }

    #[test]
    fn test_blockquote_atx_heading_closing_sequence() {
        // Closing hashes preceded by a space are removed
        assert_eq!(
            parse_blockquote_atx_heading("### Heading ###"),
            Some(("Heading".to_string(), None))
        );

        // A trailing hash that is attached to the text is part of the heading
        assert_eq!(parse_blockquote_atx_heading("# C#"), Some(("C#".to_string(), None)));

        // Heading text made up only of hashes is treated as a closing sequence
        assert_eq!(parse_blockquote_atx_heading("# ##"), Some((String::new(), None)));
    }

    #[test]
    fn test_blockquote_atx_heading_custom_id() {
        // Trailing kramdown ID is extracted
        assert_eq!(
            parse_blockquote_atx_heading("## Section {#custom-id}"),
            Some(("Section".to_string(), Some("custom-id".to_string())))
        );

        // Closing sequence after the ID is stripped before the ID is extracted
        assert_eq!(
            parse_blockquote_atx_heading("## Section {#custom-id} ##"),
            Some(("Section".to_string(), Some("custom-id".to_string())))
        );
    }

    #[test]
    fn test_blockquote_atx_heading_rejects_non_headings() {
        assert_eq!(parse_blockquote_atx_heading(""), None);
        assert_eq!(parse_blockquote_atx_heading("Plain text"), None);
        assert_eq!(parse_blockquote_atx_heading("#NoSpace"), None); // marker needs whitespace after it
        assert_eq!(parse_blockquote_atx_heading("####### Seven"), None); // more than six hashes
        assert_eq!(parse_blockquote_atx_heading("#"), None); // marker without text
    }
}