rumdl_lib/utils/
markdown_elements.rs

1use regex::Regex;
2use std::collections::HashSet;
3use std::sync::LazyLock;
4
/// Kinds of Markdown constructs this module can describe.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ElementType {
    /// Fenced code block.
    CodeBlock,
    /// Inline code span.
    CodeSpan,
    /// ATX or Setext heading.
    Heading,
    /// A single list item line.
    List,
    /// Front matter block delimited by `---` lines.
    FrontMatter,
}

/// Whether a detected element is well-formed.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ElementQuality {
    /// The element follows the expected syntax.
    Valid,
    /// The element was recognised but is syntactically off, e.g. an unclosed
    /// code fence or a list marker that is not followed by a space.
    Malformed,
}

/// A single element detected in a Markdown document.
///
/// Line indices are zero-based and `end_line` is inclusive.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct MarkdownElement {
    /// What kind of element this is.
    pub element_type: ElementType,
    /// Index of the first line of the element.
    pub start_line: usize,
    /// Index of the last line of the element (inclusive).
    pub end_line: usize,
    /// Text of the element; its exact content depends on the element type.
    pub text: String,
    /// Type-specific extra data: the language for code blocks, the level for
    /// headings, the marker name for list items.
    pub metadata: Option<String>,
    /// Whether the element is well-formed or malformed.
    pub quality: ElementQuality,
}
32
33// Code block patterns
34static CODE_BLOCK_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(```|~~~)(.*)$").unwrap());
35
36// Heading patterns
37static ATX_HEADING: LazyLock<Regex> =
38    LazyLock::new(|| Regex::new(r"^(\s*)(#{1,6})(\s*)([^#\n]*?)(?:\s+(#{1,6}))?\s*$").unwrap());
39static ATX_HEADING_NO_SPACE: LazyLock<Regex> =
40    LazyLock::new(|| Regex::new(r"^(\s*)(#{1,6})([^#\s][^#\n]*?)(?:\s+(#{1,6}))?\s*$").unwrap());
41static SETEXT_HEADING_1: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(=+)(\s*)$").unwrap());
42static SETEXT_HEADING_2: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(-+)(\s*)$").unwrap());
43
44// List patterns
45static UNORDERED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)([*+-])(\s+)").unwrap());
46static ORDERED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(\d+\.)(\s+)").unwrap());
47
48// Malformed list patterns
49static MALFORMED_UNORDERED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)([*+-])([^\s])").unwrap());
50static MALFORMED_ORDERED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(\d+\.)([^\s])").unwrap());
51static MALFORMED_ORDERED_LIST_WRONG_MARKER: LazyLock<Regex> =
52    LazyLock::new(|| Regex::new(r"^(\s*)(\d+[)\]])(\s*)").unwrap());
53
54// Empty list patterns (just marker without content)
55static EMPTY_UNORDERED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)([*+-])\s*$").unwrap());
56
57// Front matter pattern
58static FRONT_MATTER_DELIMITER: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^---\s*$").unwrap());
59
/// Namespace for the Markdown element detection helpers.
///
/// The struct carries no state; every helper is an associated function.
#[derive(Debug, Clone, Copy, Default)]
pub struct MarkdownElements;
62
63impl MarkdownElements {
64    /// Detect all code blocks in the content
65    pub fn detect_code_blocks(content: &str) -> Vec<MarkdownElement> {
66        let mut blocks = Vec::new();
67        let mut in_code_block = false;
68        let mut block_start = 0;
69        let mut language = String::new();
70        let mut fence_type = String::new();
71
72        for (i, line) in content.lines().enumerate() {
73            if let Some(captures) = CODE_BLOCK_START.captures(line) {
74                if !in_code_block {
75                    block_start = i;
76                    in_code_block = true;
77                    fence_type = captures.get(2).unwrap().as_str().to_string();
78                    language = captures.get(3).map_or("", |m| m.as_str()).trim().to_string();
79                } else if line.trim().starts_with(&fence_type) {
80                    // End of code block
81                    blocks.push(MarkdownElement {
82                        element_type: ElementType::CodeBlock,
83                        start_line: block_start,
84                        end_line: i,
85                        text: content
86                            .lines()
87                            .skip(block_start)
88                            .take(i - block_start + 1)
89                            .collect::<Vec<&str>>()
90                            .join("\n"),
91                        metadata: Some(language.clone()),
92                        quality: ElementQuality::Valid,
93                    });
94                    in_code_block = false;
95                    language = String::new();
96                }
97            }
98        }
99
100        // Handle unclosed code blocks
101        if in_code_block {
102            let line_count = content.lines().count();
103            blocks.push(MarkdownElement {
104                element_type: ElementType::CodeBlock,
105                start_line: block_start,
106                end_line: line_count - 1,
107                text: content.lines().skip(block_start).collect::<Vec<&str>>().join("\n"),
108                metadata: Some(language),
109                quality: ElementQuality::Malformed, // Unclosed code block is malformed
110            });
111        }
112
113        blocks
114    }
115
116    /// Detect all code block line indices in the content
117    pub fn detect_code_block_lines(content: &str) -> HashSet<usize> {
118        let code_blocks = Self::detect_code_blocks(content);
119        let mut lines = HashSet::new();
120
121        for block in code_blocks {
122            for i in block.start_line..=block.end_line {
123                lines.insert(i);
124            }
125        }
126
127        lines
128    }
129
130    /// Check if position in a line is within a code span
131    pub fn is_in_code_span(line: &str, position: usize) -> bool {
132        let mut in_code_span = false;
133        let mut code_start = 0;
134
135        for (pos, c) in line.char_indices() {
136            if c == '`' {
137                if !in_code_span {
138                    in_code_span = true;
139                    code_start = pos;
140                } else {
141                    // Found end of code span, check if position is within
142                    if position >= code_start && position <= pos {
143                        return true;
144                    }
145                    in_code_span = false;
146                }
147            }
148
149            // Early return optimization
150            if pos > position && !in_code_span {
151                return false;
152            }
153        }
154
155        // Check if position is in an unclosed code span
156        in_code_span && position >= code_start
157    }
158
159    /// Detect all headings in the content
160    pub fn detect_headings(content: &str) -> Vec<MarkdownElement> {
161        let mut headings = Vec::new();
162        let lines: Vec<&str> = content.lines().collect();
163        let code_block_lines = Self::detect_code_block_lines(content);
164
165        // Get frontmatter to skip those lines
166        let frontmatter_lines = if let Some(frontmatter) = Self::detect_front_matter(content) {
167            (frontmatter.start_line..=frontmatter.end_line).collect::<HashSet<usize>>()
168        } else {
169            HashSet::new()
170        };
171
172        // Process each line
173        for (i, line) in lines.iter().enumerate() {
174            // Skip lines in code blocks or frontmatter
175            if code_block_lines.contains(&i) || frontmatter_lines.contains(&i) {
176                continue;
177            }
178
179            // Check for ATX style heading with proper space
180            if let Some(captures) = ATX_HEADING.captures(line) {
181                let hashes = captures.get(2).unwrap().as_str();
182                let level = hashes.len().to_string();
183                let text = captures.get(4).map_or("", |m| m.as_str()).trim().to_string();
184                let spaces_after_hash = captures.get(3).map_or("", |m| m.as_str()).len();
185
186                // Determine if heading is well-formed
187                // Special cases for empty headings: # and ###### are valid, others need space
188                let quality = if spaces_after_hash > 0 || (text.is_empty() && (hashes.len() == 1 || hashes.len() == 6))
189                {
190                    ElementQuality::Valid
191                } else {
192                    ElementQuality::Malformed
193                };
194
195                headings.push(MarkdownElement {
196                    element_type: ElementType::Heading,
197                    start_line: i,
198                    end_line: i,
199                    text,
200                    metadata: Some(level),
201                    quality,
202                });
203
204                continue;
205            }
206
207            // Check for ATX style heading without space after #
208            if let Some(captures) = ATX_HEADING_NO_SPACE.captures(line) {
209                let hashes = captures.get(2).unwrap().as_str();
210                let level = hashes.len().to_string();
211                let text = captures.get(3).map_or("", |m| m.as_str()).trim().to_string();
212
213                headings.push(MarkdownElement {
214                    element_type: ElementType::Heading,
215                    start_line: i,
216                    end_line: i,
217                    text,
218                    metadata: Some(level),
219                    quality: ElementQuality::Malformed, // No space after # makes it malformed
220                });
221
222                continue;
223            }
224
225            // Check for Setext style heading (requires looking at next line)
226            if i + 1 < lines.len() {
227                let next_line = lines[i + 1];
228
229                if SETEXT_HEADING_1.is_match(next_line) {
230                    headings.push(MarkdownElement {
231                        element_type: ElementType::Heading,
232                        start_line: i,
233                        end_line: i + 1,
234                        text: line.trim().to_string(),
235                        metadata: Some("1".to_string()), // Level 1 setext heading
236                        quality: ElementQuality::Valid,
237                    });
238
239                    continue;
240                }
241
242                if SETEXT_HEADING_2.is_match(next_line) {
243                    headings.push(MarkdownElement {
244                        element_type: ElementType::Heading,
245                        start_line: i,
246                        end_line: i + 1,
247                        text: line.trim().to_string(),
248                        metadata: Some("2".to_string()), // Level 2 setext heading
249                        quality: ElementQuality::Valid,
250                    });
251
252                    continue;
253                }
254            }
255        }
256
257        headings
258    }
259
260    /// Get heading level (1-6) for a heading element
261    pub fn get_heading_level(element: &MarkdownElement) -> Option<u32> {
262        if element.element_type != ElementType::Heading {
263            return None;
264        }
265
266        element.metadata.as_ref().and_then(|level| level.parse::<u32>().ok())
267    }
268
269    /// Detect all list items in the content
270    pub fn detect_lists(content: &str) -> Vec<MarkdownElement> {
271        let mut lists = Vec::new();
272        let lines: Vec<&str> = content.lines().collect();
273        let code_block_lines = Self::detect_code_block_lines(content);
274
275        // Get frontmatter to skip those lines
276        let frontmatter_lines = if let Some(frontmatter) = Self::detect_front_matter(content) {
277            (frontmatter.start_line..=frontmatter.end_line).collect::<HashSet<usize>>()
278        } else {
279            HashSet::new()
280        };
281
282        // Pattern to match horizontal rule or front matter markers
283        static HORIZONTAL_RULE: LazyLock<Regex> =
284            LazyLock::new(|| Regex::new(r"^(\s*)(-{3,}|\*{3,}|_{3,})(\s*)$").unwrap());
285
286        for (i, line) in lines.iter().enumerate() {
287            // Skip lines in code blocks or frontmatter
288            if code_block_lines.contains(&i) || frontmatter_lines.contains(&i) {
289                continue;
290            }
291
292            // Skip lines that are horizontal rules or front matter markers
293            if HORIZONTAL_RULE.is_match(line) {
294                continue;
295            }
296
297            // Check for well-formed unordered list items
298            if let Some(_captures) = UNORDERED_LIST.captures(line) {
299                let marker = if line.trim_start().starts_with('*') {
300                    "asterisk"
301                } else if line.trim_start().starts_with('+') {
302                    "plus"
303                } else {
304                    "minus"
305                };
306
307                lists.push(MarkdownElement {
308                    element_type: ElementType::List,
309                    start_line: i,
310                    end_line: i,
311                    text: line.trim().to_string(),
312                    metadata: Some(marker.to_string()),
313                    quality: ElementQuality::Valid,
314                });
315
316                continue;
317            }
318
319            // Check for empty unordered list items (just marker)
320            if let Some(_captures) = EMPTY_UNORDERED_LIST.captures(line) {
321                // Exclude horizontal rules and front matter markers
322                if line.trim() == "---" || line.trim() == "***" || line.trim() == "___" {
323                    continue;
324                }
325
326                let marker = if line.trim_start().starts_with('*') {
327                    "asterisk"
328                } else if line.trim_start().starts_with('+') {
329                    "plus"
330                } else {
331                    "minus"
332                };
333
334                lists.push(MarkdownElement {
335                    element_type: ElementType::List,
336                    start_line: i,
337                    end_line: i,
338                    text: String::new(), // Empty list item
339                    metadata: Some(marker.to_string()),
340                    quality: ElementQuality::Valid,
341                });
342
343                continue;
344            }
345
346            // Check for malformed unordered list (no space after marker)
347            if let Some(_captures) = MALFORMED_UNORDERED_LIST.captures(line) {
348                // Exclude horizontal rules and front matter markers which might match this pattern
349                if line.trim() == "---" || line.trim() == "***" || line.trim() == "___" {
350                    continue;
351                }
352
353                let marker = if line.trim_start().starts_with('*') {
354                    "asterisk:no_space"
355                } else if line.trim_start().starts_with('+') {
356                    "plus:no_space"
357                } else {
358                    "minus:no_space"
359                };
360
361                lists.push(MarkdownElement {
362                    element_type: ElementType::List,
363                    start_line: i,
364                    end_line: i,
365                    text: line.trim().to_string(),
366                    metadata: Some(marker.to_string()),
367                    quality: ElementQuality::Malformed,
368                });
369
370                continue;
371            }
372
373            // Check for well-formed ordered list items
374            if let Some(_captures) = ORDERED_LIST.captures(line) {
375                lists.push(MarkdownElement {
376                    element_type: ElementType::List,
377                    start_line: i,
378                    end_line: i,
379                    text: line.trim().to_string(),
380                    metadata: Some("ordered".to_string()),
381                    quality: ElementQuality::Valid,
382                });
383
384                continue;
385            }
386
387            // Check for malformed ordered list (no space after marker)
388            if let Some(_captures) = MALFORMED_ORDERED_LIST.captures(line) {
389                lists.push(MarkdownElement {
390                    element_type: ElementType::List,
391                    start_line: i,
392                    end_line: i,
393                    text: line.trim().to_string(),
394                    metadata: Some("ordered:no_space".to_string()),
395                    quality: ElementQuality::Malformed,
396                });
397
398                continue;
399            }
400
401            // Check for malformed ordered list (wrong marker type)
402            if let Some(_captures) = MALFORMED_ORDERED_LIST_WRONG_MARKER.captures(line) {
403                lists.push(MarkdownElement {
404                    element_type: ElementType::List,
405                    start_line: i,
406                    end_line: i,
407                    text: line.trim().to_string(),
408                    metadata: Some("ordered:wrong_marker".to_string()),
409                    quality: ElementQuality::Malformed,
410                });
411            }
412        }
413
414        lists
415    }
416
417    /// Detect front matter in content
418    pub fn detect_front_matter(content: &str) -> Option<MarkdownElement> {
419        let lines: Vec<&str> = content.lines().collect();
420
421        if lines.is_empty() || !FRONT_MATTER_DELIMITER.is_match(lines[0]) {
422            return None;
423        }
424
425        // Look for closing delimiter
426        for (i, line) in lines.iter().enumerate().skip(1) {
427            if FRONT_MATTER_DELIMITER.is_match(line) {
428                return Some(MarkdownElement {
429                    element_type: ElementType::FrontMatter,
430                    start_line: 0,
431                    end_line: i,
432                    text: lines[0..=i].join("\n"),
433                    metadata: None,
434                    quality: ElementQuality::Valid,
435                });
436            }
437        }
438
439        // Front matter without closing delimiter is malformed
440        None
441    }
442
443    /// Convert heading text to a valid ID for fragment links
444    pub fn heading_to_fragment(text: &str) -> String {
445        // Remove any HTML tags
446        let text_no_html = regex::Regex::new(r"<[^>]*>").unwrap().replace_all(text, "");
447
448        // Convert to lowercase and trim
449        let text_lower = text_no_html.trim().to_lowercase();
450
451        // Replace spaces and punctuation with hyphens
452        let text_with_hyphens = text_lower
453            .chars()
454            .map(|c| if c.is_alphanumeric() { c } else { '-' })
455            .collect::<String>();
456
457        // Replace multiple consecutive hyphens with a single hyphen
458        let text_clean = text_with_hyphens
459            .split('-')
460            .filter(|s| !s.is_empty())
461            .collect::<Vec<_>>()
462            .join("-");
463
464        // Remove leading and trailing hyphens
465        text_clean.trim_matches('-').to_string()
466    }
467
468    /// Check if a line is in a code block
469    pub fn is_line_in_code_block(content: &str, line_number: usize) -> bool {
470        let code_block_lines = Self::detect_code_block_lines(content);
471        code_block_lines.contains(&line_number)
472    }
473
474    /// Get all line indices in a given Markdown element
475    pub fn get_element_line_indices(element: &MarkdownElement) -> Vec<usize> {
476        (element.start_line..=element.end_line).collect()
477    }
478}
479
#[cfg(test)]
mod tests {
    use super::*;

    /// A fenced block is reported with its fence lines and language.
    #[test]
    fn test_detect_code_blocks() {
        let blocks = MarkdownElements::detect_code_blocks("# Heading\n```js\nlet x = 1;\n```\nText");

        let [block] = blocks.as_slice() else {
            panic!("expected exactly one code block, found {}", blocks.len());
        };
        assert_eq!(block.element_type, ElementType::CodeBlock);
        assert_eq!((block.start_line, block.end_line), (1, 3));
        assert_eq!(block.metadata.as_deref(), Some("js"));
    }

    /// Positions before, inside and after a single-backtick span.
    #[test]
    fn test_is_in_code_span() {
        let line = "Text with `code` and more";
        let cases = [(0, false), (11, true), (20, false)];

        for (position, expected) in cases {
            assert_eq!(
                MarkdownElements::is_in_code_span(line, position),
                expected,
                "position {position}"
            );
        }
    }

    /// Two ATX headings and one Setext heading are found, with the right levels.
    #[test]
    fn test_detect_headings() {
        let headings = MarkdownElements::detect_headings("# Heading 1\n## Heading 2\nText\nHeading 3\n===");

        let levels: Vec<Option<u32>> = headings.iter().map(MarkdownElements::get_heading_level).collect();
        assert_eq!(levels, vec![Some(1), Some(2), Some(1)]);
    }

    /// Each bullet kind and the ordered marker get their own metadata name.
    #[test]
    fn test_detect_lists() {
        let lists = MarkdownElements::detect_lists("- Item 1\n* Item 2\n+ Item 3\n1. Item 4");

        let markers: Vec<Option<&str>> = lists.iter().map(|item| item.metadata.as_deref()).collect();
        assert_eq!(
            markers,
            vec![Some("minus"), Some("asterisk"), Some("plus"), Some("ordered")]
        );
    }

    /// Front matter ends on the closing delimiter line.
    #[test]
    fn test_detect_front_matter() {
        let front_matter = MarkdownElements::detect_front_matter("---\ntitle: Test\n---\n# Content")
            .expect("front matter should be detected");

        assert_eq!(front_matter.end_line, 2);
    }

    /// Punctuation and spaces collapse into single hyphens.
    #[test]
    fn test_heading_to_fragment() {
        let cases = [
            ("Hello World!", "hello-world"),
            ("Complex: (Header) 123", "complex-header-123"),
        ];

        for (heading, fragment) in cases {
            assert_eq!(MarkdownElements::heading_to_fragment(heading), fragment);
        }
    }

    /// Fence lines and the lines between them are inside the block; the rest is not.
    #[test]
    fn test_is_line_in_code_block() {
        let content = "Text\n```\nCode\n```\nMore text";
        let expected = [false, true, true, true, false];

        for (line_number, inside) in expected.into_iter().enumerate() {
            assert_eq!(
                MarkdownElements::is_line_in_code_block(content, line_number),
                inside,
                "line {line_number}"
            );
        }
    }
}