rumdl_lib/utils/
markdown_elements.rs

1use lazy_static::lazy_static;
2use regex::Regex;
3use std::collections::HashSet;
4
/// Kinds of Markdown elements that the detectors in this module can report.
///
/// The type is a plain tag: it is `Copy`, comparable and hashable so it can be used
/// directly as a map or set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ElementType {
    /// Fenced code block (backtick or tilde fence).
    CodeBlock,
    /// Inline code span.
    CodeSpan,
    /// ATX (`#`) or setext (underlined) heading.
    Heading,
    /// A single list item line, ordered or unordered.
    List,
    /// `---`-delimited front matter at the very top of the document.
    FrontMatter,
}
14
/// Whether a detected element is well-formed.
///
/// Like [`ElementType`], this is a plain tag that is `Copy`, comparable and hashable.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ElementQuality {
    /// The element follows the expected syntax.
    Valid,
    /// The element was recognised but its syntax is broken
    /// (for example an unclosed code fence or a list marker without a following space).
    Malformed,
}
21
22/// Represents a detected element in a Markdown document
23#[derive(Debug, Clone)]
24pub struct MarkdownElement {
25    pub element_type: ElementType,
26    pub start_line: usize,
27    pub end_line: usize,
28    pub text: String,
29    pub metadata: Option<String>, // For code blocks: language, for headings: level, etc.
30    pub quality: ElementQuality,  // Whether the element is well-formed or malformed
31}
32
33lazy_static! {
34    // Code block patterns
35    static ref CODE_BLOCK_START: Regex = Regex::new(r"^(\s*)(```|~~~)(.*)$").unwrap();
36    static ref CODE_BLOCK_END: Regex = Regex::new(r"^(\s*)(```|~~~)\s*$").unwrap();
37    static ref CODE_SPAN_PATTERN: Regex = Regex::new(r"`+").unwrap();
38
39    // Heading patterns
40    static ref ATX_HEADING: Regex = Regex::new(r"^(\s*)(#{1,6})(\s*)([^#\n]*?)(?:\s+(#{1,6}))?\s*$").unwrap();
41    static ref ATX_HEADING_NO_SPACE: Regex = Regex::new(r"^(\s*)(#{1,6})([^#\s][^#\n]*?)(?:\s+(#{1,6}))?\s*$").unwrap();
42    static ref SETEXT_HEADING_1: Regex = Regex::new(r"^(\s*)(=+)(\s*)$").unwrap();
43    static ref SETEXT_HEADING_2: Regex = Regex::new(r"^(\s*)(-+)(\s*)$").unwrap();
44
45    // List patterns
46    static ref UNORDERED_LIST: Regex = Regex::new(r"^(\s*)([*+-])(\s+)").unwrap();
47    static ref ORDERED_LIST: Regex = Regex::new(r"^(\s*)(\d+\.)(\s+)").unwrap();
48
49    // Malformed list patterns
50    static ref MALFORMED_UNORDERED_LIST: Regex = Regex::new(r"^(\s*)([*+-])([^\s])").unwrap();
51    static ref MALFORMED_ORDERED_LIST: Regex = Regex::new(r"^(\s*)(\d+\.)([^\s])").unwrap();
52    static ref MALFORMED_ORDERED_LIST_WRONG_MARKER: Regex = Regex::new(r"^(\s*)(\d+[)\]])(\s*)").unwrap();
53
54    // Empty list patterns (just marker without content)
55    static ref EMPTY_UNORDERED_LIST: Regex = Regex::new(r"^(\s*)([*+-])\s*$").unwrap();
56
57    // Front matter pattern
58    static ref FRONT_MATTER_DELIMITER: Regex = Regex::new(r"^---\s*$").unwrap();
59}
60
/// Namespace for the Markdown element detectors.
///
/// The type carries no state; every detector is an associated function that takes
/// the document content as an argument.
#[derive(Debug, Clone, Copy, Default)]
pub struct MarkdownElements;
63
64impl MarkdownElements {
65    /// Detect all code blocks in the content
66    pub fn detect_code_blocks(content: &str) -> Vec<MarkdownElement> {
67        let mut blocks = Vec::new();
68        let mut in_code_block = false;
69        let mut block_start = 0;
70        let mut language = String::new();
71        let mut fence_type = String::new();
72
73        for (i, line) in content.lines().enumerate() {
74            if let Some(captures) = CODE_BLOCK_START.captures(line) {
75                if !in_code_block {
76                    block_start = i;
77                    in_code_block = true;
78                    fence_type = captures.get(2).unwrap().as_str().to_string();
79                    language = captures.get(3).map_or("", |m| m.as_str()).trim().to_string();
80                } else if line.trim().starts_with(&fence_type) {
81                    // End of code block
82                    blocks.push(MarkdownElement {
83                        element_type: ElementType::CodeBlock,
84                        start_line: block_start,
85                        end_line: i,
86                        text: content
87                            .lines()
88                            .skip(block_start)
89                            .take(i - block_start + 1)
90                            .collect::<Vec<&str>>()
91                            .join("\n"),
92                        metadata: Some(language.clone()),
93                        quality: ElementQuality::Valid,
94                    });
95                    in_code_block = false;
96                    language = String::new();
97                }
98            }
99        }
100
101        // Handle unclosed code blocks
102        if in_code_block {
103            let line_count = content.lines().count();
104            blocks.push(MarkdownElement {
105                element_type: ElementType::CodeBlock,
106                start_line: block_start,
107                end_line: line_count - 1,
108                text: content.lines().skip(block_start).collect::<Vec<&str>>().join("\n"),
109                metadata: Some(language),
110                quality: ElementQuality::Malformed, // Unclosed code block is malformed
111            });
112        }
113
114        blocks
115    }
116
117    /// Detect all code block line indices in the content
118    pub fn detect_code_block_lines(content: &str) -> HashSet<usize> {
119        let code_blocks = Self::detect_code_blocks(content);
120        let mut lines = HashSet::new();
121
122        for block in code_blocks {
123            for i in block.start_line..=block.end_line {
124                lines.insert(i);
125            }
126        }
127
128        lines
129    }
130
131    /// Check if position in a line is within a code span
132    pub fn is_in_code_span(line: &str, position: usize) -> bool {
133        let mut in_code_span = false;
134        let mut code_start = 0;
135
136        for (pos, c) in line.char_indices() {
137            if c == '`' {
138                if !in_code_span {
139                    in_code_span = true;
140                    code_start = pos;
141                } else {
142                    // Found end of code span, check if position is within
143                    if position >= code_start && position <= pos {
144                        return true;
145                    }
146                    in_code_span = false;
147                }
148            }
149
150            // Early return optimization
151            if pos > position && !in_code_span {
152                return false;
153            }
154        }
155
156        // Check if position is in an unclosed code span
157        in_code_span && position >= code_start
158    }
159
160    /// Detect all headings in the content
161    pub fn detect_headings(content: &str) -> Vec<MarkdownElement> {
162        let mut headings = Vec::new();
163        let lines: Vec<&str> = content.lines().collect();
164        let code_block_lines = Self::detect_code_block_lines(content);
165
166        // Get frontmatter to skip those lines
167        let frontmatter_lines = if let Some(frontmatter) = Self::detect_front_matter(content) {
168            (frontmatter.start_line..=frontmatter.end_line).collect::<HashSet<usize>>()
169        } else {
170            HashSet::new()
171        };
172
173        // Process each line
174        for (i, line) in lines.iter().enumerate() {
175            // Skip lines in code blocks or frontmatter
176            if code_block_lines.contains(&i) || frontmatter_lines.contains(&i) {
177                continue;
178            }
179
180            // Check for ATX style heading with proper space
181            if let Some(captures) = ATX_HEADING.captures(line) {
182                let hashes = captures.get(2).unwrap().as_str();
183                let level = hashes.len().to_string();
184                let text = captures.get(4).map_or("", |m| m.as_str()).trim().to_string();
185                let spaces_after_hash = captures.get(3).map_or("", |m| m.as_str()).len();
186
187                // Determine if heading is well-formed
188                // Special cases for empty headings: # and ###### are valid, others need space
189                let quality = if spaces_after_hash > 0 || (text.is_empty() && (hashes.len() == 1 || hashes.len() == 6))
190                {
191                    ElementQuality::Valid
192                } else {
193                    ElementQuality::Malformed
194                };
195
196                headings.push(MarkdownElement {
197                    element_type: ElementType::Heading,
198                    start_line: i,
199                    end_line: i,
200                    text,
201                    metadata: Some(level),
202                    quality,
203                });
204
205                continue;
206            }
207
208            // Check for ATX style heading without space after #
209            if let Some(captures) = ATX_HEADING_NO_SPACE.captures(line) {
210                let hashes = captures.get(2).unwrap().as_str();
211                let level = hashes.len().to_string();
212                let text = captures.get(3).map_or("", |m| m.as_str()).trim().to_string();
213
214                headings.push(MarkdownElement {
215                    element_type: ElementType::Heading,
216                    start_line: i,
217                    end_line: i,
218                    text,
219                    metadata: Some(level),
220                    quality: ElementQuality::Malformed, // No space after # makes it malformed
221                });
222
223                continue;
224            }
225
226            // Check for Setext style heading (requires looking at next line)
227            if i + 1 < lines.len() {
228                let next_line = lines[i + 1];
229
230                if SETEXT_HEADING_1.is_match(next_line) {
231                    headings.push(MarkdownElement {
232                        element_type: ElementType::Heading,
233                        start_line: i,
234                        end_line: i + 1,
235                        text: line.trim().to_string(),
236                        metadata: Some("1".to_string()), // Level 1 setext heading
237                        quality: ElementQuality::Valid,
238                    });
239
240                    continue;
241                }
242
243                if SETEXT_HEADING_2.is_match(next_line) {
244                    headings.push(MarkdownElement {
245                        element_type: ElementType::Heading,
246                        start_line: i,
247                        end_line: i + 1,
248                        text: line.trim().to_string(),
249                        metadata: Some("2".to_string()), // Level 2 setext heading
250                        quality: ElementQuality::Valid,
251                    });
252
253                    continue;
254                }
255            }
256        }
257
258        headings
259    }
260
261    /// Get heading level (1-6) for a heading element
262    pub fn get_heading_level(element: &MarkdownElement) -> Option<u32> {
263        if element.element_type != ElementType::Heading {
264            return None;
265        }
266
267        element.metadata.as_ref().and_then(|level| level.parse::<u32>().ok())
268    }
269
270    /// Detect all list items in the content
271    pub fn detect_lists(content: &str) -> Vec<MarkdownElement> {
272        let mut lists = Vec::new();
273        let lines: Vec<&str> = content.lines().collect();
274        let code_block_lines = Self::detect_code_block_lines(content);
275
276        // Get frontmatter to skip those lines
277        let frontmatter_lines = if let Some(frontmatter) = Self::detect_front_matter(content) {
278            (frontmatter.start_line..=frontmatter.end_line).collect::<HashSet<usize>>()
279        } else {
280            HashSet::new()
281        };
282
283        // Pattern to match horizontal rule or front matter markers
284        lazy_static! {
285            static ref HORIZONTAL_RULE: Regex = Regex::new(r"^(\s*)(-{3,}|\*{3,}|_{3,})(\s*)$").unwrap();
286        }
287
288        for (i, line) in lines.iter().enumerate() {
289            // Skip lines in code blocks or frontmatter
290            if code_block_lines.contains(&i) || frontmatter_lines.contains(&i) {
291                continue;
292            }
293
294            // Skip lines that are horizontal rules or front matter markers
295            if HORIZONTAL_RULE.is_match(line) {
296                continue;
297            }
298
299            // Check for well-formed unordered list items
300            if let Some(_captures) = UNORDERED_LIST.captures(line) {
301                let marker = if line.trim_start().starts_with('*') {
302                    "asterisk"
303                } else if line.trim_start().starts_with('+') {
304                    "plus"
305                } else {
306                    "minus"
307                };
308
309                lists.push(MarkdownElement {
310                    element_type: ElementType::List,
311                    start_line: i,
312                    end_line: i,
313                    text: line.trim().to_string(),
314                    metadata: Some(marker.to_string()),
315                    quality: ElementQuality::Valid,
316                });
317
318                continue;
319            }
320
321            // Check for empty unordered list items (just marker)
322            if let Some(_captures) = EMPTY_UNORDERED_LIST.captures(line) {
323                // Exclude horizontal rules and front matter markers
324                if line.trim() == "---" || line.trim() == "***" || line.trim() == "___" {
325                    continue;
326                }
327
328                let marker = if line.trim_start().starts_with('*') {
329                    "asterisk"
330                } else if line.trim_start().starts_with('+') {
331                    "plus"
332                } else {
333                    "minus"
334                };
335
336                lists.push(MarkdownElement {
337                    element_type: ElementType::List,
338                    start_line: i,
339                    end_line: i,
340                    text: String::new(), // Empty list item
341                    metadata: Some(marker.to_string()),
342                    quality: ElementQuality::Valid,
343                });
344
345                continue;
346            }
347
348            // Check for malformed unordered list (no space after marker)
349            if let Some(_captures) = MALFORMED_UNORDERED_LIST.captures(line) {
350                // Exclude horizontal rules and front matter markers which might match this pattern
351                if line.trim() == "---" || line.trim() == "***" || line.trim() == "___" {
352                    continue;
353                }
354
355                let marker = if line.trim_start().starts_with('*') {
356                    "asterisk:no_space"
357                } else if line.trim_start().starts_with('+') {
358                    "plus:no_space"
359                } else {
360                    "minus:no_space"
361                };
362
363                lists.push(MarkdownElement {
364                    element_type: ElementType::List,
365                    start_line: i,
366                    end_line: i,
367                    text: line.trim().to_string(),
368                    metadata: Some(marker.to_string()),
369                    quality: ElementQuality::Malformed,
370                });
371
372                continue;
373            }
374
375            // Check for well-formed ordered list items
376            if let Some(_captures) = ORDERED_LIST.captures(line) {
377                lists.push(MarkdownElement {
378                    element_type: ElementType::List,
379                    start_line: i,
380                    end_line: i,
381                    text: line.trim().to_string(),
382                    metadata: Some("ordered".to_string()),
383                    quality: ElementQuality::Valid,
384                });
385
386                continue;
387            }
388
389            // Check for malformed ordered list (no space after marker)
390            if let Some(_captures) = MALFORMED_ORDERED_LIST.captures(line) {
391                lists.push(MarkdownElement {
392                    element_type: ElementType::List,
393                    start_line: i,
394                    end_line: i,
395                    text: line.trim().to_string(),
396                    metadata: Some("ordered:no_space".to_string()),
397                    quality: ElementQuality::Malformed,
398                });
399
400                continue;
401            }
402
403            // Check for malformed ordered list (wrong marker type)
404            if let Some(_captures) = MALFORMED_ORDERED_LIST_WRONG_MARKER.captures(line) {
405                lists.push(MarkdownElement {
406                    element_type: ElementType::List,
407                    start_line: i,
408                    end_line: i,
409                    text: line.trim().to_string(),
410                    metadata: Some("ordered:wrong_marker".to_string()),
411                    quality: ElementQuality::Malformed,
412                });
413            }
414        }
415
416        lists
417    }
418
419    /// Detect front matter in content
420    pub fn detect_front_matter(content: &str) -> Option<MarkdownElement> {
421        let lines: Vec<&str> = content.lines().collect();
422
423        if lines.is_empty() || !FRONT_MATTER_DELIMITER.is_match(lines[0]) {
424            return None;
425        }
426
427        // Look for closing delimiter
428        for (i, line) in lines.iter().enumerate().skip(1) {
429            if FRONT_MATTER_DELIMITER.is_match(line) {
430                return Some(MarkdownElement {
431                    element_type: ElementType::FrontMatter,
432                    start_line: 0,
433                    end_line: i,
434                    text: lines[0..=i].join("\n"),
435                    metadata: None,
436                    quality: ElementQuality::Valid,
437                });
438            }
439        }
440
441        // Front matter without closing delimiter is malformed
442        None
443    }
444
445    /// Convert heading text to a valid ID for fragment links
446    pub fn heading_to_fragment(text: &str) -> String {
447        // Remove any HTML tags
448        let text_no_html = regex::Regex::new(r"<[^>]*>").unwrap().replace_all(text, "");
449
450        // Convert to lowercase and trim
451        let text_lower = text_no_html.trim().to_lowercase();
452
453        // Replace spaces and punctuation with hyphens
454        let text_with_hyphens = text_lower
455            .chars()
456            .map(|c| if c.is_alphanumeric() { c } else { '-' })
457            .collect::<String>();
458
459        // Replace multiple consecutive hyphens with a single hyphen
460        let text_clean = text_with_hyphens
461            .split('-')
462            .filter(|s| !s.is_empty())
463            .collect::<Vec<_>>()
464            .join("-");
465
466        // Remove leading and trailing hyphens
467        text_clean.trim_matches('-').to_string()
468    }
469
470    /// Check if a line is in a code block
471    pub fn is_line_in_code_block(content: &str, line_number: usize) -> bool {
472        let code_block_lines = Self::detect_code_block_lines(content);
473        code_block_lines.contains(&line_number)
474    }
475
476    /// Get all line indices in a given Markdown element
477    pub fn get_element_line_indices(element: &MarkdownElement) -> Vec<usize> {
478        (element.start_line..=element.end_line).collect()
479    }
480}
481
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_detect_code_blocks() {
        let content = "# Heading\n```js\nlet x = 1;\n```\nText";
        let blocks = MarkdownElements::detect_code_blocks(content);

        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].element_type, ElementType::CodeBlock);
        assert_eq!(blocks[0].start_line, 1);
        assert_eq!(blocks[0].end_line, 3);
        assert_eq!(blocks[0].metadata, Some("js".to_string()));
    }

    // An opening fence without a closing fence runs to the end and is malformed.
    #[test]
    fn test_detect_unclosed_code_block() {
        let content = "```rust\nfn main() {}";
        let blocks = MarkdownElements::detect_code_blocks(content);

        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].start_line, 0);
        assert_eq!(blocks[0].end_line, 1);
        assert_eq!(blocks[0].metadata, Some("rust".to_string()));
        assert_eq!(blocks[0].quality, ElementQuality::Malformed);
    }

    #[test]
    fn test_is_in_code_span() {
        let line = "Text with `code` and more";
        assert!(!MarkdownElements::is_in_code_span(line, 0));
        assert!(MarkdownElements::is_in_code_span(line, 11));
        assert!(!MarkdownElements::is_in_code_span(line, 20));
    }

    // An unclosed backtick opens a span that lasts until the end of the line.
    #[test]
    fn test_is_in_code_span_unclosed() {
        let line = "a `b";
        assert!(!MarkdownElements::is_in_code_span(line, 0));
        assert!(MarkdownElements::is_in_code_span(line, 3));
    }

    #[test]
    fn test_detect_headings() {
        let content = "# Heading 1\n## Heading 2\nText\nHeading 3\n===";
        let headings = MarkdownElements::detect_headings(content);

        assert_eq!(headings.len(), 3);
        assert_eq!(MarkdownElements::get_heading_level(&headings[0]), Some(1));
        assert_eq!(MarkdownElements::get_heading_level(&headings[1]), Some(2));
        assert_eq!(MarkdownElements::get_heading_level(&headings[2]), Some(1));
    }

    // Heading-like lines inside a fenced code block must not be reported.
    #[test]
    fn test_detect_headings_skips_code_blocks() {
        let content = "```\n# Not a heading\n```\n# Real";
        let headings = MarkdownElements::detect_headings(content);

        assert_eq!(headings.len(), 1);
        assert_eq!(headings[0].text, "Real");
        assert_eq!(headings[0].start_line, 3);
        assert_eq!(headings[0].quality, ElementQuality::Valid);
    }

    #[test]
    fn test_detect_lists() {
        let content = "- Item 1\n* Item 2\n+ Item 3\n1. Item 4";
        let lists = MarkdownElements::detect_lists(content);

        assert_eq!(lists.len(), 4);
        assert_eq!(lists[0].metadata, Some("minus".to_string()));
        assert_eq!(lists[1].metadata, Some("asterisk".to_string()));
        assert_eq!(lists[2].metadata, Some("plus".to_string()));
        assert_eq!(lists[3].metadata, Some("ordered".to_string()));
    }

    // Each malformed list flavour gets its own metadata tag and a Malformed quality.
    #[test]
    fn test_detect_malformed_lists() {
        let content = "-Item\n1.Item\n1) Item";
        let lists = MarkdownElements::detect_lists(content);

        assert_eq!(lists.len(), 3);
        assert_eq!(lists[0].metadata, Some("minus:no_space".to_string()));
        assert_eq!(lists[1].metadata, Some("ordered:no_space".to_string()));
        assert_eq!(lists[2].metadata, Some("ordered:wrong_marker".to_string()));
        assert!(lists.iter().all(|item| item.quality == ElementQuality::Malformed));
    }

    #[test]
    fn test_detect_front_matter() {
        let content = "---\ntitle: Test\n---\n# Content";
        let front_matter = MarkdownElements::detect_front_matter(content);

        assert!(front_matter.is_some());
        assert_eq!(front_matter.unwrap().end_line, 2);
    }

    // Front matter needs both an opening delimiter on line 0 and a closing delimiter.
    #[test]
    fn test_detect_front_matter_missing_delimiters() {
        assert!(MarkdownElements::detect_front_matter("---\ntitle: Test\n# Content").is_none());
        assert!(MarkdownElements::detect_front_matter("# Content\n---").is_none());
        assert!(MarkdownElements::detect_front_matter("").is_none());
    }

    #[test]
    fn test_heading_to_fragment() {
        assert_eq!(MarkdownElements::heading_to_fragment("Hello World!"), "hello-world");
        assert_eq!(
            MarkdownElements::heading_to_fragment("Complex: (Header) 123"),
            "complex-header-123"
        );
    }

    // HTML tags are dropped before the fragment is built.
    #[test]
    fn test_heading_to_fragment_strips_html() {
        assert_eq!(MarkdownElements::heading_to_fragment("<b>Bold</b> text"), "bold-text");
    }

    // Only heading elements carry a heading level.
    #[test]
    fn test_get_heading_level_rejects_non_headings() {
        let lists = MarkdownElements::detect_lists("- Item");

        assert_eq!(lists.len(), 1);
        assert_eq!(MarkdownElements::get_heading_level(&lists[0]), None);
    }

    #[test]
    fn test_is_line_in_code_block() {
        let content = "Text\n```\nCode\n```\nMore text";
        assert!(!MarkdownElements::is_line_in_code_block(content, 0));
        assert!(MarkdownElements::is_line_in_code_block(content, 1));
        assert!(MarkdownElements::is_line_in_code_block(content, 2));
        assert!(MarkdownElements::is_line_in_code_block(content, 3));
        assert!(!MarkdownElements::is_line_in_code_block(content, 4));
    }
}