rumdl_lib/utils/
element_cache.rs

1use fancy_regex::Regex as FancyRegex;
2use regex::Regex;
3use std::sync::LazyLock;
4use std::sync::{Arc, Mutex};
5
// Precompiled regex patterns. Each one is compiled lazily on first use and then reused.

/// Matches a fence line. Captures: (1) leading whitespace, (2) the three-character
/// fence marker (three backticks or `~~~`), (3) everything after the marker.
static CODE_BLOCK_START_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(```|~~~)(.*)$").unwrap());
/// Matches a line that starts with at least four whitespace characters followed by
/// at least one more character. Captures: (1) the leading whitespace, (2) the rest.
static INDENTED_CODE_BLOCK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s{4,})(.+)$").unwrap());

// List detection patterns.
// Both expose the named groups `indent`, `marker`, `after` and `content`.
// The whitespace after the marker is optional (`[ \t]*`), so lines such as `-text`
// also match.

/// Unordered list item: optional space/tab indent, then one of `*`, `+`, `-`.
static UNORDERED_LIST_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| {
    FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>[*+-])(?P<after>[ \t]*)(?P<content>.*)$").unwrap()
});
/// Ordered list item: optional space/tab indent, then one or more digits and a `.`.
static ORDERED_LIST_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| {
    FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>\d+\.)(?P<after>[ \t]*)(?P<content>.*)$").unwrap()
});

// Inline code span pattern

/// Matches a run of one or more consecutive backticks.
static CODE_SPAN_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"`+").unwrap());
20
/// Represents a range in the document with start and end positions
///
/// `ElementCache::is_in_code_span` treats the range as half-open:
/// `start` is included, `end` is not.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Range {
    /// Position of the first element covered by the range.
    pub start: usize,
    /// Position just past the last element covered by the range.
    pub end: usize,
}
27
/// Represents the type of code block
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CodeBlockType {
    /// Block delimited by fence lines (three backticks or `~~~`).
    Fenced,
    /// Line recognised by its leading indentation (four or more whitespace characters).
    Indented,
}
34
/// Represents a code block in the document
#[derive(Debug, Clone)]
pub struct CodeBlock {
    /// Position range covered by the block within the document text.
    pub range: Range,
    /// Whether the block is fenced or indented.
    pub block_type: CodeBlockType,
    /// First line of the block (1-indexed). For fenced blocks this is the opening fence.
    pub start_line: usize,
    /// Last line of the block (1-indexed). For closed fenced blocks this is the closing fence.
    pub end_line: usize,
    /// Trimmed text following the opening fence marker; `None` when that text is
    /// empty and for indented blocks.
    pub language: Option<String>,
}
44
/// Represents the type of list marker
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ListMarkerType {
    /// Unordered item introduced by `*`.
    Asterisk,
    /// Unordered item introduced by `+`.
    Plus,
    /// Unordered item introduced by `-`.
    Minus,
    /// Ordered item introduced by digits followed by `.` (for example `1.`).
    Ordered,
}
53
/// Represents a list item in the document
#[derive(Debug, Clone)]
pub struct ListItem {
    /// Line on which the item starts.
    pub line_number: usize, // 1-indexed
    /// Visual width of the leading whitespace, with tabs expanded to tab stops of 4.
    pub indentation: usize,
    /// The leading whitespace exactly as written (after any blockquote prefix was removed).
    pub indent_str: String, // Actual leading whitespace
    /// Kind of marker that introduces the item.
    pub marker_type: ListMarkerType,
    /// Marker text as written, e.g. `-` or `1.`.
    pub marker: String,
    /// Item text after the marker, with leading whitespace trimmed.
    pub content: String,
    /// Byte length of the spaces/tabs between the marker and the content.
    pub spaces_after_marker: usize,
    /// Nesting depth; 0 for top-level items.
    pub nesting_level: usize,
    /// Line number of the parent item, if any.
    /// NOTE(review): verify that `parse_list_item` really stores a line number here —
    /// the tuples it reads the value from carry (blockquote_depth, indent, nesting_level).
    pub parent_line_number: Option<usize>,
    pub blockquote_depth: usize,   // Number of leading blockquote markers
    pub blockquote_prefix: String, // The actual prefix (e.g., "> > ")
}
69
/// Cache for Markdown document structural elements
/// This allows sharing computed data across multiple rule checks
///
/// A cache built with `ElementCache::new` owns a copy of the document text together
/// with the code blocks, code spans and list items detected in it. The `Default`
/// value holds no content and empty collections.
#[derive(Debug, Default, Clone)]
pub struct ElementCache {
    // Document content and metadata
    /// Copy of the analysed text; `None` only for a default-constructed cache.
    content: Option<String>,
    /// Number of lines in `content` as counted by `str::lines`.
    line_count: usize,

    // Code blocks
    /// Detected code blocks in document order.
    code_blocks: Vec<CodeBlock>,
    /// One flag per line (0-indexed).
    code_block_line_map: Vec<bool>, // Line index -> is in code block

    // Code spans (inline code)
    /// Ranges of inline code spans, including their backtick delimiters.
    code_spans: Vec<Range>,

    // Lists
    /// Detected list items in document order.
    list_items: Vec<ListItem>,
    /// One flag per line (0-indexed).
    list_line_map: Vec<bool>, // Line index -> is list item
}
89
90impl ElementCache {
91    /// Create a new cache from document content
92    pub fn new(content: &str) -> Self {
93        let mut cache = ElementCache {
94            content: Some(content.to_string()),
95            line_count: content.lines().count(),
96            code_blocks: Vec::new(),
97            code_block_line_map: Vec::new(),
98            code_spans: Vec::new(),
99            list_items: Vec::new(),
100            list_line_map: Vec::new(),
101        };
102
103        // Initialize maps
104        cache.code_block_line_map = vec![false; cache.line_count];
105        cache.list_line_map = vec![false; cache.line_count];
106
107        // Populate the cache
108        cache.populate_code_blocks();
109        cache.populate_code_spans();
110        cache.populate_list_items();
111
112        cache
113    }
114
115    /// Calculate the visual indentation width of a string, expanding tabs to spaces
116    /// Default tab width is 4 spaces
117    fn calculate_indentation_width(indent_str: &str, tab_width: usize) -> usize {
118        let mut width = 0;
119        for ch in indent_str.chars() {
120            if ch == '\t' {
121                // Round up to next tab stop
122                width = ((width / tab_width) + 1) * tab_width;
123            } else if ch == ' ' {
124                width += 1;
125            } else {
126                // Non-whitespace character, stop counting
127                break;
128            }
129        }
130        width
131    }
132
133    /// Calculate the visual indentation width using default tab width of 4
134    fn calculate_indentation_width_default(indent_str: &str) -> usize {
135        Self::calculate_indentation_width(indent_str, 4)
136    }
137
138    /// Check if a line is within a code block
139    pub fn is_in_code_block(&self, line_num: usize) -> bool {
140        if line_num == 0 || line_num > self.code_block_line_map.len() {
141            return false;
142        }
143        self.code_block_line_map[line_num - 1] // Convert 1-indexed to 0-indexed
144    }
145
146    /// Check if a position is within a code span
147    pub fn is_in_code_span(&self, position: usize) -> bool {
148        self.code_spans
149            .iter()
150            .any(|span| position >= span.start && position < span.end)
151    }
152
153    /// Check if a line is a list item
154    pub fn is_list_item(&self, line_num: usize) -> bool {
155        if line_num == 0 || line_num > self.list_line_map.len() {
156            return false;
157        }
158        self.list_line_map[line_num - 1] // Convert 1-indexed to 0-indexed
159    }
160
161    /// Get list item at line
162    pub fn get_list_item(&self, line_num: usize) -> Option<&ListItem> {
163        self.list_items.iter().find(|item| item.line_number == line_num)
164    }
165
166    /// Get all list items
167    pub fn get_list_items(&self) -> &[ListItem] {
168        &self.list_items
169    }
170
171    /// Get all code blocks
172    pub fn get_code_blocks(&self) -> &[CodeBlock] {
173        &self.code_blocks
174    }
175
176    /// Get all code spans
177    pub fn get_code_spans(&self) -> &[Range] {
178        &self.code_spans
179    }
180
181    /// Detect and populate code blocks
182    fn populate_code_blocks(&mut self) {
183        if let Some(content) = &self.content {
184            let lines: Vec<&str> = content.lines().collect();
185            let mut in_fenced_block = false;
186            let mut fence_marker = String::new();
187            let mut block_start_line = 0;
188            let mut block_language = String::new();
189
190            for (i, line) in lines.iter().enumerate() {
191                if in_fenced_block {
192                    // Already in a fenced code block, look for the end
193                    self.code_block_line_map[i] = true;
194
195                    if line.trim().starts_with(&fence_marker) {
196                        // End of code block
197                        let start_pos =
198                            lines[0..block_start_line].join("\n").len() + if block_start_line > 0 { 1 } else { 0 };
199                        let end_pos = lines[0..=i].join("\n").len();
200
201                        self.code_blocks.push(CodeBlock {
202                            range: Range {
203                                start: start_pos,
204                                end: end_pos,
205                            },
206                            block_type: CodeBlockType::Fenced,
207                            start_line: block_start_line + 1, // 1-indexed
208                            end_line: i + 1,                  // 1-indexed
209                            language: if !block_language.is_empty() {
210                                Some(block_language.clone())
211                            } else {
212                                None
213                            },
214                        });
215
216                        in_fenced_block = false;
217                        fence_marker.clear();
218                        block_language.clear();
219                    }
220                } else if let Some(caps) = CODE_BLOCK_START_REGEX.captures(line) {
221                    // Start of a new code block
222                    fence_marker = caps.get(2).map_or("```", |m| m.as_str()).to_string();
223                    in_fenced_block = true;
224                    block_start_line = i;
225                    block_language = caps.get(3).map_or("", |m| m.as_str().trim()).to_string();
226                    self.code_block_line_map[i] = true;
227                } else if INDENTED_CODE_BLOCK_REGEX.is_match(line) {
228                    // Only mark as indented code block if not a list item
229                    let is_unordered_list = UNORDERED_LIST_REGEX.is_match(line).unwrap_or(false);
230                    let is_ordered_list = ORDERED_LIST_REGEX.is_match(line).unwrap_or(false);
231                    if !is_unordered_list && !is_ordered_list {
232                        // Indented code block
233                        self.code_block_line_map[i] = true;
234                        // For indented code blocks, we handle them as individual lines
235                        // We don't track them as blocks with start/end because they can be
236                        // interrupted by blank lines, etc.
237                        let start_pos = lines[0..i].join("\n").len() + if i > 0 { 1 } else { 0 };
238                        let end_pos = start_pos + line.len();
239                        self.code_blocks.push(CodeBlock {
240                            range: Range {
241                                start: start_pos,
242                                end: end_pos,
243                            },
244                            block_type: CodeBlockType::Indented,
245                            start_line: i + 1, // 1-indexed
246                            end_line: i + 1,   // 1-indexed
247                            language: None,
248                        });
249                    }
250                }
251            }
252
253            // Handle unclosed code block
254            if in_fenced_block {
255                let start_pos = lines[0..block_start_line].join("\n").len() + if block_start_line > 0 { 1 } else { 0 };
256                let end_pos = content.len();
257
258                self.code_blocks.push(CodeBlock {
259                    range: Range {
260                        start: start_pos,
261                        end: end_pos,
262                    },
263                    block_type: CodeBlockType::Fenced,
264                    start_line: block_start_line + 1, // 1-indexed
265                    end_line: lines.len(),            // 1-indexed
266                    language: if !block_language.is_empty() {
267                        Some(block_language)
268                    } else {
269                        None
270                    },
271                });
272            }
273        }
274    }
275
276    /// Detect and populate code spans
277    fn populate_code_spans(&mut self) {
278        if let Some(content) = &self.content {
279            // Find inline code spans using regex for backticks
280            let mut i = 0;
281            while i < content.len() {
282                if let Some(m) = CODE_SPAN_REGEX.find_at(content, i) {
283                    let backtick_length = m.end() - m.start();
284                    let start = m.start();
285
286                    // Find matching closing backticks
287                    if let Some(end_pos) = content[m.end()..].find(&"`".repeat(backtick_length)) {
288                        let end = m.end() + end_pos + backtick_length;
289                        self.code_spans.push(Range { start, end });
290                        i = end;
291                    } else {
292                        i = m.end();
293                    }
294                } else {
295                    break;
296                }
297            }
298        }
299    }
300
301    /// Detect and populate list items
302    fn populate_list_items(&mut self) {
303        if let Some(content) = &self.content {
304            let lines: Vec<&str> = content.lines().collect();
305            let mut prev_items: Vec<(usize, usize, usize)> = Vec::new(); // (blockquote_depth, nesting_level, line_number)
306            for (i, line) in lines.iter().enumerate() {
307                // Skip blank lines but don't reset nesting context
308                if line.trim().is_empty() {
309                    continue;
310                }
311                // Parse and strip blockquote prefix
312                let (blockquote_depth, blockquote_prefix, rest) = Self::parse_blockquote_prefix(line);
313                // Always call parse_list_item and always push if Some
314                if let Some(item) = self.parse_list_item(
315                    rest,
316                    i + 1,
317                    &mut prev_items,
318                    blockquote_depth,
319                    blockquote_prefix.clone(),
320                ) {
321                    self.list_items.push(item);
322                    self.list_line_map[i] = true;
323                }
324            }
325        }
326    }
327
328    /// Parse and strip all leading blockquote markers, returning (depth, prefix, rest_of_line)
329    fn parse_blockquote_prefix(line: &str) -> (usize, String, &str) {
330        let mut rest = line;
331        let mut prefix = String::new();
332        let mut depth = 0;
333        loop {
334            let trimmed = rest.trim_start();
335            if let Some(after) = trimmed.strip_prefix('>') {
336                // Find the '>' and a single optional space
337                let mut chars = after.chars();
338                let mut space_count = 0;
339                if let Some(' ') = chars.next() {
340                    space_count = 1;
341                }
342                let (spaces, after_marker) = after.split_at(space_count);
343                prefix.push('>');
344                prefix.push_str(spaces);
345                rest = after_marker;
346                depth += 1;
347            } else {
348                break;
349            }
350        }
351        (depth, prefix, rest)
352    }
353
    /// Calculate the nesting level for a list item, considering blockquote depth
    ///
    /// `prev_items` is the nesting context built up by earlier calls; each entry is
    /// `(blockquote_depth, indentation, nesting_level)`. Only entries with the same
    /// blockquote depth as the current item are considered.
    ///
    /// The level is derived by comparing `indent` with the most recent entry of the
    /// same blockquote depth: deeper indentation nests one level further, equal
    /// indentation keeps the level, and shallower indentation falls back to an
    /// earlier entry (see the comments in the `Less` arm). Without any earlier entry
    /// the level is 0.
    ///
    /// Before returning, the context is updated: trailing entries of the same
    /// blockquote depth whose indentation is not smaller than `indent` are removed,
    /// and an entry for the current item is pushed.
    fn calculate_nesting_level(
        &self,
        indent: usize,
        blockquote_depth: usize,
        prev_items: &mut Vec<(usize, usize, usize)>,
    ) -> usize {
        let mut nesting_level = 0;

        // Only consider previous items with the same blockquote depth
        if let Some(&(_last_bq, last_indent, last_level)) =
            prev_items.iter().rev().find(|(bq, _, _)| *bq == blockquote_depth)
        {
            use std::cmp::Ordering;
            match indent.cmp(&last_indent) {
                Ordering::Greater => {
                    // More indented - increase nesting level
                    nesting_level = last_level + 1;
                }
                Ordering::Equal => {
                    // Same indentation - same level
                    nesting_level = last_level;
                }
                Ordering::Less => {
                    // Less indented - find the appropriate level
                    let mut found_level = None;

                    // First look for exact match
                    for &(prev_bq, prev_indent, prev_level) in prev_items.iter().rev() {
                        if prev_bq == blockquote_depth && prev_indent == indent {
                            found_level = Some(prev_level);
                            break;
                        }
                    }

                    // If no exact match, check if this is a case where we should treat similar indentations as same level
                    // This handles mixed tab/space scenarios where 4 and 6 spaces should be at the same level
                    if found_level.is_none() && indent > 0 && last_indent > 0 {
                        // Only apply similar indentation logic if the difference is small and we're dealing with small indentations
                        let diff = (indent as i32 - last_indent as i32).abs();
                        if diff <= 2 && indent <= 8 && last_indent <= 8 {
                            // Check if there's a recent item at a lower indentation level
                            // (only the three most recent entries are inspected; in this arm
                            // `indent < last_indent`, so the bound below equals `indent`)
                            let has_lower_indent = prev_items.iter().rev().take(3).any(|(bq, prev_indent, _)| {
                                *bq == blockquote_depth && *prev_indent < indent.min(last_indent)
                            });
                            if has_lower_indent {
                                found_level = Some(last_level);
                            }
                        }
                    }

                    // If still no match, look for the most recent less indented item
                    // and reuse its level
                    if found_level.is_none() {
                        for &(prev_bq, prev_indent, prev_level) in prev_items.iter().rev() {
                            if prev_bq == blockquote_depth && prev_indent < indent {
                                found_level = Some(prev_level);
                                break;
                            }
                        }
                    }

                    // Nothing suitable found: treat the item as top-level
                    nesting_level = found_level.unwrap_or(0);
                }
            }
        }

        // Remove stack entries with indent >= current indent and same blockquote depth
        // (popping stops at the first entry that has a different blockquote depth or
        // a smaller indent)
        while let Some(&(prev_bq, prev_indent, _)) = prev_items.last() {
            if prev_bq != blockquote_depth || prev_indent < indent {
                break;
            }
            prev_items.pop();
        }
        // Record the current item as the newest entry of the context
        prev_items.push((blockquote_depth, indent, nesting_level));
        nesting_level
    }
430
431    /// Parse a line as a list item and determine its nesting level
432    fn parse_list_item(
433        &self,
434        line: &str,
435        line_num: usize,
436        prev_items: &mut Vec<(usize, usize, usize)>,
437        blockquote_depth: usize,
438        blockquote_prefix: String,
439    ) -> Option<ListItem> {
440        match UNORDERED_LIST_REGEX.captures(line) {
441            Ok(Some(captures)) => {
442                let indent_str = captures.name("indent").map_or("", |m| m.as_str()).to_string();
443                let indentation = Self::calculate_indentation_width_default(&indent_str);
444                let marker = captures.name("marker").unwrap().as_str();
445                let after = captures.name("after").map_or("", |m| m.as_str());
446                let spaces = after.len();
447                let raw_content = captures.name("content").map_or("", |m| m.as_str());
448                let content = raw_content.trim_start().to_string();
449                let marker_type = match marker {
450                    "*" => ListMarkerType::Asterisk,
451                    "+" => ListMarkerType::Plus,
452                    "-" => ListMarkerType::Minus,
453                    other => {
454                        // This should never happen due to regex validation,
455                        // but default to dash if it does
456                        eprintln!("Warning: Unexpected list marker '{other}', defaulting to dash");
457                        ListMarkerType::Minus
458                    }
459                };
460                let nesting_level = self.calculate_nesting_level(indentation, blockquote_depth, prev_items);
461                // Find parent: most recent previous item with lower nesting_level and same blockquote depth
462                let parent_line_number = prev_items
463                    .iter()
464                    .rev()
465                    .find(|(bq, _, level)| *bq == blockquote_depth && *level < nesting_level)
466                    .map(|(_, _, line_num)| *line_num);
467                return Some(ListItem {
468                    line_number: line_num,
469                    indentation,
470                    indent_str,
471                    marker_type,
472                    marker: marker.to_string(),
473                    content,
474                    spaces_after_marker: spaces,
475                    nesting_level,
476                    parent_line_number,
477                    blockquote_depth,
478                    blockquote_prefix,
479                });
480            }
481            Ok(None) => {
482                // No debug output
483            }
484            Err(_) => {}
485        }
486        match ORDERED_LIST_REGEX.captures(line) {
487            Ok(Some(captures)) => {
488                let indent_str = captures.name("indent").map_or("", |m| m.as_str()).to_string();
489                let indentation = Self::calculate_indentation_width_default(&indent_str);
490                let marker = captures.name("marker").unwrap().as_str();
491                let spaces = captures.name("after").map_or(0, |m| m.as_str().len());
492                let content = captures
493                    .name("content")
494                    .map_or("", |m| m.as_str())
495                    .trim_start()
496                    .to_string();
497                let nesting_level = self.calculate_nesting_level(indentation, blockquote_depth, prev_items);
498                // Find parent: most recent previous item with lower nesting_level and same blockquote depth
499                let parent_line_number = prev_items
500                    .iter()
501                    .rev()
502                    .find(|(bq, _, level)| *bq == blockquote_depth && *level < nesting_level)
503                    .map(|(_, _, line_num)| *line_num);
504                return Some(ListItem {
505                    line_number: line_num,
506                    indentation,
507                    indent_str,
508                    marker_type: ListMarkerType::Ordered,
509                    marker: marker.to_string(),
510                    content,
511                    spaces_after_marker: spaces,
512                    nesting_level,
513                    parent_line_number,
514                    blockquote_depth,
515                    blockquote_prefix,
516                });
517            }
518            Ok(None) => {}
519            Err(_) => {}
520        }
521        None
522    }
523}
524
// Global cache for sharing across threads
//
// Holds at most one `ElementCache` (the most recently built one), or `None` when
// nothing has been cached yet or the cache was reset. Access goes through the mutex.
// NOTE(review): a `static` is already shared between threads, so the `Arc` layer
// looks unnecessary here — confirm before simplifying.
static ELEMENT_CACHE: LazyLock<Arc<Mutex<Option<ElementCache>>>> = LazyLock::new(|| Arc::new(Mutex::new(None)));
527
528/// Get or create element cache for document content
529///
530/// If the mutex is poisoned, creates a fresh cache without storing it globally.
531/// This ensures the library never panics due to mutex poisoning.
532pub fn get_element_cache(content: &str) -> ElementCache {
533    // Try to get existing cache
534    if let Ok(cache_guard) = ELEMENT_CACHE.lock() {
535        // If cache exists and content matches, return it
536        if let Some(existing_cache) = &*cache_guard
537            && let Some(cached_content) = &existing_cache.content
538            && cached_content == content
539        {
540            return existing_cache.clone();
541        }
542    }
543
544    // Content doesn't match or mutex poisoned, create new cache
545    let new_cache = ElementCache::new(content);
546
547    // Store in global cache (ignore if mutex is poisoned)
548    if let Ok(mut cache_guard) = ELEMENT_CACHE.lock() {
549        *cache_guard = Some(new_cache.clone());
550    }
551
552    new_cache
553}
554
555/// Reset the element cache
556///
557/// If the mutex is poisoned, this is a no-op.
558pub fn reset_element_cache() {
559    if let Ok(mut cache_guard) = ELEMENT_CACHE.lock() {
560        *cache_guard = None;
561    }
562}
563
564#[cfg(test)]
565mod tests {
566    use super::*;
567
568    #[test]
569    fn test_code_block_detection() {
570        let content = "Regular text\n\n```rust\nfn main() {\n    println!(\"Hello\");\n}\n```\n\nMore text";
571        let cache = ElementCache::new(content);
572
573        assert_eq!(cache.code_blocks.len(), 1);
574        assert_eq!(cache.code_blocks[0].start_line, 3);
575        assert_eq!(cache.code_blocks[0].end_line, 7);
576        assert_eq!(cache.code_blocks[0].block_type, CodeBlockType::Fenced);
577        assert_eq!(cache.code_blocks[0].language, Some("rust".to_string()));
578
579        assert!(!cache.is_in_code_block(1));
580        assert!(!cache.is_in_code_block(2));
581        assert!(cache.is_in_code_block(3));
582        assert!(cache.is_in_code_block(4));
583        assert!(cache.is_in_code_block(5));
584        assert!(cache.is_in_code_block(6));
585        assert!(cache.is_in_code_block(7));
586        assert!(!cache.is_in_code_block(8));
587        assert!(!cache.is_in_code_block(9));
588    }
589
590    #[test]
591    fn test_list_item_detection_simple() {
592        let content =
593            "# Heading\n\n- First item\n  - Nested item\n- Second item\n\n1. Ordered item\n   1. Nested ordered\n";
594        let cache = ElementCache::new(content);
595        assert_eq!(cache.list_items.len(), 5);
596        // Check the first item
597        assert_eq!(cache.list_items[0].line_number, 3);
598        assert_eq!(cache.list_items[0].marker, "-");
599        assert_eq!(cache.list_items[0].nesting_level, 0);
600        // Check the nested item
601        assert_eq!(cache.list_items[1].line_number, 4);
602        assert_eq!(cache.list_items[1].marker, "-");
603        assert_eq!(cache.list_items[1].nesting_level, 1);
604        // Check the second list item
605        assert_eq!(cache.list_items[2].line_number, 5);
606        assert_eq!(cache.list_items[2].marker, "-");
607        assert_eq!(cache.list_items[2].nesting_level, 0);
608        // Check ordered list item
609        assert_eq!(cache.list_items[3].line_number, 7);
610        assert_eq!(cache.list_items[3].marker, "1.");
611        assert_eq!(cache.list_items[3].nesting_level, 0);
612        // Check nested ordered list item
613        assert_eq!(cache.list_items[4].line_number, 8);
614        assert_eq!(cache.list_items[4].marker, "1.");
615        assert_eq!(cache.list_items[4].nesting_level, 1);
616    }
617
618    #[test]
619    fn test_list_item_detection_complex() {
620        let complex = "  * Level 1 item 1\n    - Level 2 item 1\n      + Level 3 item 1\n    - Level 2 item 2\n  * Level 1 item 2\n\n* Top\n  + Nested\n    - Deep\n      * Deeper\n        + Deepest\n";
621        let cache = ElementCache::new(complex);
622
623        // Should detect all 10 list items
624        assert_eq!(cache.list_items.len(), 10);
625        // Check markers and nesting levels
626        assert_eq!(cache.list_items[0].marker, "*");
627        assert_eq!(cache.list_items[0].nesting_level, 0);
628        assert_eq!(cache.list_items[1].marker, "-");
629        assert_eq!(cache.list_items[1].nesting_level, 1);
630        assert_eq!(cache.list_items[2].marker, "+");
631        assert_eq!(cache.list_items[2].nesting_level, 2);
632        assert_eq!(cache.list_items[3].marker, "-");
633        assert_eq!(cache.list_items[3].nesting_level, 1);
634        assert_eq!(cache.list_items[4].marker, "*");
635        assert_eq!(cache.list_items[4].nesting_level, 0);
636        assert_eq!(cache.list_items[5].marker, "*");
637        assert_eq!(cache.list_items[5].nesting_level, 0);
638        assert_eq!(cache.list_items[6].marker, "+");
639        assert_eq!(cache.list_items[6].nesting_level, 1);
640        assert_eq!(cache.list_items[7].marker, "-");
641        assert_eq!(cache.list_items[7].nesting_level, 2);
642        assert_eq!(cache.list_items[8].marker, "*");
643        assert_eq!(cache.list_items[8].nesting_level, 3);
644        assert_eq!(cache.list_items[9].marker, "+");
645        assert_eq!(cache.list_items[9].nesting_level, 4);
646        let expected_nesting = vec![0, 1, 2, 1, 0, 0, 1, 2, 3, 4];
647        let actual_nesting: Vec<_> = cache.list_items.iter().map(|item| item.nesting_level).collect();
648        assert_eq!(
649            actual_nesting, expected_nesting,
650            "Nesting levels should match expected values"
651        );
652    }
653
654    #[test]
655    fn test_list_item_detection_edge() {
656        let edge = "* Item 1\n\n    - Nested 1\n  + Nested 2\n\n* Item 2\n";
657        let cache = ElementCache::new(edge);
658        assert_eq!(cache.list_items.len(), 4);
659
660        // Check correct nesting levels according to CommonMark:
661        // * Item 1 (indent=0) -> level 0
662        // - Nested 1 (indent=4) -> level 1 (nested under Item 1)
663        // + Nested 2 (indent=2) -> level 1 (nested under Item 1)
664        // * Item 2 (indent=0) -> level 0
665        let expected_nesting = vec![0, 1, 1, 0];
666        let actual_nesting: Vec<_> = cache.list_items.iter().map(|item| item.nesting_level).collect();
667        assert_eq!(
668            actual_nesting, expected_nesting,
669            "Nesting levels should be calculated based on indentation, not reset by blank lines"
670        );
671    }
672
673    #[test]
674    fn test_code_span_detection() {
675        let content = "Here is some `inline code` and here are ``nested `code` spans``";
676        let cache = ElementCache::new(content);
677
678        // Should have two code spans
679        assert_eq!(cache.code_spans.len(), 2);
680
681        // Check spans
682        let span1_content = &content[cache.code_spans[0].start..cache.code_spans[0].end];
683        assert_eq!(span1_content, "`inline code`");
684
685        let span2_content = &content[cache.code_spans[1].start..cache.code_spans[1].end];
686        assert_eq!(span2_content, "``nested `code` spans``");
687    }
688
689    #[test]
690    fn test_get_element_cache() {
691        let content1 = "Test content";
692        let content2 = "Different content";
693
694        // First call should create a new cache
695        let cache1 = get_element_cache(content1);
696
697        // Second call with same content should return the same cache
698        let cache2 = get_element_cache(content1);
699
700        // Third call with different content should create new cache
701        let cache3 = get_element_cache(content2);
702
703        assert_eq!(cache1.content.as_ref().unwrap(), content1);
704        assert_eq!(cache2.content.as_ref().unwrap(), content1);
705        assert_eq!(cache3.content.as_ref().unwrap(), content2);
706    }
707
708    #[test]
709    fn test_list_item_detection_deep_nesting_and_edge_cases() {
710        // Deeply nested unordered lists, mixed markers, excessive indentation, tabs, and blank lines
711        let content = "\
712* Level 1
713  - Level 2
714    + Level 3
715      * Level 4
716        - Level 5
717          + Level 6
718* Sibling 1
719    * Sibling 2
720\n    - After blank line, not nested\n\n\t* Tab indented\n        * 8 spaces indented\n* After excessive indent\n";
721        let cache = ElementCache::new(content);
722        // Should detect all lines that start with a valid unordered list marker
723        let _expected_markers = ["*", "-", "+", "*", "-", "+", "*", "*", "-", "*", "*", "*"];
724        let _expected_indents = [0, 4, 8, 0, 4, 8, 0, 4, 8, 12, 16, 20];
725        let expected_content = vec![
726            "Level 1",
727            "Level 2",
728            "Level 3",
729            "Level 4",
730            "Level 5",
731            "Level 6",
732            "Sibling 1",
733            "Sibling 2",
734            "After blank line, not nested",
735            "Tab indented",      // Content after marker
736            "8 spaces indented", // Content after marker
737            "After excessive indent",
738        ];
739        let actual_content: Vec<_> = cache.list_items.iter().map(|item| item.content.clone()).collect();
740        assert_eq!(
741            actual_content, expected_content,
742            "List item contents should match expected values"
743        );
744        // Updated expected nesting levels based on correct CommonMark behavior:
745        // Blank lines should NOT reset nesting context
746        let expected_nesting = vec![0, 1, 2, 3, 4, 5, 0, 1, 1, 1, 2, 0];
747        let actual_nesting: Vec<_> = cache.list_items.iter().map(|item| item.nesting_level).collect();
748        assert_eq!(
749            actual_nesting, expected_nesting,
750            "Nesting levels should match expected values"
751        );
752        // Check that tab-indented and 8-space-indented items are detected
753        assert!(
754            cache
755                .list_items
756                .iter()
757                .any(|item| item.marker == "*" && item.indentation >= 1),
758            "Tab or 8-space indented item not detected"
759        );
760        // Check that after blank lines, items maintain correct nesting based on indentation
761        let after_blank = cache
762            .list_items
763            .iter()
764            .find(|item| item.content.contains("After blank line"));
765        assert!(after_blank.is_some());
766        assert_eq!(
767            after_blank.unwrap().nesting_level,
768            1,
769            "Item after blank line should maintain nesting based on indentation"
770        );
771    }
772
773    #[test]
774    fn test_tab_indentation_calculation() {
775        // Test that tabs are properly converted to spaces for indentation calculation
776        let content = "* Level 0\n\t* Tab indented (should be level 1)\n\t\t* Double tab (should be level 2)\n    * 4 spaces (should be level 1)\n        * 8 spaces (should be level 2)\n";
777        let cache = ElementCache::new(content);
778
779        assert_eq!(cache.list_items.len(), 5);
780
781        // Check indentation values (tabs should be converted to spaces)
782        assert_eq!(cache.list_items[0].indentation, 0); // "* Level 0"
783        assert_eq!(cache.list_items[1].indentation, 4); // "\t* Tab indented" (tab = 4 spaces)
784        assert_eq!(cache.list_items[2].indentation, 8); // "\t\t* Double tab" (2 tabs = 8 spaces)
785        assert_eq!(cache.list_items[3].indentation, 4); // "    * 4 spaces"
786        assert_eq!(cache.list_items[4].indentation, 8); // "        * 8 spaces"
787
788        // Check nesting levels
789        assert_eq!(cache.list_items[0].nesting_level, 0);
790        assert_eq!(cache.list_items[1].nesting_level, 1);
791        assert_eq!(cache.list_items[2].nesting_level, 2);
792        assert_eq!(cache.list_items[3].nesting_level, 1);
793        assert_eq!(cache.list_items[4].nesting_level, 2);
794    }
795
796    #[test]
797    fn test_mixed_tabs_and_spaces_indentation() {
798        // Test mixed tabs and spaces
799        let content = "* Level 0\n\t  * Tab + 2 spaces (should be level 1)\n  \t* 2 spaces + tab (should be level 1)\n\t\t  * 2 tabs + 2 spaces (should be level 2)\n";
800
801        // Clear any cached data to ensure fresh parsing
802        reset_element_cache();
803        let cache = ElementCache::new(content);
804
805        assert_eq!(cache.list_items.len(), 4);
806
807        // Check indentation values
808        assert_eq!(cache.list_items[0].indentation, 0); // "* Level 0"
809        assert_eq!(cache.list_items[1].indentation, 6); // "\t  * Tab + 2 spaces" (tab to 4 + 2 spaces = 6)
810        assert_eq!(cache.list_items[2].indentation, 4); // "  \t* 2 spaces + tab" (2 spaces, then tab to next stop = 4)
811        assert_eq!(cache.list_items[3].indentation, 10); // "\t\t  * 2 tabs + 2 spaces" (2 tabs = 8 + 2 spaces = 10)
812
813        // Check nesting levels
814        assert_eq!(cache.list_items[0].nesting_level, 0);
815        assert_eq!(cache.list_items[1].nesting_level, 1);
816        assert_eq!(cache.list_items[2].nesting_level, 1);
817        assert_eq!(cache.list_items[3].nesting_level, 2);
818    }
819
820    #[test]
821    fn test_tab_width_configuration() {
822        // Test with different tab widths (default should be 4)
823        let content = "\t* Single tab\n\t\t* Double tab\n";
824        let cache = ElementCache::new(content);
825
826        assert_eq!(cache.list_items.len(), 2);
827
828        // With default tab width of 4
829        assert_eq!(cache.list_items[0].indentation, 4); // "\t*" = 4 spaces
830        assert_eq!(cache.list_items[1].indentation, 8); // "\t\t*" = 8 spaces
831
832        // Check nesting levels
833        assert_eq!(cache.list_items[0].nesting_level, 0);
834        assert_eq!(cache.list_items[1].nesting_level, 1);
835    }
836
837    #[test]
838    fn test_tab_expansion_debug() {
839        // Debug the tab expansion logic
840        assert_eq!(ElementCache::calculate_indentation_width_default(""), 0);
841        assert_eq!(ElementCache::calculate_indentation_width_default(" "), 1);
842        assert_eq!(ElementCache::calculate_indentation_width_default("  "), 2);
843        assert_eq!(ElementCache::calculate_indentation_width_default("    "), 4);
844        assert_eq!(ElementCache::calculate_indentation_width_default("\t"), 4);
845        assert_eq!(ElementCache::calculate_indentation_width_default("\t\t"), 8);
846        assert_eq!(ElementCache::calculate_indentation_width_default("\t  "), 6); // tab to 4, then 2 spaces = 6
847        assert_eq!(ElementCache::calculate_indentation_width_default("  \t"), 4); // 2 spaces, then tab to next stop (4)
848        assert_eq!(ElementCache::calculate_indentation_width_default("\t\t  "), 10);
849        // 2 tabs = 8, then 2 spaces = 10
850    }
851
852    #[test]
853    fn test_mixed_tabs_debug() {
854        // Debug the specific failing case
855        let content = "* Level 0\n\t  * Tab + 2 spaces (should be level 1)\n  \t* 2 spaces + tab (should be level 1)\n\t\t  * 2 tabs + 2 spaces (should be level 2)\n";
856        let cache = ElementCache::new(content);
857
858        println!("Number of list items: {}", cache.list_items.len());
859        for (i, item) in cache.list_items.iter().enumerate() {
860            println!(
861                "Item {}: indent_str={:?}, indentation={}, content={:?}",
862                i, item.indent_str, item.indentation, item.content
863            );
864        }
865
866        // Test the specific indentation strings
867        assert_eq!(ElementCache::calculate_indentation_width_default("\t  "), 6); // tab + 2 spaces
868        assert_eq!(ElementCache::calculate_indentation_width_default("  \t"), 4); // 2 spaces + tab
869        assert_eq!(ElementCache::calculate_indentation_width_default("\t\t  "), 10);
870        // 2 tabs + 2 spaces
871    }
872}
rumdl_lib/utils/element_cache.rs

rumdl_lib/utils/
element_cache.rs