rumdl_lib/utils/
element_cache.rs

1use fancy_regex::Regex as FancyRegex;
2use regex::Regex;
3use std::hash::{Hash, Hasher};
4use std::sync::LazyLock;
5use std::sync::{Arc, Mutex};
6
// Efficient regex patterns

/// Matches a line that opens a fenced code block.
/// Group 1: leading whitespace, group 2: the fence (three backticks or three tildes),
/// group 3: the remainder of the line (used as the language / info string).
static CODE_BLOCK_START_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(```|~~~)(.*)$").unwrap());
/// Matches a line that begins with at least four whitespace characters followed by
/// at least one more character (candidate line of an indented code block).
static INDENTED_CODE_BLOCK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s{4,})(.+)$").unwrap());

// List detection patterns

/// Matches an unordered list item using the named groups `indent` (spaces/tabs),
/// `marker` (`*`, `+` or `-`), `after` (spaces/tabs following the marker) and `content`.
/// Note that `after` may be empty, so a marker immediately followed by text also matches.
static UNORDERED_LIST_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| {
    FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>[*+-])(?P<after>[ \t]*)(?P<content>.*)$").unwrap()
});
/// Matches an ordered list item using the same named groups as the unordered pattern;
/// here `marker` is one or more digits followed by a period (e.g. `1.`).
static ORDERED_LIST_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| {
    FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>\d+\.)(?P<after>[ \t]*)(?P<content>.*)$").unwrap()
});

// Inline code span pattern

/// Matches a run of one or more consecutive backticks (a code span delimiter).
static CODE_SPAN_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"`+").unwrap());
21
/// Represents a range in the document with start and end positions.
///
/// Ranges produced by this module are byte offsets into the document content.
/// `ElementCache::is_in_code_span` treats a range as half-open: `start` is
/// included, `end` is excluded.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Range {
    /// Offset of the first byte covered by the range.
    pub start: usize,
    /// Offset one past the last byte covered by the range.
    pub end: usize,
}
28
/// Represents the type of code block
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CodeBlockType {
    /// Block opened by a line starting with three backticks or three tildes.
    Fenced,
    /// Line indented by four or more whitespace characters that is not a list item.
    Indented,
}
35
/// Represents a code block in the document.
///
/// Fenced blocks are recorded once per block; indented code is recorded as one
/// `CodeBlock` per indented line (so `start_line == end_line` for those entries).
#[derive(Debug, Clone)]
pub struct CodeBlock {
    /// Position of the block within the document content.
    pub range: Range,
    /// Whether the block is fenced or indented.
    pub block_type: CodeBlockType,
    /// First line of the block (1-indexed); for fenced blocks this is the opening fence line.
    pub start_line: usize,
    /// Last line of the block (1-indexed); for an unclosed fenced block this is the last line of the document.
    pub end_line: usize,
    /// Trimmed text following the opening fence, or `None` when it is empty or the block is indented.
    pub language: Option<String>,
}
45
/// Represents the type of list marker
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ListMarkerType {
    /// Unordered item introduced by `*`.
    Asterisk,
    /// Unordered item introduced by `+`.
    Plus,
    /// Unordered item introduced by `-`.
    Minus,
    /// Ordered item introduced by digits followed by a period (e.g. `1.`).
    Ordered,
}
54
/// Represents a list item in the document
#[derive(Debug, Clone)]
pub struct ListItem {
    /// Line on which the item starts.
    pub line_number: usize, // 1-indexed
    /// Visual width of `indent_str`, with tabs expanded to 4-column tab stops.
    pub indentation: usize,
    /// Whitespace between the (stripped) blockquote prefix and the marker.
    pub indent_str: String, // Actual leading whitespace
    /// Kind of marker that introduced the item.
    pub marker_type: ListMarkerType,
    /// Marker text exactly as written (e.g. `-`, `*`, `1.`).
    pub marker: String,
    /// Text following the marker, with leading whitespace trimmed.
    pub content: String,
    /// Length in bytes of the run of spaces/tabs directly after the marker.
    pub spaces_after_marker: usize,
    /// Nesting depth computed from indentation; 0 for top-level items.
    pub nesting_level: usize,
    /// Parent lookup result recorded by the parser; meant to identify the line of the
    /// nearest less-nested preceding item (`None` if there is none).
    pub parent_line_number: Option<usize>,
    pub blockquote_depth: usize,   // Number of leading blockquote markers
    pub blockquote_prefix: String, // The actual prefix (e.g., "> > ")
}
70
/// Cache for Markdown document structural elements
/// This allows sharing computed data across multiple rule checks
///
/// The cache is filled once by `ElementCache::new` and only stores a hash of the
/// content it was built from; `is_valid_for` compares that hash to decide whether
/// the cache may be reused for a given document.
#[derive(Debug, Default, Clone)]
pub struct ElementCache {
    // Document content hash for cache validation (avoids storing full content)
    content_hash: u64,

    // Code blocks
    // Fenced blocks are stored once per block, indented code once per line.
    code_blocks: Vec<CodeBlock>,
    code_block_line_map: Vec<bool>, // Line index -> is in code block

    // Code spans (inline code)
    // Each range covers the opening backticks through the closing backticks.
    code_spans: Vec<Range>,

    // Lists
    // Items are stored in document order.
    list_items: Vec<ListItem>,
    list_line_map: Vec<bool>, // Line index -> is list item
}
89
90impl ElementCache {
91    /// Compute a hash of the content for cache validation
92    fn compute_content_hash(content: &str) -> u64 {
93        let mut hasher = std::collections::hash_map::DefaultHasher::new();
94        content.hash(&mut hasher);
95        hasher.finish()
96    }
97
98    /// Create a new cache from document content
99    pub fn new(content: &str) -> Self {
100        let content_hash = Self::compute_content_hash(content);
101        let line_count = content.lines().count();
102
103        let mut cache = ElementCache {
104            content_hash,
105            code_blocks: Vec::new(),
106            code_block_line_map: vec![false; line_count],
107            code_spans: Vec::new(),
108            list_items: Vec::new(),
109            list_line_map: vec![false; line_count],
110        };
111
112        // Populate the cache - pass content directly to avoid storing it
113        cache.populate_code_blocks(content);
114        cache.populate_code_spans(content);
115        cache.populate_list_items(content);
116
117        cache
118    }
119
120    /// Check if this cache is valid for the given content
121    pub fn is_valid_for(&self, content: &str) -> bool {
122        Self::compute_content_hash(content) == self.content_hash
123    }
124
125    /// Calculate the visual indentation width of a string, expanding tabs to spaces.
126    ///
127    /// Per CommonMark, tabs expand to the next tab stop (columns 4, 8, 12, ...).
128    /// This means:
129    /// - " \t" (1 space + tab) → 4 columns
130    /// - "  \t" (2 spaces + tab) → 4 columns
131    /// - "   \t" (3 spaces + tab) → 4 columns
132    /// - "\t" (just tab) → 4 columns
133    /// - "    " (4 spaces) → 4 columns
134    pub fn calculate_indentation_width(indent_str: &str, tab_width: usize) -> usize {
135        let mut width = 0;
136        for ch in indent_str.chars() {
137            if ch == '\t' {
138                // Round up to next tab stop
139                width = ((width / tab_width) + 1) * tab_width;
140            } else if ch == ' ' {
141                width += 1;
142            } else {
143                // Non-whitespace character, stop counting
144                break;
145            }
146        }
147        width
148    }
149
150    /// Calculate the visual indentation width using default tab width of 4
151    pub fn calculate_indentation_width_default(indent_str: &str) -> usize {
152        Self::calculate_indentation_width(indent_str, 4)
153    }
154
155    /// Check if a line is within a code block
156    pub fn is_in_code_block(&self, line_num: usize) -> bool {
157        if line_num == 0 || line_num > self.code_block_line_map.len() {
158            return false;
159        }
160        self.code_block_line_map[line_num - 1] // Convert 1-indexed to 0-indexed
161    }
162
163    /// Check if a position is within a code span
164    pub fn is_in_code_span(&self, position: usize) -> bool {
165        self.code_spans
166            .iter()
167            .any(|span| position >= span.start && position < span.end)
168    }
169
170    /// Check if a line is a list item
171    pub fn is_list_item(&self, line_num: usize) -> bool {
172        if line_num == 0 || line_num > self.list_line_map.len() {
173            return false;
174        }
175        self.list_line_map[line_num - 1] // Convert 1-indexed to 0-indexed
176    }
177
178    /// Get list item at line
179    pub fn get_list_item(&self, line_num: usize) -> Option<&ListItem> {
180        self.list_items.iter().find(|item| item.line_number == line_num)
181    }
182
183    /// Get all list items
184    pub fn get_list_items(&self) -> &[ListItem] {
185        &self.list_items
186    }
187
188    /// Get all code blocks
189    pub fn get_code_blocks(&self) -> &[CodeBlock] {
190        &self.code_blocks
191    }
192
193    /// Get all code spans
194    pub fn get_code_spans(&self) -> &[Range] {
195        &self.code_spans
196    }
197
198    /// Detect and populate code blocks
199    fn populate_code_blocks(&mut self, content: &str) {
200        let lines: Vec<&str> = content.lines().collect();
201        let mut in_fenced_block = false;
202        let mut fence_marker = String::new();
203        let mut block_start_line = 0;
204        let mut block_language = String::new();
205
206        for (i, line) in lines.iter().enumerate() {
207            if in_fenced_block {
208                // Already in a fenced code block, look for the end
209                self.code_block_line_map[i] = true;
210
211                if line.trim().starts_with(&fence_marker) {
212                    // End of code block
213                    let start_pos =
214                        lines[0..block_start_line].join("\n").len() + if block_start_line > 0 { 1 } else { 0 };
215                    let end_pos = lines[0..=i].join("\n").len();
216
217                    self.code_blocks.push(CodeBlock {
218                        range: Range {
219                            start: start_pos,
220                            end: end_pos,
221                        },
222                        block_type: CodeBlockType::Fenced,
223                        start_line: block_start_line + 1, // 1-indexed
224                        end_line: i + 1,                  // 1-indexed
225                        language: if !block_language.is_empty() {
226                            Some(block_language.clone())
227                        } else {
228                            None
229                        },
230                    });
231
232                    in_fenced_block = false;
233                    fence_marker.clear();
234                    block_language.clear();
235                }
236            } else if let Some(caps) = CODE_BLOCK_START_REGEX.captures(line) {
237                // Start of a new code block
238                fence_marker = caps.get(2).map_or("```", |m| m.as_str()).to_string();
239                in_fenced_block = true;
240                block_start_line = i;
241                block_language = caps.get(3).map_or("", |m| m.as_str().trim()).to_string();
242                self.code_block_line_map[i] = true;
243            } else if INDENTED_CODE_BLOCK_REGEX.is_match(line) {
244                // Only mark as indented code block if not a list item
245                let is_unordered_list = UNORDERED_LIST_REGEX.is_match(line).unwrap_or(false);
246                let is_ordered_list = ORDERED_LIST_REGEX.is_match(line).unwrap_or(false);
247                if !is_unordered_list && !is_ordered_list {
248                    // Indented code block
249                    self.code_block_line_map[i] = true;
250                    // For indented code blocks, we handle them as individual lines
251                    // We don't track them as blocks with start/end because they can be
252                    // interrupted by blank lines, etc.
253                    let start_pos = lines[0..i].join("\n").len() + if i > 0 { 1 } else { 0 };
254                    let end_pos = start_pos + line.len();
255                    self.code_blocks.push(CodeBlock {
256                        range: Range {
257                            start: start_pos,
258                            end: end_pos,
259                        },
260                        block_type: CodeBlockType::Indented,
261                        start_line: i + 1, // 1-indexed
262                        end_line: i + 1,   // 1-indexed
263                        language: None,
264                    });
265                }
266            }
267        }
268
269        // Handle unclosed code block
270        if in_fenced_block {
271            let start_pos = lines[0..block_start_line].join("\n").len() + if block_start_line > 0 { 1 } else { 0 };
272            let end_pos = content.len();
273
274            self.code_blocks.push(CodeBlock {
275                range: Range {
276                    start: start_pos,
277                    end: end_pos,
278                },
279                block_type: CodeBlockType::Fenced,
280                start_line: block_start_line + 1, // 1-indexed
281                end_line: lines.len(),            // 1-indexed
282                language: if !block_language.is_empty() {
283                    Some(block_language)
284                } else {
285                    None
286                },
287            });
288        }
289    }
290
291    /// Detect and populate code spans
292    fn populate_code_spans(&mut self, content: &str) {
293        // Find inline code spans using regex for backticks
294        let mut i = 0;
295        while i < content.len() {
296            if let Some(m) = CODE_SPAN_REGEX.find_at(content, i) {
297                let backtick_length = m.end() - m.start();
298                let start = m.start();
299
300                // Find matching closing backticks
301                if let Some(end_pos) = content[m.end()..].find(&"`".repeat(backtick_length)) {
302                    let end = m.end() + end_pos + backtick_length;
303                    self.code_spans.push(Range { start, end });
304                    i = end;
305                } else {
306                    i = m.end();
307                }
308            } else {
309                break;
310            }
311        }
312    }
313
314    /// Detect and populate list items
315    fn populate_list_items(&mut self, content: &str) {
316        let lines: Vec<&str> = content.lines().collect();
317        let mut prev_items: Vec<(usize, usize, usize)> = Vec::new(); // (blockquote_depth, nesting_level, line_number)
318        for (i, line) in lines.iter().enumerate() {
319            // Skip blank lines but don't reset nesting context
320            if line.trim().is_empty() {
321                continue;
322            }
323            // Parse and strip blockquote prefix
324            let (blockquote_depth, blockquote_prefix, rest) = Self::parse_blockquote_prefix(line);
325            // Always call parse_list_item and always push if Some
326            if let Some(item) = self.parse_list_item(
327                rest,
328                i + 1,
329                &mut prev_items,
330                blockquote_depth,
331                blockquote_prefix.clone(),
332            ) {
333                self.list_items.push(item);
334                self.list_line_map[i] = true;
335            }
336        }
337    }
338
339    /// Parse and strip all leading blockquote markers, returning (depth, prefix, rest_of_line)
340    fn parse_blockquote_prefix(line: &str) -> (usize, String, &str) {
341        let mut rest = line;
342        let mut prefix = String::new();
343        let mut depth = 0;
344        loop {
345            let trimmed = rest.trim_start();
346            if let Some(after) = trimmed.strip_prefix('>') {
347                // Find the '>' and a single optional space
348                let mut chars = after.chars();
349                let mut space_count = 0;
350                if let Some(' ') = chars.next() {
351                    space_count = 1;
352                }
353                let (spaces, after_marker) = after.split_at(space_count);
354                prefix.push('>');
355                prefix.push_str(spaces);
356                rest = after_marker;
357                depth += 1;
358            } else {
359                break;
360            }
361        }
362        (depth, prefix, rest)
363    }
364
    /// Calculate the nesting level for a list item, considering blockquote depth
    ///
    /// `indent` is the item's visual indentation and `blockquote_depth` the number of
    /// blockquote markers in front of it. `prev_items` is a stack of earlier items whose
    /// entries are `(blockquote_depth, indent, nesting_level)`; only entries with the same
    /// blockquote depth are compared against.
    ///
    /// The level is derived from the most recent entry at the same blockquote depth:
    /// deeper indentation nests one level further, equal indentation keeps the level, and
    /// shallower indentation falls back through the lookups described inline. With no
    /// such entry the level is 0.
    ///
    /// Side effect: `prev_items` is updated — trailing entries at this blockquote depth
    /// with an indent greater than or equal to `indent` are popped, then an entry for the
    /// current item is pushed.
    ///
    /// `self` is not read by this method.
    fn calculate_nesting_level(
        &self,
        indent: usize,
        blockquote_depth: usize,
        prev_items: &mut Vec<(usize, usize, usize)>,
    ) -> usize {
        let mut nesting_level = 0;

        // Only consider previous items with the same blockquote depth
        if let Some(&(_last_bq, last_indent, last_level)) =
            prev_items.iter().rev().find(|(bq, _, _)| *bq == blockquote_depth)
        {
            use std::cmp::Ordering;
            match indent.cmp(&last_indent) {
                Ordering::Greater => {
                    // More indented - increase nesting level
                    nesting_level = last_level + 1;
                }
                Ordering::Equal => {
                    // Same indentation - same level
                    nesting_level = last_level;
                }
                Ordering::Less => {
                    // Less indented - find the appropriate level
                    let mut found_level = None;

                    // First look for exact match
                    for &(prev_bq, prev_indent, prev_level) in prev_items.iter().rev() {
                        if prev_bq == blockquote_depth && prev_indent == indent {
                            found_level = Some(prev_level);
                            break;
                        }
                    }

                    // If no exact match, check if this is a case where we should treat similar indentations as same level
                    // This handles mixed tab/space scenarios where 4 and 6 spaces should be at the same level
                    if found_level.is_none() && indent > 0 && last_indent > 0 {
                        // Only apply similar indentation logic if the difference is small and we're dealing with small indentations
                        let diff = (indent as i32 - last_indent as i32).abs();
                        if diff <= 2 && indent <= 8 && last_indent <= 8 {
                            // Check if there's a recent item at a lower indentation level
                            // (only the three most recent stack entries are inspected)
                            let has_lower_indent = prev_items.iter().rev().take(3).any(|(bq, prev_indent, _)| {
                                *bq == blockquote_depth && *prev_indent < indent.min(last_indent)
                            });
                            if has_lower_indent {
                                found_level = Some(last_level);
                            }
                        }
                    }

                    // If still no match, look for the most recent less indented item
                    // and reuse that item's level
                    if found_level.is_none() {
                        for &(prev_bq, prev_indent, prev_level) in prev_items.iter().rev() {
                            if prev_bq == blockquote_depth && prev_indent < indent {
                                found_level = Some(prev_level);
                                break;
                            }
                        }
                    }

                    // Nothing comparable on the stack: treat the item as top-level
                    nesting_level = found_level.unwrap_or(0);
                }
            }
        }

        // Remove stack entries with indent >= current indent and same blockquote depth
        // (popping stops at the first entry from another blockquote depth or with a smaller indent)
        while let Some(&(prev_bq, prev_indent, _)) = prev_items.last() {
            if prev_bq != blockquote_depth || prev_indent < indent {
                break;
            }
            prev_items.pop();
        }
        // Record the current item so later lines can be compared against it
        prev_items.push((blockquote_depth, indent, nesting_level));
        nesting_level
    }
441
442    /// Parse a line as a list item and determine its nesting level
443    fn parse_list_item(
444        &self,
445        line: &str,
446        line_num: usize,
447        prev_items: &mut Vec<(usize, usize, usize)>,
448        blockquote_depth: usize,
449        blockquote_prefix: String,
450    ) -> Option<ListItem> {
451        match UNORDERED_LIST_REGEX.captures(line) {
452            Ok(Some(captures)) => {
453                let indent_str = captures.name("indent").map_or("", |m| m.as_str()).to_string();
454                let indentation = Self::calculate_indentation_width_default(&indent_str);
455                let marker = captures.name("marker").unwrap().as_str();
456                let after = captures.name("after").map_or("", |m| m.as_str());
457                let spaces = after.len();
458                let raw_content = captures.name("content").map_or("", |m| m.as_str());
459                let content = raw_content.trim_start().to_string();
460                let marker_type = match marker {
461                    "*" => ListMarkerType::Asterisk,
462                    "+" => ListMarkerType::Plus,
463                    "-" => ListMarkerType::Minus,
464                    other => {
465                        // This should never happen due to regex validation,
466                        // but default to dash if it does
467                        eprintln!("Warning: Unexpected list marker '{other}', defaulting to dash");
468                        ListMarkerType::Minus
469                    }
470                };
471                let nesting_level = self.calculate_nesting_level(indentation, blockquote_depth, prev_items);
472                // Find parent: most recent previous item with lower nesting_level and same blockquote depth
473                let parent_line_number = prev_items
474                    .iter()
475                    .rev()
476                    .find(|(bq, _, level)| *bq == blockquote_depth && *level < nesting_level)
477                    .map(|(_, _, line_num)| *line_num);
478                return Some(ListItem {
479                    line_number: line_num,
480                    indentation,
481                    indent_str,
482                    marker_type,
483                    marker: marker.to_string(),
484                    content,
485                    spaces_after_marker: spaces,
486                    nesting_level,
487                    parent_line_number,
488                    blockquote_depth,
489                    blockquote_prefix,
490                });
491            }
492            Ok(None) => {
493                // No debug output
494            }
495            Err(_) => {}
496        }
497        match ORDERED_LIST_REGEX.captures(line) {
498            Ok(Some(captures)) => {
499                let indent_str = captures.name("indent").map_or("", |m| m.as_str()).to_string();
500                let indentation = Self::calculate_indentation_width_default(&indent_str);
501                let marker = captures.name("marker").unwrap().as_str();
502                let spaces = captures.name("after").map_or(0, |m| m.as_str().len());
503                let content = captures
504                    .name("content")
505                    .map_or("", |m| m.as_str())
506                    .trim_start()
507                    .to_string();
508                let nesting_level = self.calculate_nesting_level(indentation, blockquote_depth, prev_items);
509                // Find parent: most recent previous item with lower nesting_level and same blockquote depth
510                let parent_line_number = prev_items
511                    .iter()
512                    .rev()
513                    .find(|(bq, _, level)| *bq == blockquote_depth && *level < nesting_level)
514                    .map(|(_, _, line_num)| *line_num);
515                return Some(ListItem {
516                    line_number: line_num,
517                    indentation,
518                    indent_str,
519                    marker_type: ListMarkerType::Ordered,
520                    marker: marker.to_string(),
521                    content,
522                    spaces_after_marker: spaces,
523                    nesting_level,
524                    parent_line_number,
525                    blockquote_depth,
526                    blockquote_prefix,
527                });
528            }
529            Ok(None) => {}
530            Err(_) => {}
531        }
532        None
533    }
534}
535
// Global cache for sharing across threads.
// Holds at most one `ElementCache` (the most recently stored one); `None` until a
// cache has been stored or after `reset_element_cache` clears it.
static ELEMENT_CACHE: LazyLock<Arc<Mutex<Option<ElementCache>>>> = LazyLock::new(|| Arc::new(Mutex::new(None)));
538
539/// Get or create element cache for document content
540///
541/// If the mutex is poisoned, creates a fresh cache without storing it globally.
542/// This ensures the library never panics due to mutex poisoning.
543pub fn get_element_cache(content: &str) -> ElementCache {
544    // Try to get existing cache
545    if let Ok(cache_guard) = ELEMENT_CACHE.lock() {
546        // If cache exists and content matches (by hash), return it
547        if let Some(existing_cache) = &*cache_guard
548            && existing_cache.is_valid_for(content)
549        {
550            return existing_cache.clone();
551        }
552    }
553
554    // Content doesn't match or mutex poisoned, create new cache
555    let new_cache = ElementCache::new(content);
556
557    // Store in global cache (ignore if mutex is poisoned)
558    if let Ok(mut cache_guard) = ELEMENT_CACHE.lock() {
559        *cache_guard = Some(new_cache.clone());
560    }
561
562    new_cache
563}
564
565/// Reset the element cache
566///
567/// If the mutex is poisoned, this is a no-op.
568pub fn reset_element_cache() {
569    if let Ok(mut cache_guard) = ELEMENT_CACHE.lock() {
570        *cache_guard = None;
571    }
572}
573
574#[cfg(test)]
575mod tests {
576    use super::*;
577
578    #[test]
579    fn test_code_block_detection() {
580        let content = "Regular text\n\n```rust\nfn main() {\n    println!(\"Hello\");\n}\n```\n\nMore text";
581        let cache = ElementCache::new(content);
582
583        assert_eq!(cache.code_blocks.len(), 1);
584        assert_eq!(cache.code_blocks[0].start_line, 3);
585        assert_eq!(cache.code_blocks[0].end_line, 7);
586        assert_eq!(cache.code_blocks[0].block_type, CodeBlockType::Fenced);
587        assert_eq!(cache.code_blocks[0].language, Some("rust".to_string()));
588
589        assert!(!cache.is_in_code_block(1));
590        assert!(!cache.is_in_code_block(2));
591        assert!(cache.is_in_code_block(3));
592        assert!(cache.is_in_code_block(4));
593        assert!(cache.is_in_code_block(5));
594        assert!(cache.is_in_code_block(6));
595        assert!(cache.is_in_code_block(7));
596        assert!(!cache.is_in_code_block(8));
597        assert!(!cache.is_in_code_block(9));
598    }
599
600    #[test]
601    fn test_list_item_detection_simple() {
602        let content =
603            "# Heading\n\n- First item\n  - Nested item\n- Second item\n\n1. Ordered item\n   1. Nested ordered\n";
604        let cache = ElementCache::new(content);
605        assert_eq!(cache.list_items.len(), 5);
606        // Check the first item
607        assert_eq!(cache.list_items[0].line_number, 3);
608        assert_eq!(cache.list_items[0].marker, "-");
609        assert_eq!(cache.list_items[0].nesting_level, 0);
610        // Check the nested item
611        assert_eq!(cache.list_items[1].line_number, 4);
612        assert_eq!(cache.list_items[1].marker, "-");
613        assert_eq!(cache.list_items[1].nesting_level, 1);
614        // Check the second list item
615        assert_eq!(cache.list_items[2].line_number, 5);
616        assert_eq!(cache.list_items[2].marker, "-");
617        assert_eq!(cache.list_items[2].nesting_level, 0);
618        // Check ordered list item
619        assert_eq!(cache.list_items[3].line_number, 7);
620        assert_eq!(cache.list_items[3].marker, "1.");
621        assert_eq!(cache.list_items[3].nesting_level, 0);
622        // Check nested ordered list item
623        assert_eq!(cache.list_items[4].line_number, 8);
624        assert_eq!(cache.list_items[4].marker, "1.");
625        assert_eq!(cache.list_items[4].nesting_level, 1);
626    }
627
628    #[test]
629    fn test_list_item_detection_complex() {
630        let complex = "  * Level 1 item 1\n    - Level 2 item 1\n      + Level 3 item 1\n    - Level 2 item 2\n  * Level 1 item 2\n\n* Top\n  + Nested\n    - Deep\n      * Deeper\n        + Deepest\n";
631        let cache = ElementCache::new(complex);
632
633        // Should detect all 10 list items
634        assert_eq!(cache.list_items.len(), 10);
635        // Check markers and nesting levels
636        assert_eq!(cache.list_items[0].marker, "*");
637        assert_eq!(cache.list_items[0].nesting_level, 0);
638        assert_eq!(cache.list_items[1].marker, "-");
639        assert_eq!(cache.list_items[1].nesting_level, 1);
640        assert_eq!(cache.list_items[2].marker, "+");
641        assert_eq!(cache.list_items[2].nesting_level, 2);
642        assert_eq!(cache.list_items[3].marker, "-");
643        assert_eq!(cache.list_items[3].nesting_level, 1);
644        assert_eq!(cache.list_items[4].marker, "*");
645        assert_eq!(cache.list_items[4].nesting_level, 0);
646        assert_eq!(cache.list_items[5].marker, "*");
647        assert_eq!(cache.list_items[5].nesting_level, 0);
648        assert_eq!(cache.list_items[6].marker, "+");
649        assert_eq!(cache.list_items[6].nesting_level, 1);
650        assert_eq!(cache.list_items[7].marker, "-");
651        assert_eq!(cache.list_items[7].nesting_level, 2);
652        assert_eq!(cache.list_items[8].marker, "*");
653        assert_eq!(cache.list_items[8].nesting_level, 3);
654        assert_eq!(cache.list_items[9].marker, "+");
655        assert_eq!(cache.list_items[9].nesting_level, 4);
656        let expected_nesting = vec![0, 1, 2, 1, 0, 0, 1, 2, 3, 4];
657        let actual_nesting: Vec<_> = cache.list_items.iter().map(|item| item.nesting_level).collect();
658        assert_eq!(
659            actual_nesting, expected_nesting,
660            "Nesting levels should match expected values"
661        );
662    }
663
664    #[test]
665    fn test_list_item_detection_edge() {
666        let edge = "* Item 1\n\n    - Nested 1\n  + Nested 2\n\n* Item 2\n";
667        let cache = ElementCache::new(edge);
668        assert_eq!(cache.list_items.len(), 4);
669
670        // Check correct nesting levels according to CommonMark:
671        // * Item 1 (indent=0) -> level 0
672        // - Nested 1 (indent=4) -> level 1 (nested under Item 1)
673        // + Nested 2 (indent=2) -> level 1 (nested under Item 1)
674        // * Item 2 (indent=0) -> level 0
675        let expected_nesting = vec![0, 1, 1, 0];
676        let actual_nesting: Vec<_> = cache.list_items.iter().map(|item| item.nesting_level).collect();
677        assert_eq!(
678            actual_nesting, expected_nesting,
679            "Nesting levels should be calculated based on indentation, not reset by blank lines"
680        );
681    }
682
683    #[test]
684    fn test_code_span_detection() {
685        let content = "Here is some `inline code` and here are ``nested `code` spans``";
686        let cache = ElementCache::new(content);
687
688        // Should have two code spans
689        assert_eq!(cache.code_spans.len(), 2);
690
691        // Check spans
692        let span1_content = &content[cache.code_spans[0].start..cache.code_spans[0].end];
693        assert_eq!(span1_content, "`inline code`");
694
695        let span2_content = &content[cache.code_spans[1].start..cache.code_spans[1].end];
696        assert_eq!(span2_content, "``nested `code` spans``");
697    }
698
699    #[test]
700    fn test_get_element_cache() {
701        let content1 = "Test content";
702        let content2 = "Different content";
703
704        // First call should create a new cache
705        let cache1 = get_element_cache(content1);
706
707        // Second call with same content should return the same cache
708        let cache2 = get_element_cache(content1);
709
710        // Third call with different content should create new cache
711        let cache3 = get_element_cache(content2);
712
713        // Verify caches are valid for their respective content
714        assert!(cache1.is_valid_for(content1));
715        assert!(cache2.is_valid_for(content1));
716        assert!(cache3.is_valid_for(content2));
717
718        // Verify caches are NOT valid for different content
719        assert!(!cache1.is_valid_for(content2));
720        assert!(!cache3.is_valid_for(content1));
721    }
722
723    #[test]
724    fn test_list_item_detection_deep_nesting_and_edge_cases() {
725        // Deeply nested unordered lists, mixed markers, excessive indentation, tabs, and blank lines
726        let content = "\
727* Level 1
728  - Level 2
729    + Level 3
730      * Level 4
731        - Level 5
732          + Level 6
733* Sibling 1
734    * Sibling 2
735\n    - After blank line, not nested\n\n\t* Tab indented\n        * 8 spaces indented\n* After excessive indent\n";
736        let cache = ElementCache::new(content);
737        // Should detect all lines that start with a valid unordered list marker
738        let _expected_markers = ["*", "-", "+", "*", "-", "+", "*", "*", "-", "*", "*", "*"];
739        let _expected_indents = [0, 4, 8, 0, 4, 8, 0, 4, 8, 12, 16, 20];
740        let expected_content = vec![
741            "Level 1",
742            "Level 2",
743            "Level 3",
744            "Level 4",
745            "Level 5",
746            "Level 6",
747            "Sibling 1",
748            "Sibling 2",
749            "After blank line, not nested",
750            "Tab indented",      // Content after marker
751            "8 spaces indented", // Content after marker
752            "After excessive indent",
753        ];
754        let actual_content: Vec<_> = cache.list_items.iter().map(|item| item.content.clone()).collect();
755        assert_eq!(
756            actual_content, expected_content,
757            "List item contents should match expected values"
758        );
759        // Updated expected nesting levels based on correct CommonMark behavior:
760        // Blank lines should NOT reset nesting context
761        let expected_nesting = vec![0, 1, 2, 3, 4, 5, 0, 1, 1, 1, 2, 0];
762        let actual_nesting: Vec<_> = cache.list_items.iter().map(|item| item.nesting_level).collect();
763        assert_eq!(
764            actual_nesting, expected_nesting,
765            "Nesting levels should match expected values"
766        );
767        // Check that tab-indented and 8-space-indented items are detected
768        assert!(
769            cache
770                .list_items
771                .iter()
772                .any(|item| item.marker == "*" && item.indentation >= 1),
773            "Tab or 8-space indented item not detected"
774        );
775        // Check that after blank lines, items maintain correct nesting based on indentation
776        let after_blank = cache
777            .list_items
778            .iter()
779            .find(|item| item.content.contains("After blank line"));
780        assert!(after_blank.is_some());
781        assert_eq!(
782            after_blank.unwrap().nesting_level,
783            1,
784            "Item after blank line should maintain nesting based on indentation"
785        );
786    }
787
788    #[test]
789    fn test_tab_indentation_calculation() {
790        // Test that tabs are properly converted to spaces for indentation calculation
791        let content = "* Level 0\n\t* Tab indented (should be level 1)\n\t\t* Double tab (should be level 2)\n    * 4 spaces (should be level 1)\n        * 8 spaces (should be level 2)\n";
792        let cache = ElementCache::new(content);
793
794        assert_eq!(cache.list_items.len(), 5);
795
796        // Check indentation values (tabs should be converted to spaces)
797        assert_eq!(cache.list_items[0].indentation, 0); // "* Level 0"
798        assert_eq!(cache.list_items[1].indentation, 4); // "\t* Tab indented" (tab = 4 spaces)
799        assert_eq!(cache.list_items[2].indentation, 8); // "\t\t* Double tab" (2 tabs = 8 spaces)
800        assert_eq!(cache.list_items[3].indentation, 4); // "    * 4 spaces"
801        assert_eq!(cache.list_items[4].indentation, 8); // "        * 8 spaces"
802
803        // Check nesting levels
804        assert_eq!(cache.list_items[0].nesting_level, 0);
805        assert_eq!(cache.list_items[1].nesting_level, 1);
806        assert_eq!(cache.list_items[2].nesting_level, 2);
807        assert_eq!(cache.list_items[3].nesting_level, 1);
808        assert_eq!(cache.list_items[4].nesting_level, 2);
809    }
810
811    #[test]
812    fn test_mixed_tabs_and_spaces_indentation() {
813        // Test mixed tabs and spaces
814        let content = "* Level 0\n\t  * Tab + 2 spaces (should be level 1)\n  \t* 2 spaces + tab (should be level 1)\n\t\t  * 2 tabs + 2 spaces (should be level 2)\n";
815
816        // Clear any cached data to ensure fresh parsing
817        reset_element_cache();
818        let cache = ElementCache::new(content);
819
820        assert_eq!(cache.list_items.len(), 4);
821
822        // Check indentation values
823        assert_eq!(cache.list_items[0].indentation, 0); // "* Level 0"
824        assert_eq!(cache.list_items[1].indentation, 6); // "\t  * Tab + 2 spaces" (tab to 4 + 2 spaces = 6)
825        assert_eq!(cache.list_items[2].indentation, 4); // "  \t* 2 spaces + tab" (2 spaces, then tab to next stop = 4)
826        assert_eq!(cache.list_items[3].indentation, 10); // "\t\t  * 2 tabs + 2 spaces" (2 tabs = 8 + 2 spaces = 10)
827
828        // Check nesting levels
829        assert_eq!(cache.list_items[0].nesting_level, 0);
830        assert_eq!(cache.list_items[1].nesting_level, 1);
831        assert_eq!(cache.list_items[2].nesting_level, 1);
832        assert_eq!(cache.list_items[3].nesting_level, 2);
833    }
834
835    #[test]
836    fn test_tab_width_configuration() {
837        // Test with different tab widths (default should be 4)
838        let content = "\t* Single tab\n\t\t* Double tab\n";
839        let cache = ElementCache::new(content);
840
841        assert_eq!(cache.list_items.len(), 2);
842
843        // With default tab width of 4
844        assert_eq!(cache.list_items[0].indentation, 4); // "\t*" = 4 spaces
845        assert_eq!(cache.list_items[1].indentation, 8); // "\t\t*" = 8 spaces
846
847        // Check nesting levels
848        assert_eq!(cache.list_items[0].nesting_level, 0);
849        assert_eq!(cache.list_items[1].nesting_level, 1);
850    }
851
852    #[test]
853    fn test_tab_expansion_debug() {
854        // Debug the tab expansion logic
855        assert_eq!(ElementCache::calculate_indentation_width_default(""), 0);
856        assert_eq!(ElementCache::calculate_indentation_width_default(" "), 1);
857        assert_eq!(ElementCache::calculate_indentation_width_default("  "), 2);
858        assert_eq!(ElementCache::calculate_indentation_width_default("    "), 4);
859        assert_eq!(ElementCache::calculate_indentation_width_default("\t"), 4);
860        assert_eq!(ElementCache::calculate_indentation_width_default("\t\t"), 8);
861        assert_eq!(ElementCache::calculate_indentation_width_default("\t  "), 6); // tab to 4, then 2 spaces = 6
862        assert_eq!(ElementCache::calculate_indentation_width_default("  \t"), 4); // 2 spaces, then tab to next stop (4)
863        assert_eq!(ElementCache::calculate_indentation_width_default("\t\t  "), 10);
864        // 2 tabs = 8, then 2 spaces = 10
865    }
866
867    #[test]
868    fn test_mixed_tabs_debug() {
869        // Debug the specific failing case
870        let content = "* Level 0\n\t  * Tab + 2 spaces (should be level 1)\n  \t* 2 spaces + tab (should be level 1)\n\t\t  * 2 tabs + 2 spaces (should be level 2)\n";
871        let cache = ElementCache::new(content);
872
873        println!("Number of list items: {}", cache.list_items.len());
874        for (i, item) in cache.list_items.iter().enumerate() {
875            println!(
876                "Item {}: indent_str={:?}, indentation={}, content={:?}",
877                i, item.indent_str, item.indentation, item.content
878            );
879        }
880
881        // Test the specific indentation strings
882        assert_eq!(ElementCache::calculate_indentation_width_default("\t  "), 6); // tab + 2 spaces
883        assert_eq!(ElementCache::calculate_indentation_width_default("  \t"), 4); // 2 spaces + tab
884        assert_eq!(ElementCache::calculate_indentation_width_default("\t\t  "), 10);
885        // 2 tabs + 2 spaces
886    }
887}
rumdl_lib/utils/element_cache.rs

rumdl_lib/utils/
element_cache.rs