rumdl_lib/utils/
element_cache.rs

1use fancy_regex::Regex as FancyRegex;
2use regex::Regex;
3use std::hash::{Hash, Hasher};
4use std::sync::LazyLock;
5use std::sync::{Arc, Mutex};
6
// Efficient regex patterns
// Fence opener/closer candidate: group 1 = leading whitespace, group 2 = the three-character
// fence marker (backticks or tildes), group 3 = the remainder of the line (info string).
static CODE_BLOCK_START_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(```|~~~)(.*)$").unwrap());
// Indented code candidate: at least four leading whitespace characters followed by some content.
static INDENTED_CODE_BLOCK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s{4,})(.+)$").unwrap());

// List detection patterns
// Both patterns expose the named groups `indent`, `marker`, `after` and `content`.
// `after` may be empty, so a marker that is not followed by whitespace still matches.
static UNORDERED_LIST_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| {
    FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>[*+-])(?P<after>[ \t]*)(?P<content>.*)$").unwrap()
});
// Ordered markers are one or more digits followed by a period.
static ORDERED_LIST_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| {
    FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>\d+\.)(?P<after>[ \t]*)(?P<content>.*)$").unwrap()
});

// Inline code span pattern
// Matches one maximal run of consecutive backticks (a code span delimiter).
static CODE_SPAN_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"`+").unwrap());
21
/// Represents a range in the document with start and end positions.
///
/// Positions are byte offsets into the document text. Consumers in this file treat the
/// range as half-open: `start` is included, `end` is excluded.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Range {
    /// Byte offset of the first byte covered by the range.
    pub start: usize,
    /// Byte offset one past the last byte covered by the range.
    pub end: usize,
}
28
/// Represents the type of code block
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CodeBlockType {
    /// Block delimited by fence lines (backticks or tildes).
    Fenced,
    /// Line recognised as code because of its leading indentation.
    Indented,
}
35
/// Represents a code block in the document
#[derive(Debug, Clone)]
pub struct CodeBlock {
    /// Span of the block within the document text.
    pub range: Range,
    /// Whether the block is fenced or indented.
    pub block_type: CodeBlockType,
    /// First line of the block (1-indexed). For fenced blocks this is the opening fence line.
    pub start_line: usize,
    /// Last line of the block (1-indexed, inclusive).
    pub end_line: usize,
    /// Trimmed text following the opening fence marker; `None` when that text is empty
    /// and for indented blocks.
    pub language: Option<String>,
}
45
/// Represents the type of list marker
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ListMarkerType {
    /// Unordered item introduced by `*`.
    Asterisk,
    /// Unordered item introduced by `+`.
    Plus,
    /// Unordered item introduced by `-`.
    Minus,
    /// Ordered item introduced by digits followed by a period (e.g. `1.`).
    Ordered,
}
54
/// Represents a list item in the document
#[derive(Debug, Clone)]
pub struct ListItem {
    /// Line on which the item starts (1-indexed).
    pub line_number: usize, // 1-indexed
    /// Visual width of the leading whitespace, with tabs expanded to 4-column tab stops.
    pub indentation: usize,
    /// The leading whitespace exactly as written (after any blockquote prefix was removed).
    pub indent_str: String, // Actual leading whitespace
    /// Kind of marker that introduces the item.
    pub marker_type: ListMarkerType,
    /// Marker text as written, e.g. `-`, `*`, `+` or `1.`.
    pub marker: String,
    /// Item text following the marker, with leading whitespace removed.
    pub content: String,
    /// Length in bytes of the space/tab run between the marker and the content.
    pub spaces_after_marker: usize,
    /// Depth of the item within its list hierarchy; top-level items are at level 0.
    pub nesting_level: usize,
    /// Parent list item, if any; `None` when no shallower item precedes this one at the
    /// same blockquote depth.
    pub parent_line_number: Option<usize>,
    /// Number of `>` markers that were stripped from the start of the line.
    pub blockquote_depth: usize,   // Number of leading blockquote markers
    /// The stripped blockquote markers, each followed by at most one space (e.g. "> > ").
    pub blockquote_prefix: String, // The actual prefix (e.g., "> > ")
}
70
/// Cache for Markdown document structural elements
/// This allows sharing computed data across multiple rule checks
///
/// All data is computed once in [`ElementCache::new`]; the document text itself is not
/// retained, only a hash of it.
#[derive(Debug, Default, Clone)]
pub struct ElementCache {
    // Document content hash for cache validation (avoids storing full content)
    content_hash: u64,

    // Code blocks
    // Fenced blocks are stored as one entry each; indented code is stored one entry per line.
    code_blocks: Vec<CodeBlock>,
    // Indexed by 0-based line index; sized to the number of lines in the document.
    code_block_line_map: Vec<bool>, // Line index -> is in code block

    // Code spans (inline code)
    // Each range covers the opening delimiter through the closing delimiter.
    code_spans: Vec<Range>,

    // Lists
    // Items are stored in document order.
    list_items: Vec<ListItem>,
    // Indexed by 0-based line index; sized to the number of lines in the document.
    list_line_map: Vec<bool>, // Line index -> is list item
}
89
90impl ElementCache {
91    /// Compute a hash of the content for cache validation
92    fn compute_content_hash(content: &str) -> u64 {
93        let mut hasher = std::collections::hash_map::DefaultHasher::new();
94        content.hash(&mut hasher);
95        hasher.finish()
96    }
97
98    /// Create a new cache from document content
99    pub fn new(content: &str) -> Self {
100        let content_hash = Self::compute_content_hash(content);
101        let line_count = content.lines().count();
102
103        let mut cache = ElementCache {
104            content_hash,
105            code_blocks: Vec::new(),
106            code_block_line_map: vec![false; line_count],
107            code_spans: Vec::new(),
108            list_items: Vec::new(),
109            list_line_map: vec![false; line_count],
110        };
111
112        // Populate the cache - pass content directly to avoid storing it
113        cache.populate_code_blocks(content);
114        cache.populate_code_spans(content);
115        cache.populate_list_items(content);
116
117        cache
118    }
119
120    /// Check if this cache is valid for the given content
121    pub fn is_valid_for(&self, content: &str) -> bool {
122        Self::compute_content_hash(content) == self.content_hash
123    }
124
125    /// Calculate the visual indentation width of a string, expanding tabs to spaces
126    /// Default tab width is 4 spaces
127    fn calculate_indentation_width(indent_str: &str, tab_width: usize) -> usize {
128        let mut width = 0;
129        for ch in indent_str.chars() {
130            if ch == '\t' {
131                // Round up to next tab stop
132                width = ((width / tab_width) + 1) * tab_width;
133            } else if ch == ' ' {
134                width += 1;
135            } else {
136                // Non-whitespace character, stop counting
137                break;
138            }
139        }
140        width
141    }
142
143    /// Calculate the visual indentation width using default tab width of 4
144    fn calculate_indentation_width_default(indent_str: &str) -> usize {
145        Self::calculate_indentation_width(indent_str, 4)
146    }
147
148    /// Check if a line is within a code block
149    pub fn is_in_code_block(&self, line_num: usize) -> bool {
150        if line_num == 0 || line_num > self.code_block_line_map.len() {
151            return false;
152        }
153        self.code_block_line_map[line_num - 1] // Convert 1-indexed to 0-indexed
154    }
155
156    /// Check if a position is within a code span
157    pub fn is_in_code_span(&self, position: usize) -> bool {
158        self.code_spans
159            .iter()
160            .any(|span| position >= span.start && position < span.end)
161    }
162
163    /// Check if a line is a list item
164    pub fn is_list_item(&self, line_num: usize) -> bool {
165        if line_num == 0 || line_num > self.list_line_map.len() {
166            return false;
167        }
168        self.list_line_map[line_num - 1] // Convert 1-indexed to 0-indexed
169    }
170
171    /// Get list item at line
172    pub fn get_list_item(&self, line_num: usize) -> Option<&ListItem> {
173        self.list_items.iter().find(|item| item.line_number == line_num)
174    }
175
176    /// Get all list items
177    pub fn get_list_items(&self) -> &[ListItem] {
178        &self.list_items
179    }
180
181    /// Get all code blocks
182    pub fn get_code_blocks(&self) -> &[CodeBlock] {
183        &self.code_blocks
184    }
185
186    /// Get all code spans
187    pub fn get_code_spans(&self) -> &[Range] {
188        &self.code_spans
189    }
190
191    /// Detect and populate code blocks
192    fn populate_code_blocks(&mut self, content: &str) {
193        let lines: Vec<&str> = content.lines().collect();
194        let mut in_fenced_block = false;
195        let mut fence_marker = String::new();
196        let mut block_start_line = 0;
197        let mut block_language = String::new();
198
199        for (i, line) in lines.iter().enumerate() {
200            if in_fenced_block {
201                // Already in a fenced code block, look for the end
202                self.code_block_line_map[i] = true;
203
204                if line.trim().starts_with(&fence_marker) {
205                    // End of code block
206                    let start_pos =
207                        lines[0..block_start_line].join("\n").len() + if block_start_line > 0 { 1 } else { 0 };
208                    let end_pos = lines[0..=i].join("\n").len();
209
210                    self.code_blocks.push(CodeBlock {
211                        range: Range {
212                            start: start_pos,
213                            end: end_pos,
214                        },
215                        block_type: CodeBlockType::Fenced,
216                        start_line: block_start_line + 1, // 1-indexed
217                        end_line: i + 1,                  // 1-indexed
218                        language: if !block_language.is_empty() {
219                            Some(block_language.clone())
220                        } else {
221                            None
222                        },
223                    });
224
225                    in_fenced_block = false;
226                    fence_marker.clear();
227                    block_language.clear();
228                }
229            } else if let Some(caps) = CODE_BLOCK_START_REGEX.captures(line) {
230                // Start of a new code block
231                fence_marker = caps.get(2).map_or("```", |m| m.as_str()).to_string();
232                in_fenced_block = true;
233                block_start_line = i;
234                block_language = caps.get(3).map_or("", |m| m.as_str().trim()).to_string();
235                self.code_block_line_map[i] = true;
236            } else if INDENTED_CODE_BLOCK_REGEX.is_match(line) {
237                // Only mark as indented code block if not a list item
238                let is_unordered_list = UNORDERED_LIST_REGEX.is_match(line).unwrap_or(false);
239                let is_ordered_list = ORDERED_LIST_REGEX.is_match(line).unwrap_or(false);
240                if !is_unordered_list && !is_ordered_list {
241                    // Indented code block
242                    self.code_block_line_map[i] = true;
243                    // For indented code blocks, we handle them as individual lines
244                    // We don't track them as blocks with start/end because they can be
245                    // interrupted by blank lines, etc.
246                    let start_pos = lines[0..i].join("\n").len() + if i > 0 { 1 } else { 0 };
247                    let end_pos = start_pos + line.len();
248                    self.code_blocks.push(CodeBlock {
249                        range: Range {
250                            start: start_pos,
251                            end: end_pos,
252                        },
253                        block_type: CodeBlockType::Indented,
254                        start_line: i + 1, // 1-indexed
255                        end_line: i + 1,   // 1-indexed
256                        language: None,
257                    });
258                }
259            }
260        }
261
262        // Handle unclosed code block
263        if in_fenced_block {
264            let start_pos = lines[0..block_start_line].join("\n").len() + if block_start_line > 0 { 1 } else { 0 };
265            let end_pos = content.len();
266
267            self.code_blocks.push(CodeBlock {
268                range: Range {
269                    start: start_pos,
270                    end: end_pos,
271                },
272                block_type: CodeBlockType::Fenced,
273                start_line: block_start_line + 1, // 1-indexed
274                end_line: lines.len(),            // 1-indexed
275                language: if !block_language.is_empty() {
276                    Some(block_language)
277                } else {
278                    None
279                },
280            });
281        }
282    }
283
284    /// Detect and populate code spans
285    fn populate_code_spans(&mut self, content: &str) {
286        // Find inline code spans using regex for backticks
287        let mut i = 0;
288        while i < content.len() {
289            if let Some(m) = CODE_SPAN_REGEX.find_at(content, i) {
290                let backtick_length = m.end() - m.start();
291                let start = m.start();
292
293                // Find matching closing backticks
294                if let Some(end_pos) = content[m.end()..].find(&"`".repeat(backtick_length)) {
295                    let end = m.end() + end_pos + backtick_length;
296                    self.code_spans.push(Range { start, end });
297                    i = end;
298                } else {
299                    i = m.end();
300                }
301            } else {
302                break;
303            }
304        }
305    }
306
307    /// Detect and populate list items
308    fn populate_list_items(&mut self, content: &str) {
309        let lines: Vec<&str> = content.lines().collect();
310        let mut prev_items: Vec<(usize, usize, usize)> = Vec::new(); // (blockquote_depth, nesting_level, line_number)
311        for (i, line) in lines.iter().enumerate() {
312            // Skip blank lines but don't reset nesting context
313            if line.trim().is_empty() {
314                continue;
315            }
316            // Parse and strip blockquote prefix
317            let (blockquote_depth, blockquote_prefix, rest) = Self::parse_blockquote_prefix(line);
318            // Always call parse_list_item and always push if Some
319            if let Some(item) = self.parse_list_item(
320                rest,
321                i + 1,
322                &mut prev_items,
323                blockquote_depth,
324                blockquote_prefix.clone(),
325            ) {
326                self.list_items.push(item);
327                self.list_line_map[i] = true;
328            }
329        }
330    }
331
332    /// Parse and strip all leading blockquote markers, returning (depth, prefix, rest_of_line)
333    fn parse_blockquote_prefix(line: &str) -> (usize, String, &str) {
334        let mut rest = line;
335        let mut prefix = String::new();
336        let mut depth = 0;
337        loop {
338            let trimmed = rest.trim_start();
339            if let Some(after) = trimmed.strip_prefix('>') {
340                // Find the '>' and a single optional space
341                let mut chars = after.chars();
342                let mut space_count = 0;
343                if let Some(' ') = chars.next() {
344                    space_count = 1;
345                }
346                let (spaces, after_marker) = after.split_at(space_count);
347                prefix.push('>');
348                prefix.push_str(spaces);
349                rest = after_marker;
350                depth += 1;
351            } else {
352                break;
353            }
354        }
355        (depth, prefix, rest)
356    }
357
    /// Calculate the nesting level for a list item, considering blockquote depth
    ///
    /// `indent` is the visual indentation of the new item and `blockquote_depth` the number
    /// of blockquote markers stripped from its line. `prev_items` is a stack whose entries
    /// are `(blockquote_depth, indent, nesting_level)`; only entries with the same
    /// blockquote depth as the new item influence the result.
    ///
    /// The level is derived by comparing `indent` with the most recent entry at the same
    /// blockquote depth: deeper indentation nests one level further, equal indentation keeps
    /// the level, and shallower indentation falls back to an earlier entry (see the inline
    /// comments). With no such entry the level is 0.
    ///
    /// Side effect: entries on top of the stack with the same blockquote depth and an indent
    /// greater than or equal to `indent` are popped, then an entry for the new item is pushed.
    fn calculate_nesting_level(
        &self,
        indent: usize,
        blockquote_depth: usize,
        prev_items: &mut Vec<(usize, usize, usize)>,
    ) -> usize {
        let mut nesting_level = 0;

        // Only consider previous items with the same blockquote depth
        if let Some(&(_last_bq, last_indent, last_level)) =
            prev_items.iter().rev().find(|(bq, _, _)| *bq == blockquote_depth)
        {
            use std::cmp::Ordering;
            match indent.cmp(&last_indent) {
                Ordering::Greater => {
                    // More indented - increase nesting level
                    nesting_level = last_level + 1;
                }
                Ordering::Equal => {
                    // Same indentation - same level
                    nesting_level = last_level;
                }
                Ordering::Less => {
                    // Less indented - find the appropriate level
                    let mut found_level = None;

                    // First look for exact match
                    for &(prev_bq, prev_indent, prev_level) in prev_items.iter().rev() {
                        if prev_bq == blockquote_depth && prev_indent == indent {
                            found_level = Some(prev_level);
                            break;
                        }
                    }

                    // If no exact match, check if this is a case where we should treat similar indentations as same level
                    // This handles mixed tab/space scenarios where 4 and 6 spaces should be at the same level
                    if found_level.is_none() && indent > 0 && last_indent > 0 {
                        // Only apply similar indentation logic if the difference is small and we're dealing with small indentations
                        let diff = (indent as i32 - last_indent as i32).abs();
                        if diff <= 2 && indent <= 8 && last_indent <= 8 {
                            // Check if there's a recent item at a lower indentation level
                            // (only the three most recent stack entries are inspected)
                            let has_lower_indent = prev_items.iter().rev().take(3).any(|(bq, prev_indent, _)| {
                                *bq == blockquote_depth && *prev_indent < indent.min(last_indent)
                            });
                            if has_lower_indent {
                                found_level = Some(last_level);
                            }
                        }
                    }

                    // If still no match, look for the most recent less indented item
                    if found_level.is_none() {
                        for &(prev_bq, prev_indent, prev_level) in prev_items.iter().rev() {
                            if prev_bq == blockquote_depth && prev_indent < indent {
                                found_level = Some(prev_level);
                                break;
                            }
                        }
                    }

                    // Nothing comparable on the stack: treat the item as top-level.
                    nesting_level = found_level.unwrap_or(0);
                }
            }
        }

        // Remove stack entries with indent >= current indent and same blockquote depth
        // (popping stops at the first entry from a different blockquote depth)
        while let Some(&(prev_bq, prev_indent, _)) = prev_items.last() {
            if prev_bq != blockquote_depth || prev_indent < indent {
                break;
            }
            prev_items.pop();
        }
        // Record the new item as the current innermost level.
        prev_items.push((blockquote_depth, indent, nesting_level));
        nesting_level
    }
434
435    /// Parse a line as a list item and determine its nesting level
436    fn parse_list_item(
437        &self,
438        line: &str,
439        line_num: usize,
440        prev_items: &mut Vec<(usize, usize, usize)>,
441        blockquote_depth: usize,
442        blockquote_prefix: String,
443    ) -> Option<ListItem> {
444        match UNORDERED_LIST_REGEX.captures(line) {
445            Ok(Some(captures)) => {
446                let indent_str = captures.name("indent").map_or("", |m| m.as_str()).to_string();
447                let indentation = Self::calculate_indentation_width_default(&indent_str);
448                let marker = captures.name("marker").unwrap().as_str();
449                let after = captures.name("after").map_or("", |m| m.as_str());
450                let spaces = after.len();
451                let raw_content = captures.name("content").map_or("", |m| m.as_str());
452                let content = raw_content.trim_start().to_string();
453                let marker_type = match marker {
454                    "*" => ListMarkerType::Asterisk,
455                    "+" => ListMarkerType::Plus,
456                    "-" => ListMarkerType::Minus,
457                    other => {
458                        // This should never happen due to regex validation,
459                        // but default to dash if it does
460                        eprintln!("Warning: Unexpected list marker '{other}', defaulting to dash");
461                        ListMarkerType::Minus
462                    }
463                };
464                let nesting_level = self.calculate_nesting_level(indentation, blockquote_depth, prev_items);
465                // Find parent: most recent previous item with lower nesting_level and same blockquote depth
466                let parent_line_number = prev_items
467                    .iter()
468                    .rev()
469                    .find(|(bq, _, level)| *bq == blockquote_depth && *level < nesting_level)
470                    .map(|(_, _, line_num)| *line_num);
471                return Some(ListItem {
472                    line_number: line_num,
473                    indentation,
474                    indent_str,
475                    marker_type,
476                    marker: marker.to_string(),
477                    content,
478                    spaces_after_marker: spaces,
479                    nesting_level,
480                    parent_line_number,
481                    blockquote_depth,
482                    blockquote_prefix,
483                });
484            }
485            Ok(None) => {
486                // No debug output
487            }
488            Err(_) => {}
489        }
490        match ORDERED_LIST_REGEX.captures(line) {
491            Ok(Some(captures)) => {
492                let indent_str = captures.name("indent").map_or("", |m| m.as_str()).to_string();
493                let indentation = Self::calculate_indentation_width_default(&indent_str);
494                let marker = captures.name("marker").unwrap().as_str();
495                let spaces = captures.name("after").map_or(0, |m| m.as_str().len());
496                let content = captures
497                    .name("content")
498                    .map_or("", |m| m.as_str())
499                    .trim_start()
500                    .to_string();
501                let nesting_level = self.calculate_nesting_level(indentation, blockquote_depth, prev_items);
502                // Find parent: most recent previous item with lower nesting_level and same blockquote depth
503                let parent_line_number = prev_items
504                    .iter()
505                    .rev()
506                    .find(|(bq, _, level)| *bq == blockquote_depth && *level < nesting_level)
507                    .map(|(_, _, line_num)| *line_num);
508                return Some(ListItem {
509                    line_number: line_num,
510                    indentation,
511                    indent_str,
512                    marker_type: ListMarkerType::Ordered,
513                    marker: marker.to_string(),
514                    content,
515                    spaces_after_marker: spaces,
516                    nesting_level,
517                    parent_line_number,
518                    blockquote_depth,
519                    blockquote_prefix,
520                });
521            }
522            Ok(None) => {}
523            Err(_) => {}
524        }
525        None
526    }
527}
528
// Global cache for sharing across threads
// Holds at most one entry: the cache most recently stored by `get_element_cache`.
static ELEMENT_CACHE: LazyLock<Arc<Mutex<Option<ElementCache>>>> = LazyLock::new(|| Arc::new(Mutex::new(None)));
531
532/// Get or create element cache for document content
533///
534/// If the mutex is poisoned, creates a fresh cache without storing it globally.
535/// This ensures the library never panics due to mutex poisoning.
536pub fn get_element_cache(content: &str) -> ElementCache {
537    // Try to get existing cache
538    if let Ok(cache_guard) = ELEMENT_CACHE.lock() {
539        // If cache exists and content matches (by hash), return it
540        if let Some(existing_cache) = &*cache_guard
541            && existing_cache.is_valid_for(content)
542        {
543            return existing_cache.clone();
544        }
545    }
546
547    // Content doesn't match or mutex poisoned, create new cache
548    let new_cache = ElementCache::new(content);
549
550    // Store in global cache (ignore if mutex is poisoned)
551    if let Ok(mut cache_guard) = ELEMENT_CACHE.lock() {
552        *cache_guard = Some(new_cache.clone());
553    }
554
555    new_cache
556}
557
558/// Reset the element cache
559///
560/// If the mutex is poisoned, this is a no-op.
561pub fn reset_element_cache() {
562    if let Ok(mut cache_guard) = ELEMENT_CACHE.lock() {
563        *cache_guard = None;
564    }
565}
566
567#[cfg(test)]
568mod tests {
569    use super::*;
570
571    #[test]
572    fn test_code_block_detection() {
573        let content = "Regular text\n\n```rust\nfn main() {\n    println!(\"Hello\");\n}\n```\n\nMore text";
574        let cache = ElementCache::new(content);
575
576        assert_eq!(cache.code_blocks.len(), 1);
577        assert_eq!(cache.code_blocks[0].start_line, 3);
578        assert_eq!(cache.code_blocks[0].end_line, 7);
579        assert_eq!(cache.code_blocks[0].block_type, CodeBlockType::Fenced);
580        assert_eq!(cache.code_blocks[0].language, Some("rust".to_string()));
581
582        assert!(!cache.is_in_code_block(1));
583        assert!(!cache.is_in_code_block(2));
584        assert!(cache.is_in_code_block(3));
585        assert!(cache.is_in_code_block(4));
586        assert!(cache.is_in_code_block(5));
587        assert!(cache.is_in_code_block(6));
588        assert!(cache.is_in_code_block(7));
589        assert!(!cache.is_in_code_block(8));
590        assert!(!cache.is_in_code_block(9));
591    }
592
593    #[test]
594    fn test_list_item_detection_simple() {
595        let content =
596            "# Heading\n\n- First item\n  - Nested item\n- Second item\n\n1. Ordered item\n   1. Nested ordered\n";
597        let cache = ElementCache::new(content);
598        assert_eq!(cache.list_items.len(), 5);
599        // Check the first item
600        assert_eq!(cache.list_items[0].line_number, 3);
601        assert_eq!(cache.list_items[0].marker, "-");
602        assert_eq!(cache.list_items[0].nesting_level, 0);
603        // Check the nested item
604        assert_eq!(cache.list_items[1].line_number, 4);
605        assert_eq!(cache.list_items[1].marker, "-");
606        assert_eq!(cache.list_items[1].nesting_level, 1);
607        // Check the second list item
608        assert_eq!(cache.list_items[2].line_number, 5);
609        assert_eq!(cache.list_items[2].marker, "-");
610        assert_eq!(cache.list_items[2].nesting_level, 0);
611        // Check ordered list item
612        assert_eq!(cache.list_items[3].line_number, 7);
613        assert_eq!(cache.list_items[3].marker, "1.");
614        assert_eq!(cache.list_items[3].nesting_level, 0);
615        // Check nested ordered list item
616        assert_eq!(cache.list_items[4].line_number, 8);
617        assert_eq!(cache.list_items[4].marker, "1.");
618        assert_eq!(cache.list_items[4].nesting_level, 1);
619    }
620
621    #[test]
622    fn test_list_item_detection_complex() {
623        let complex = "  * Level 1 item 1\n    - Level 2 item 1\n      + Level 3 item 1\n    - Level 2 item 2\n  * Level 1 item 2\n\n* Top\n  + Nested\n    - Deep\n      * Deeper\n        + Deepest\n";
624        let cache = ElementCache::new(complex);
625
626        // Should detect all 10 list items
627        assert_eq!(cache.list_items.len(), 10);
628        // Check markers and nesting levels
629        assert_eq!(cache.list_items[0].marker, "*");
630        assert_eq!(cache.list_items[0].nesting_level, 0);
631        assert_eq!(cache.list_items[1].marker, "-");
632        assert_eq!(cache.list_items[1].nesting_level, 1);
633        assert_eq!(cache.list_items[2].marker, "+");
634        assert_eq!(cache.list_items[2].nesting_level, 2);
635        assert_eq!(cache.list_items[3].marker, "-");
636        assert_eq!(cache.list_items[3].nesting_level, 1);
637        assert_eq!(cache.list_items[4].marker, "*");
638        assert_eq!(cache.list_items[4].nesting_level, 0);
639        assert_eq!(cache.list_items[5].marker, "*");
640        assert_eq!(cache.list_items[5].nesting_level, 0);
641        assert_eq!(cache.list_items[6].marker, "+");
642        assert_eq!(cache.list_items[6].nesting_level, 1);
643        assert_eq!(cache.list_items[7].marker, "-");
644        assert_eq!(cache.list_items[7].nesting_level, 2);
645        assert_eq!(cache.list_items[8].marker, "*");
646        assert_eq!(cache.list_items[8].nesting_level, 3);
647        assert_eq!(cache.list_items[9].marker, "+");
648        assert_eq!(cache.list_items[9].nesting_level, 4);
649        let expected_nesting = vec![0, 1, 2, 1, 0, 0, 1, 2, 3, 4];
650        let actual_nesting: Vec<_> = cache.list_items.iter().map(|item| item.nesting_level).collect();
651        assert_eq!(
652            actual_nesting, expected_nesting,
653            "Nesting levels should match expected values"
654        );
655    }
656
657    #[test]
658    fn test_list_item_detection_edge() {
659        let edge = "* Item 1\n\n    - Nested 1\n  + Nested 2\n\n* Item 2\n";
660        let cache = ElementCache::new(edge);
661        assert_eq!(cache.list_items.len(), 4);
662
663        // Check correct nesting levels according to CommonMark:
664        // * Item 1 (indent=0) -> level 0
665        // - Nested 1 (indent=4) -> level 1 (nested under Item 1)
666        // + Nested 2 (indent=2) -> level 1 (nested under Item 1)
667        // * Item 2 (indent=0) -> level 0
668        let expected_nesting = vec![0, 1, 1, 0];
669        let actual_nesting: Vec<_> = cache.list_items.iter().map(|item| item.nesting_level).collect();
670        assert_eq!(
671            actual_nesting, expected_nesting,
672            "Nesting levels should be calculated based on indentation, not reset by blank lines"
673        );
674    }
675
676    #[test]
677    fn test_code_span_detection() {
678        let content = "Here is some `inline code` and here are ``nested `code` spans``";
679        let cache = ElementCache::new(content);
680
681        // Should have two code spans
682        assert_eq!(cache.code_spans.len(), 2);
683
684        // Check spans
685        let span1_content = &content[cache.code_spans[0].start..cache.code_spans[0].end];
686        assert_eq!(span1_content, "`inline code`");
687
688        let span2_content = &content[cache.code_spans[1].start..cache.code_spans[1].end];
689        assert_eq!(span2_content, "``nested `code` spans``");
690    }
691
692    #[test]
693    fn test_get_element_cache() {
694        let content1 = "Test content";
695        let content2 = "Different content";
696
697        // First call should create a new cache
698        let cache1 = get_element_cache(content1);
699
700        // Second call with same content should return the same cache
701        let cache2 = get_element_cache(content1);
702
703        // Third call with different content should create new cache
704        let cache3 = get_element_cache(content2);
705
706        // Verify caches are valid for their respective content
707        assert!(cache1.is_valid_for(content1));
708        assert!(cache2.is_valid_for(content1));
709        assert!(cache3.is_valid_for(content2));
710
711        // Verify caches are NOT valid for different content
712        assert!(!cache1.is_valid_for(content2));
713        assert!(!cache3.is_valid_for(content1));
714    }
715
716    #[test]
717    fn test_list_item_detection_deep_nesting_and_edge_cases() {
718        // Deeply nested unordered lists, mixed markers, excessive indentation, tabs, and blank lines
719        let content = "\
720* Level 1
721  - Level 2
722    + Level 3
723      * Level 4
724        - Level 5
725          + Level 6
726* Sibling 1
727    * Sibling 2
728\n    - After blank line, not nested\n\n\t* Tab indented\n        * 8 spaces indented\n* After excessive indent\n";
729        let cache = ElementCache::new(content);
730        // Should detect all lines that start with a valid unordered list marker
731        let _expected_markers = ["*", "-", "+", "*", "-", "+", "*", "*", "-", "*", "*", "*"];
732        let _expected_indents = [0, 4, 8, 0, 4, 8, 0, 4, 8, 12, 16, 20];
733        let expected_content = vec![
734            "Level 1",
735            "Level 2",
736            "Level 3",
737            "Level 4",
738            "Level 5",
739            "Level 6",
740            "Sibling 1",
741            "Sibling 2",
742            "After blank line, not nested",
743            "Tab indented",      // Content after marker
744            "8 spaces indented", // Content after marker
745            "After excessive indent",
746        ];
747        let actual_content: Vec<_> = cache.list_items.iter().map(|item| item.content.clone()).collect();
748        assert_eq!(
749            actual_content, expected_content,
750            "List item contents should match expected values"
751        );
752        // Updated expected nesting levels based on correct CommonMark behavior:
753        // Blank lines should NOT reset nesting context
754        let expected_nesting = vec![0, 1, 2, 3, 4, 5, 0, 1, 1, 1, 2, 0];
755        let actual_nesting: Vec<_> = cache.list_items.iter().map(|item| item.nesting_level).collect();
756        assert_eq!(
757            actual_nesting, expected_nesting,
758            "Nesting levels should match expected values"
759        );
760        // Check that tab-indented and 8-space-indented items are detected
761        assert!(
762            cache
763                .list_items
764                .iter()
765                .any(|item| item.marker == "*" && item.indentation >= 1),
766            "Tab or 8-space indented item not detected"
767        );
768        // Check that after blank lines, items maintain correct nesting based on indentation
769        let after_blank = cache
770            .list_items
771            .iter()
772            .find(|item| item.content.contains("After blank line"));
773        assert!(after_blank.is_some());
774        assert_eq!(
775            after_blank.unwrap().nesting_level,
776            1,
777            "Item after blank line should maintain nesting based on indentation"
778        );
779    }
780
781    #[test]
782    fn test_tab_indentation_calculation() {
783        // Test that tabs are properly converted to spaces for indentation calculation
784        let content = "* Level 0\n\t* Tab indented (should be level 1)\n\t\t* Double tab (should be level 2)\n    * 4 spaces (should be level 1)\n        * 8 spaces (should be level 2)\n";
785        let cache = ElementCache::new(content);
786
787        assert_eq!(cache.list_items.len(), 5);
788
789        // Check indentation values (tabs should be converted to spaces)
790        assert_eq!(cache.list_items[0].indentation, 0); // "* Level 0"
791        assert_eq!(cache.list_items[1].indentation, 4); // "\t* Tab indented" (tab = 4 spaces)
792        assert_eq!(cache.list_items[2].indentation, 8); // "\t\t* Double tab" (2 tabs = 8 spaces)
793        assert_eq!(cache.list_items[3].indentation, 4); // "    * 4 spaces"
794        assert_eq!(cache.list_items[4].indentation, 8); // "        * 8 spaces"
795
796        // Check nesting levels
797        assert_eq!(cache.list_items[0].nesting_level, 0);
798        assert_eq!(cache.list_items[1].nesting_level, 1);
799        assert_eq!(cache.list_items[2].nesting_level, 2);
800        assert_eq!(cache.list_items[3].nesting_level, 1);
801        assert_eq!(cache.list_items[4].nesting_level, 2);
802    }
803
804    #[test]
805    fn test_mixed_tabs_and_spaces_indentation() {
806        // Test mixed tabs and spaces
807        let content = "* Level 0\n\t  * Tab + 2 spaces (should be level 1)\n  \t* 2 spaces + tab (should be level 1)\n\t\t  * 2 tabs + 2 spaces (should be level 2)\n";
808
809        // Clear any cached data to ensure fresh parsing
810        reset_element_cache();
811        let cache = ElementCache::new(content);
812
813        assert_eq!(cache.list_items.len(), 4);
814
815        // Check indentation values
816        assert_eq!(cache.list_items[0].indentation, 0); // "* Level 0"
817        assert_eq!(cache.list_items[1].indentation, 6); // "\t  * Tab + 2 spaces" (tab to 4 + 2 spaces = 6)
818        assert_eq!(cache.list_items[2].indentation, 4); // "  \t* 2 spaces + tab" (2 spaces, then tab to next stop = 4)
819        assert_eq!(cache.list_items[3].indentation, 10); // "\t\t  * 2 tabs + 2 spaces" (2 tabs = 8 + 2 spaces = 10)
820
821        // Check nesting levels
822        assert_eq!(cache.list_items[0].nesting_level, 0);
823        assert_eq!(cache.list_items[1].nesting_level, 1);
824        assert_eq!(cache.list_items[2].nesting_level, 1);
825        assert_eq!(cache.list_items[3].nesting_level, 2);
826    }
827
828    #[test]
829    fn test_tab_width_configuration() {
830        // Test with different tab widths (default should be 4)
831        let content = "\t* Single tab\n\t\t* Double tab\n";
832        let cache = ElementCache::new(content);
833
834        assert_eq!(cache.list_items.len(), 2);
835
836        // With default tab width of 4
837        assert_eq!(cache.list_items[0].indentation, 4); // "\t*" = 4 spaces
838        assert_eq!(cache.list_items[1].indentation, 8); // "\t\t*" = 8 spaces
839
840        // Check nesting levels
841        assert_eq!(cache.list_items[0].nesting_level, 0);
842        assert_eq!(cache.list_items[1].nesting_level, 1);
843    }
844
845    #[test]
846    fn test_tab_expansion_debug() {
847        // Debug the tab expansion logic
848        assert_eq!(ElementCache::calculate_indentation_width_default(""), 0);
849        assert_eq!(ElementCache::calculate_indentation_width_default(" "), 1);
850        assert_eq!(ElementCache::calculate_indentation_width_default("  "), 2);
851        assert_eq!(ElementCache::calculate_indentation_width_default("    "), 4);
852        assert_eq!(ElementCache::calculate_indentation_width_default("\t"), 4);
853        assert_eq!(ElementCache::calculate_indentation_width_default("\t\t"), 8);
854        assert_eq!(ElementCache::calculate_indentation_width_default("\t  "), 6); // tab to 4, then 2 spaces = 6
855        assert_eq!(ElementCache::calculate_indentation_width_default("  \t"), 4); // 2 spaces, then tab to next stop (4)
856        assert_eq!(ElementCache::calculate_indentation_width_default("\t\t  "), 10);
857        // 2 tabs = 8, then 2 spaces = 10
858    }
859
860    #[test]
861    fn test_mixed_tabs_debug() {
862        // Debug the specific failing case
863        let content = "* Level 0\n\t  * Tab + 2 spaces (should be level 1)\n  \t* 2 spaces + tab (should be level 1)\n\t\t  * 2 tabs + 2 spaces (should be level 2)\n";
864        let cache = ElementCache::new(content);
865
866        println!("Number of list items: {}", cache.list_items.len());
867        for (i, item) in cache.list_items.iter().enumerate() {
868            println!(
869                "Item {}: indent_str={:?}, indentation={}, content={:?}",
870                i, item.indent_str, item.indentation, item.content
871            );
872        }
873
874        // Test the specific indentation strings
875        assert_eq!(ElementCache::calculate_indentation_width_default("\t  "), 6); // tab + 2 spaces
876        assert_eq!(ElementCache::calculate_indentation_width_default("  \t"), 4); // 2 spaces + tab
877        assert_eq!(ElementCache::calculate_indentation_width_default("\t\t  "), 10);
878        // 2 tabs + 2 spaces
879    }
880}
rumdl_lib/utils/element_cache.rs

rumdl_lib/utils/
element_cache.rs