rumdl_lib/utils/
element_cache.rs

1use fancy_regex::Regex as FancyRegex;
2use lazy_static::lazy_static;
3use regex::Regex;
4use std::sync::{Arc, Mutex};
5
lazy_static! {
    // Efficient regex patterns
    // Opening fence: leading whitespace (group 1), a ``` or ~~~ marker (group 2) and the
    // remainder of the line (group 3), which `populate_code_blocks` trims and stores as the language.
    static ref CODE_BLOCK_START_REGEX: Regex = Regex::new(r"^(\s*)(```|~~~)(.*)$").unwrap();
    // A fence marker followed only by whitespace.
    // NOTE(review): not referenced in the visible part of this file; `populate_code_blocks`
    // detects the closing fence with a `starts_with` check instead — confirm it is still needed.
    static ref CODE_BLOCK_END_REGEX: Regex = Regex::new(r"^(\s*)(```|~~~)\s*$").unwrap();
    // A line that begins with at least four whitespace characters and has further content.
    static ref INDENTED_CODE_BLOCK_REGEX: Regex = Regex::new(r"^(\s{4,})(.+)$").unwrap();

    // List detection patterns
    // Both expose the named groups `indent`, `marker`, `after` and `content`, which
    // `parse_list_item` reads. The whitespace after the marker is optional (`[ \t]*`).
    static ref UNORDERED_LIST_REGEX: FancyRegex = FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>[*+-])(?P<after>[ \t]*)(?P<content>.*)$").unwrap();
    static ref ORDERED_LIST_REGEX: FancyRegex = FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>\d+\.)(?P<after>[ \t]*)(?P<content>.*)$").unwrap();

    // Inline code span pattern
    // Matches one maximal run of backticks; `populate_code_spans` pairs runs up into spans.
    static ref CODE_SPAN_REGEX: Regex = Regex::new(r"`+").unwrap();
}
19
/// A half-open span `[start, end)` of byte offsets into the document text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Range {
    /// Offset of the first byte covered by the span.
    pub start: usize,
    /// Offset one past the last byte covered by the span.
    pub end: usize,
}
26
/// How a code block is delimited in the Markdown source.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CodeBlockType {
    /// Delimited by ``` or ~~~ fence lines.
    Fenced,
    /// Marked by leading indentation of four or more whitespace characters.
    Indented,
}
33
34/// Represents a code block in the document
35#[derive(Debug, Clone)]
36pub struct CodeBlock {
37    pub range: Range,
38    pub block_type: CodeBlockType,
39    pub start_line: usize,
40    pub end_line: usize,
41    pub language: Option<String>,
42}
43
/// The kind of marker that introduces a list item.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ListMarkerType {
    /// `*` bullet.
    Asterisk,
    /// `+` bullet.
    Plus,
    /// `-` bullet.
    Minus,
    /// Digits followed by a period, e.g. `1.`.
    Ordered,
}
52
53/// Represents a list item in the document
54#[derive(Debug, Clone)]
55pub struct ListItem {
56    pub line_number: usize, // 1-indexed
57    pub indentation: usize,
58    pub indent_str: String, // Actual leading whitespace
59    pub marker_type: ListMarkerType,
60    pub marker: String,
61    pub content: String,
62    pub spaces_after_marker: usize,
63    pub nesting_level: usize,
64    pub parent_line_number: Option<usize>,
65    pub blockquote_depth: usize,   // Number of leading blockquote markers
66    pub blockquote_prefix: String, // The actual prefix (e.g., "> > ")
67}
68
69/// Cache for Markdown document structural elements
70/// This allows sharing computed data across multiple rule checks
71#[derive(Debug, Default, Clone)]
72pub struct ElementCache {
73    // Document content and metadata
74    content: Option<String>,
75    line_count: usize,
76
77    // Code blocks
78    code_blocks: Vec<CodeBlock>,
79    code_block_line_map: Vec<bool>, // Line index -> is in code block
80
81    // Code spans (inline code)
82    code_spans: Vec<Range>,
83
84    // Lists
85    list_items: Vec<ListItem>,
86    list_line_map: Vec<bool>, // Line index -> is list item
87}
88
89impl ElementCache {
90    /// Create a new cache from document content
91    pub fn new(content: &str) -> Self {
92        let mut cache = ElementCache {
93            content: Some(content.to_string()),
94            line_count: content.lines().count(),
95            code_blocks: Vec::new(),
96            code_block_line_map: Vec::new(),
97            code_spans: Vec::new(),
98            list_items: Vec::new(),
99            list_line_map: Vec::new(),
100        };
101
102        // Initialize maps
103        cache.code_block_line_map = vec![false; cache.line_count];
104        cache.list_line_map = vec![false; cache.line_count];
105
106        // Populate the cache
107        cache.populate_code_blocks();
108        cache.populate_code_spans();
109        cache.populate_list_items();
110
111        cache
112    }
113
114    /// Calculate the visual indentation width of a string, expanding tabs to spaces
115    /// Default tab width is 4 spaces
116    fn calculate_indentation_width(indent_str: &str, tab_width: usize) -> usize {
117        let mut width = 0;
118        for ch in indent_str.chars() {
119            if ch == '\t' {
120                // Round up to next tab stop
121                width = ((width / tab_width) + 1) * tab_width;
122            } else if ch == ' ' {
123                width += 1;
124            } else {
125                // Non-whitespace character, stop counting
126                break;
127            }
128        }
129        width
130    }
131
132    /// Calculate the visual indentation width using default tab width of 4
133    fn calculate_indentation_width_default(indent_str: &str) -> usize {
134        Self::calculate_indentation_width(indent_str, 4)
135    }
136
137    /// Check if a line is within a code block
138    pub fn is_in_code_block(&self, line_num: usize) -> bool {
139        if line_num == 0 || line_num > self.code_block_line_map.len() {
140            return false;
141        }
142        self.code_block_line_map[line_num - 1] // Convert 1-indexed to 0-indexed
143    }
144
145    /// Check if a position is within a code span
146    pub fn is_in_code_span(&self, position: usize) -> bool {
147        self.code_spans
148            .iter()
149            .any(|span| position >= span.start && position < span.end)
150    }
151
152    /// Check if a line is a list item
153    pub fn is_list_item(&self, line_num: usize) -> bool {
154        if line_num == 0 || line_num > self.list_line_map.len() {
155            return false;
156        }
157        self.list_line_map[line_num - 1] // Convert 1-indexed to 0-indexed
158    }
159
160    /// Get list item at line
161    pub fn get_list_item(&self, line_num: usize) -> Option<&ListItem> {
162        self.list_items.iter().find(|item| item.line_number == line_num)
163    }
164
165    /// Get all list items
166    pub fn get_list_items(&self) -> &[ListItem] {
167        &self.list_items
168    }
169
170    /// Get all code blocks
171    pub fn get_code_blocks(&self) -> &[CodeBlock] {
172        &self.code_blocks
173    }
174
175    /// Get all code spans
176    pub fn get_code_spans(&self) -> &[Range] {
177        &self.code_spans
178    }
179
180    /// Detect and populate code blocks
181    fn populate_code_blocks(&mut self) {
182        if let Some(content) = &self.content {
183            let lines: Vec<&str> = content.lines().collect();
184            let mut in_fenced_block = false;
185            let mut fence_marker = String::new();
186            let mut block_start_line = 0;
187            let mut block_language = String::new();
188
189            for (i, line) in lines.iter().enumerate() {
190                if in_fenced_block {
191                    // Already in a fenced code block, look for the end
192                    self.code_block_line_map[i] = true;
193
194                    if line.trim().starts_with(&fence_marker) {
195                        // End of code block
196                        let start_pos =
197                            lines[0..block_start_line].join("\n").len() + if block_start_line > 0 { 1 } else { 0 };
198                        let end_pos = lines[0..=i].join("\n").len();
199
200                        self.code_blocks.push(CodeBlock {
201                            range: Range {
202                                start: start_pos,
203                                end: end_pos,
204                            },
205                            block_type: CodeBlockType::Fenced,
206                            start_line: block_start_line + 1, // 1-indexed
207                            end_line: i + 1,                  // 1-indexed
208                            language: if !block_language.is_empty() {
209                                Some(block_language.clone())
210                            } else {
211                                None
212                            },
213                        });
214
215                        in_fenced_block = false;
216                        fence_marker.clear();
217                        block_language.clear();
218                    }
219                } else if let Some(caps) = CODE_BLOCK_START_REGEX.captures(line) {
220                    // Start of a new code block
221                    fence_marker = caps.get(2).map_or("```", |m| m.as_str()).to_string();
222                    in_fenced_block = true;
223                    block_start_line = i;
224                    block_language = caps.get(3).map_or("", |m| m.as_str().trim()).to_string();
225                    self.code_block_line_map[i] = true;
226                } else if INDENTED_CODE_BLOCK_REGEX.is_match(line) {
227                    // Only mark as indented code block if not a list item
228                    let is_unordered_list = UNORDERED_LIST_REGEX.is_match(line).unwrap_or(false);
229                    let is_ordered_list = ORDERED_LIST_REGEX.is_match(line).unwrap_or(false);
230                    if !is_unordered_list && !is_ordered_list {
231                        // Indented code block
232                        self.code_block_line_map[i] = true;
233                        // For indented code blocks, we handle them as individual lines
234                        // We don't track them as blocks with start/end because they can be
235                        // interrupted by blank lines, etc.
236                        let start_pos = lines[0..i].join("\n").len() + if i > 0 { 1 } else { 0 };
237                        let end_pos = start_pos + line.len();
238                        self.code_blocks.push(CodeBlock {
239                            range: Range {
240                                start: start_pos,
241                                end: end_pos,
242                            },
243                            block_type: CodeBlockType::Indented,
244                            start_line: i + 1, // 1-indexed
245                            end_line: i + 1,   // 1-indexed
246                            language: None,
247                        });
248                    }
249                }
250            }
251
252            // Handle unclosed code block
253            if in_fenced_block {
254                let start_pos = lines[0..block_start_line].join("\n").len() + if block_start_line > 0 { 1 } else { 0 };
255                let end_pos = content.len();
256
257                self.code_blocks.push(CodeBlock {
258                    range: Range {
259                        start: start_pos,
260                        end: end_pos,
261                    },
262                    block_type: CodeBlockType::Fenced,
263                    start_line: block_start_line + 1, // 1-indexed
264                    end_line: lines.len(),            // 1-indexed
265                    language: if !block_language.is_empty() {
266                        Some(block_language)
267                    } else {
268                        None
269                    },
270                });
271            }
272        }
273    }
274
275    /// Detect and populate code spans
276    fn populate_code_spans(&mut self) {
277        if let Some(content) = &self.content {
278            // Find inline code spans using regex for backticks
279            let mut i = 0;
280            while i < content.len() {
281                if let Some(m) = CODE_SPAN_REGEX.find_at(content, i) {
282                    let backtick_length = m.end() - m.start();
283                    let start = m.start();
284
285                    // Find matching closing backticks
286                    if let Some(end_pos) = content[m.end()..].find(&"`".repeat(backtick_length)) {
287                        let end = m.end() + end_pos + backtick_length;
288                        self.code_spans.push(Range { start, end });
289                        i = end;
290                    } else {
291                        i = m.end();
292                    }
293                } else {
294                    break;
295                }
296            }
297        }
298    }
299
300    /// Detect and populate list items
301    fn populate_list_items(&mut self) {
302        if let Some(content) = &self.content {
303            let lines: Vec<&str> = content.lines().collect();
304            let mut prev_items: Vec<(usize, usize, usize)> = Vec::new(); // (blockquote_depth, nesting_level, line_number)
305            for (i, line) in lines.iter().enumerate() {
306                // Skip blank lines but don't reset nesting context
307                if line.trim().is_empty() {
308                    continue;
309                }
310                // Parse and strip blockquote prefix
311                let (blockquote_depth, blockquote_prefix, rest) = Self::parse_blockquote_prefix(line);
312                // Always call parse_list_item and always push if Some
313                if let Some(item) = self.parse_list_item(
314                    rest,
315                    i + 1,
316                    &mut prev_items,
317                    blockquote_depth,
318                    blockquote_prefix.clone(),
319                ) {
320                    self.list_items.push(item);
321                    self.list_line_map[i] = true;
322                }
323            }
324        }
325    }
326
327    /// Parse and strip all leading blockquote markers, returning (depth, prefix, rest_of_line)
328    fn parse_blockquote_prefix(line: &str) -> (usize, String, &str) {
329        let mut rest = line;
330        let mut prefix = String::new();
331        let mut depth = 0;
332        loop {
333            let trimmed = rest.trim_start();
334            if let Some(after) = trimmed.strip_prefix('>') {
335                // Find the '>' and a single optional space
336                let mut chars = after.chars();
337                let mut space_count = 0;
338                if let Some(' ') = chars.next() {
339                    space_count = 1;
340                }
341                let (spaces, after_marker) = after.split_at(space_count);
342                prefix.push('>');
343                prefix.push_str(spaces);
344                rest = after_marker;
345                depth += 1;
346            } else {
347                break;
348            }
349        }
350        (depth, prefix, rest)
351    }
352
353    /// Calculate the nesting level for a list item, considering blockquote depth
354    fn calculate_nesting_level(
355        &self,
356        indent: usize,
357        blockquote_depth: usize,
358        prev_items: &mut Vec<(usize, usize, usize)>,
359    ) -> usize {
360        let mut nesting_level = 0;
361
362        // Only consider previous items with the same blockquote depth
363        if let Some(&(_last_bq, last_indent, last_level)) =
364            prev_items.iter().rev().find(|(bq, _, _)| *bq == blockquote_depth)
365        {
366            use std::cmp::Ordering;
367            match indent.cmp(&last_indent) {
368                Ordering::Greater => {
369                    // More indented - increase nesting level
370                    nesting_level = last_level + 1;
371                }
372                Ordering::Equal => {
373                    // Same indentation - same level
374                    nesting_level = last_level;
375                }
376                Ordering::Less => {
377                    // Less indented - find the appropriate level
378                    let mut found_level = None;
379
380                    // First look for exact match
381                    for &(prev_bq, prev_indent, prev_level) in prev_items.iter().rev() {
382                        if prev_bq == blockquote_depth && prev_indent == indent {
383                            found_level = Some(prev_level);
384                            break;
385                        }
386                    }
387
388                    // If no exact match, check if this is a case where we should treat similar indentations as same level
389                    // This handles mixed tab/space scenarios where 4 and 6 spaces should be at the same level
390                    if found_level.is_none() && indent > 0 && last_indent > 0 {
391                        // Only apply similar indentation logic if the difference is small and we're dealing with small indentations
392                        let diff = (indent as i32 - last_indent as i32).abs();
393                        if diff <= 2 && indent <= 8 && last_indent <= 8 {
394                            // Check if there's a recent item at a lower indentation level
395                            let has_lower_indent = prev_items.iter().rev().take(3).any(|(bq, prev_indent, _)| {
396                                *bq == blockquote_depth && *prev_indent < indent.min(last_indent)
397                            });
398                            if has_lower_indent {
399                                found_level = Some(last_level);
400                            }
401                        }
402                    }
403
404                    // If still no match, look for the most recent less indented item
405                    if found_level.is_none() {
406                        for &(prev_bq, prev_indent, prev_level) in prev_items.iter().rev() {
407                            if prev_bq == blockquote_depth && prev_indent < indent {
408                                found_level = Some(prev_level);
409                                break;
410                            }
411                        }
412                    }
413
414                    nesting_level = found_level.unwrap_or(0);
415                }
416            }
417        }
418
419        // Remove stack entries with indent >= current indent and same blockquote depth
420        while let Some(&(prev_bq, prev_indent, _)) = prev_items.last() {
421            if prev_bq != blockquote_depth || prev_indent < indent {
422                break;
423            }
424            prev_items.pop();
425        }
426        prev_items.push((blockquote_depth, indent, nesting_level));
427        nesting_level
428    }
429
430    /// Parse a line as a list item and determine its nesting level
431    fn parse_list_item(
432        &self,
433        line: &str,
434        line_num: usize,
435        prev_items: &mut Vec<(usize, usize, usize)>,
436        blockquote_depth: usize,
437        blockquote_prefix: String,
438    ) -> Option<ListItem> {
439        match UNORDERED_LIST_REGEX.captures(line) {
440            Ok(Some(captures)) => {
441                let indent_str = captures.name("indent").map_or("", |m| m.as_str()).to_string();
442                let indentation = Self::calculate_indentation_width_default(&indent_str);
443                let marker = captures.name("marker").unwrap().as_str();
444                let after = captures.name("after").map_or("", |m| m.as_str());
445                let spaces = after.len();
446                let raw_content = captures.name("content").map_or("", |m| m.as_str());
447                let content = raw_content.trim_start().to_string();
448                let marker_type = match marker {
449                    "*" => ListMarkerType::Asterisk,
450                    "+" => ListMarkerType::Plus,
451                    "-" => ListMarkerType::Minus,
452                    _ => unreachable!(),
453                };
454                let nesting_level = self.calculate_nesting_level(indentation, blockquote_depth, prev_items);
455                // Find parent: most recent previous item with lower nesting_level and same blockquote depth
456                let parent_line_number = prev_items
457                    .iter()
458                    .rev()
459                    .find(|(bq, _, level)| *bq == blockquote_depth && *level < nesting_level)
460                    .map(|(_, _, line_num)| *line_num);
461                return Some(ListItem {
462                    line_number: line_num,
463                    indentation,
464                    indent_str,
465                    marker_type,
466                    marker: marker.to_string(),
467                    content,
468                    spaces_after_marker: spaces,
469                    nesting_level,
470                    parent_line_number,
471                    blockquote_depth,
472                    blockquote_prefix,
473                });
474            }
475            Ok(None) => {
476                // No debug output
477            }
478            Err(_) => {}
479        }
480        match ORDERED_LIST_REGEX.captures(line) {
481            Ok(Some(captures)) => {
482                let indent_str = captures.name("indent").map_or("", |m| m.as_str()).to_string();
483                let indentation = Self::calculate_indentation_width_default(&indent_str);
484                let marker = captures.name("marker").unwrap().as_str();
485                let spaces = captures.name("after").map_or(0, |m| m.as_str().len());
486                let content = captures
487                    .name("content")
488                    .map_or("", |m| m.as_str())
489                    .trim_start()
490                    .to_string();
491                let nesting_level = self.calculate_nesting_level(indentation, blockquote_depth, prev_items);
492                // Find parent: most recent previous item with lower nesting_level and same blockquote depth
493                let parent_line_number = prev_items
494                    .iter()
495                    .rev()
496                    .find(|(bq, _, level)| *bq == blockquote_depth && *level < nesting_level)
497                    .map(|(_, _, line_num)| *line_num);
498                return Some(ListItem {
499                    line_number: line_num,
500                    indentation,
501                    indent_str,
502                    marker_type: ListMarkerType::Ordered,
503                    marker: marker.to_string(),
504                    content,
505                    spaces_after_marker: spaces,
506                    nesting_level,
507                    parent_line_number,
508                    blockquote_depth,
509                    blockquote_prefix,
510                });
511            }
512            Ok(None) => {}
513            Err(_) => {}
514        }
515        None
516    }
517}
518
// Global cache for sharing across threads
// Holds at most one `ElementCache`: the one most recently stored by `get_element_cache`.
// It is `None` until the first store and again after `reset_element_cache`.
lazy_static! {
    static ref ELEMENT_CACHE: Arc<Mutex<Option<ElementCache>>> = Arc::new(Mutex::new(None));
}
523
524/// Get or create element cache for document content
525pub fn get_element_cache(content: &str) -> ElementCache {
526    // Try to get existing cache
527    {
528        let cache_guard = ELEMENT_CACHE.lock().unwrap();
529
530        // If cache exists and content matches, return it
531        if let Some(existing_cache) = &*cache_guard
532            && let Some(cached_content) = &existing_cache.content
533            && cached_content == content
534        {
535            return existing_cache.clone(); // Keep existing cache
536        }
537    }
538
539    // Content doesn't match, create new cache
540    let new_cache = ElementCache::new(content);
541
542    // Store in global cache
543    {
544        let mut cache_guard = ELEMENT_CACHE.lock().unwrap();
545        *cache_guard = Some(new_cache.clone());
546    }
547
548    new_cache
549}
550
551/// Reset the element cache
552pub fn reset_element_cache() {
553    let mut cache_guard = ELEMENT_CACHE.lock().unwrap();
554    *cache_guard = None;
555}
556
557#[cfg(test)]
558mod tests {
559    use super::*;
560
561    #[test]
562    fn test_code_block_detection() {
563        let content = "Regular text\n\n```rust\nfn main() {\n    println!(\"Hello\");\n}\n```\n\nMore text";
564        let cache = ElementCache::new(content);
565
566        assert_eq!(cache.code_blocks.len(), 1);
567        assert_eq!(cache.code_blocks[0].start_line, 3);
568        assert_eq!(cache.code_blocks[0].end_line, 7);
569        assert_eq!(cache.code_blocks[0].block_type, CodeBlockType::Fenced);
570        assert_eq!(cache.code_blocks[0].language, Some("rust".to_string()));
571
572        assert!(!cache.is_in_code_block(1));
573        assert!(!cache.is_in_code_block(2));
574        assert!(cache.is_in_code_block(3));
575        assert!(cache.is_in_code_block(4));
576        assert!(cache.is_in_code_block(5));
577        assert!(cache.is_in_code_block(6));
578        assert!(cache.is_in_code_block(7));
579        assert!(!cache.is_in_code_block(8));
580        assert!(!cache.is_in_code_block(9));
581    }
582
583    #[test]
584    fn test_list_item_detection_simple() {
585        let content =
586            "# Heading\n\n- First item\n  - Nested item\n- Second item\n\n1. Ordered item\n   1. Nested ordered\n";
587        let cache = ElementCache::new(content);
588        assert_eq!(cache.list_items.len(), 5);
589        // Check the first item
590        assert_eq!(cache.list_items[0].line_number, 3);
591        assert_eq!(cache.list_items[0].marker, "-");
592        assert_eq!(cache.list_items[0].nesting_level, 0);
593        // Check the nested item
594        assert_eq!(cache.list_items[1].line_number, 4);
595        assert_eq!(cache.list_items[1].marker, "-");
596        assert_eq!(cache.list_items[1].nesting_level, 1);
597        // Check the second list item
598        assert_eq!(cache.list_items[2].line_number, 5);
599        assert_eq!(cache.list_items[2].marker, "-");
600        assert_eq!(cache.list_items[2].nesting_level, 0);
601        // Check ordered list item
602        assert_eq!(cache.list_items[3].line_number, 7);
603        assert_eq!(cache.list_items[3].marker, "1.");
604        assert_eq!(cache.list_items[3].nesting_level, 0);
605        // Check nested ordered list item
606        assert_eq!(cache.list_items[4].line_number, 8);
607        assert_eq!(cache.list_items[4].marker, "1.");
608        assert_eq!(cache.list_items[4].nesting_level, 1);
609    }
610
611    #[test]
612    fn test_list_item_detection_complex() {
613        let complex = "  * Level 1 item 1\n    - Level 2 item 1\n      + Level 3 item 1\n    - Level 2 item 2\n  * Level 1 item 2\n\n* Top\n  + Nested\n    - Deep\n      * Deeper\n        + Deepest\n";
614        let cache = ElementCache::new(complex);
615
616        // Should detect all 10 list items
617        assert_eq!(cache.list_items.len(), 10);
618        // Check markers and nesting levels
619        assert_eq!(cache.list_items[0].marker, "*");
620        assert_eq!(cache.list_items[0].nesting_level, 0);
621        assert_eq!(cache.list_items[1].marker, "-");
622        assert_eq!(cache.list_items[1].nesting_level, 1);
623        assert_eq!(cache.list_items[2].marker, "+");
624        assert_eq!(cache.list_items[2].nesting_level, 2);
625        assert_eq!(cache.list_items[3].marker, "-");
626        assert_eq!(cache.list_items[3].nesting_level, 1);
627        assert_eq!(cache.list_items[4].marker, "*");
628        assert_eq!(cache.list_items[4].nesting_level, 0);
629        assert_eq!(cache.list_items[5].marker, "*");
630        assert_eq!(cache.list_items[5].nesting_level, 0);
631        assert_eq!(cache.list_items[6].marker, "+");
632        assert_eq!(cache.list_items[6].nesting_level, 1);
633        assert_eq!(cache.list_items[7].marker, "-");
634        assert_eq!(cache.list_items[7].nesting_level, 2);
635        assert_eq!(cache.list_items[8].marker, "*");
636        assert_eq!(cache.list_items[8].nesting_level, 3);
637        assert_eq!(cache.list_items[9].marker, "+");
638        assert_eq!(cache.list_items[9].nesting_level, 4);
639        let expected_nesting = vec![0, 1, 2, 1, 0, 0, 1, 2, 3, 4];
640        let actual_nesting: Vec<_> = cache.list_items.iter().map(|item| item.nesting_level).collect();
641        assert_eq!(
642            actual_nesting, expected_nesting,
643            "Nesting levels should match expected values"
644        );
645    }
646
647    #[test]
648    fn test_list_item_detection_edge() {
649        let edge = "* Item 1\n\n    - Nested 1\n  + Nested 2\n\n* Item 2\n";
650        let cache = ElementCache::new(edge);
651        assert_eq!(cache.list_items.len(), 4);
652
653        // Check correct nesting levels according to CommonMark:
654        // * Item 1 (indent=0) -> level 0
655        // - Nested 1 (indent=4) -> level 1 (nested under Item 1)
656        // + Nested 2 (indent=2) -> level 1 (nested under Item 1)
657        // * Item 2 (indent=0) -> level 0
658        let expected_nesting = vec![0, 1, 1, 0];
659        let actual_nesting: Vec<_> = cache.list_items.iter().map(|item| item.nesting_level).collect();
660        assert_eq!(
661            actual_nesting, expected_nesting,
662            "Nesting levels should be calculated based on indentation, not reset by blank lines"
663        );
664    }
665
666    #[test]
667    fn test_code_span_detection() {
668        let content = "Here is some `inline code` and here are ``nested `code` spans``";
669        let cache = ElementCache::new(content);
670
671        // Should have two code spans
672        assert_eq!(cache.code_spans.len(), 2);
673
674        // Check spans
675        let span1_content = &content[cache.code_spans[0].start..cache.code_spans[0].end];
676        assert_eq!(span1_content, "`inline code`");
677
678        let span2_content = &content[cache.code_spans[1].start..cache.code_spans[1].end];
679        assert_eq!(span2_content, "``nested `code` spans``");
680    }
681
682    #[test]
683    fn test_get_element_cache() {
684        let content1 = "Test content";
685        let content2 = "Different content";
686
687        // First call should create a new cache
688        let cache1 = get_element_cache(content1);
689
690        // Second call with same content should return the same cache
691        let cache2 = get_element_cache(content1);
692
693        // Third call with different content should create new cache
694        let cache3 = get_element_cache(content2);
695
696        assert_eq!(cache1.content.as_ref().unwrap(), content1);
697        assert_eq!(cache2.content.as_ref().unwrap(), content1);
698        assert_eq!(cache3.content.as_ref().unwrap(), content2);
699    }
700
701    #[test]
702    fn test_list_item_detection_deep_nesting_and_edge_cases() {
703        // Deeply nested unordered lists, mixed markers, excessive indentation, tabs, and blank lines
704        let content = "\
705* Level 1
706  - Level 2
707    + Level 3
708      * Level 4
709        - Level 5
710          + Level 6
711* Sibling 1
712    * Sibling 2
713\n    - After blank line, not nested\n\n\t* Tab indented\n        * 8 spaces indented\n* After excessive indent\n";
714        let cache = ElementCache::new(content);
715        // Should detect all lines that start with a valid unordered list marker
716        let _expected_markers = ["*", "-", "+", "*", "-", "+", "*", "*", "-", "*", "*", "*"];
717        let _expected_indents = [0, 4, 8, 0, 4, 8, 0, 4, 8, 12, 16, 20];
718        let expected_content = vec![
719            "Level 1",
720            "Level 2",
721            "Level 3",
722            "Level 4",
723            "Level 5",
724            "Level 6",
725            "Sibling 1",
726            "Sibling 2",
727            "After blank line, not nested",
728            "Tab indented",      // Content after marker
729            "8 spaces indented", // Content after marker
730            "After excessive indent",
731        ];
732        let actual_content: Vec<_> = cache.list_items.iter().map(|item| item.content.clone()).collect();
733        assert_eq!(
734            actual_content, expected_content,
735            "List item contents should match expected values"
736        );
737        // Updated expected nesting levels based on correct CommonMark behavior:
738        // Blank lines should NOT reset nesting context
739        let expected_nesting = vec![0, 1, 2, 3, 4, 5, 0, 1, 1, 1, 2, 0];
740        let actual_nesting: Vec<_> = cache.list_items.iter().map(|item| item.nesting_level).collect();
741        assert_eq!(
742            actual_nesting, expected_nesting,
743            "Nesting levels should match expected values"
744        );
745        // Check that tab-indented and 8-space-indented items are detected
746        assert!(
747            cache
748                .list_items
749                .iter()
750                .any(|item| item.marker == "*" && item.indentation >= 1),
751            "Tab or 8-space indented item not detected"
752        );
753        // Check that after blank lines, items maintain correct nesting based on indentation
754        let after_blank = cache
755            .list_items
756            .iter()
757            .find(|item| item.content.contains("After blank line"));
758        assert!(after_blank.is_some());
759        assert_eq!(
760            after_blank.unwrap().nesting_level,
761            1,
762            "Item after blank line should maintain nesting based on indentation"
763        );
764    }
765
766    #[test]
767    fn test_tab_indentation_calculation() {
768        // Test that tabs are properly converted to spaces for indentation calculation
769        let content = "* Level 0\n\t* Tab indented (should be level 1)\n\t\t* Double tab (should be level 2)\n    * 4 spaces (should be level 1)\n        * 8 spaces (should be level 2)\n";
770        let cache = ElementCache::new(content);
771
772        assert_eq!(cache.list_items.len(), 5);
773
774        // Check indentation values (tabs should be converted to spaces)
775        assert_eq!(cache.list_items[0].indentation, 0); // "* Level 0"
776        assert_eq!(cache.list_items[1].indentation, 4); // "\t* Tab indented" (tab = 4 spaces)
777        assert_eq!(cache.list_items[2].indentation, 8); // "\t\t* Double tab" (2 tabs = 8 spaces)
778        assert_eq!(cache.list_items[3].indentation, 4); // "    * 4 spaces"
779        assert_eq!(cache.list_items[4].indentation, 8); // "        * 8 spaces"
780
781        // Check nesting levels
782        assert_eq!(cache.list_items[0].nesting_level, 0);
783        assert_eq!(cache.list_items[1].nesting_level, 1);
784        assert_eq!(cache.list_items[2].nesting_level, 2);
785        assert_eq!(cache.list_items[3].nesting_level, 1);
786        assert_eq!(cache.list_items[4].nesting_level, 2);
787    }
788
789    #[test]
790    fn test_mixed_tabs_and_spaces_indentation() {
791        // Test mixed tabs and spaces
792        let content = "* Level 0\n\t  * Tab + 2 spaces (should be level 1)\n  \t* 2 spaces + tab (should be level 1)\n\t\t  * 2 tabs + 2 spaces (should be level 2)\n";
793
794        // Clear any cached data to ensure fresh parsing
795        reset_element_cache();
796        let cache = ElementCache::new(content);
797
798        assert_eq!(cache.list_items.len(), 4);
799
800        // Check indentation values
801        assert_eq!(cache.list_items[0].indentation, 0); // "* Level 0"
802        assert_eq!(cache.list_items[1].indentation, 6); // "\t  * Tab + 2 spaces" (tab to 4 + 2 spaces = 6)
803        assert_eq!(cache.list_items[2].indentation, 4); // "  \t* 2 spaces + tab" (2 spaces, then tab to next stop = 4)
804        assert_eq!(cache.list_items[3].indentation, 10); // "\t\t  * 2 tabs + 2 spaces" (2 tabs = 8 + 2 spaces = 10)
805
806        // Check nesting levels
807        assert_eq!(cache.list_items[0].nesting_level, 0);
808        assert_eq!(cache.list_items[1].nesting_level, 1);
809        assert_eq!(cache.list_items[2].nesting_level, 1);
810        assert_eq!(cache.list_items[3].nesting_level, 2);
811    }
812
813    #[test]
814    fn test_tab_width_configuration() {
815        // Test with different tab widths (default should be 4)
816        let content = "\t* Single tab\n\t\t* Double tab\n";
817        let cache = ElementCache::new(content);
818
819        assert_eq!(cache.list_items.len(), 2);
820
821        // With default tab width of 4
822        assert_eq!(cache.list_items[0].indentation, 4); // "\t*" = 4 spaces
823        assert_eq!(cache.list_items[1].indentation, 8); // "\t\t*" = 8 spaces
824
825        // Check nesting levels
826        assert_eq!(cache.list_items[0].nesting_level, 0);
827        assert_eq!(cache.list_items[1].nesting_level, 1);
828    }
829
830    #[test]
831    fn test_tab_expansion_debug() {
832        // Debug the tab expansion logic
833        assert_eq!(ElementCache::calculate_indentation_width_default(""), 0);
834        assert_eq!(ElementCache::calculate_indentation_width_default(" "), 1);
835        assert_eq!(ElementCache::calculate_indentation_width_default("  "), 2);
836        assert_eq!(ElementCache::calculate_indentation_width_default("    "), 4);
837        assert_eq!(ElementCache::calculate_indentation_width_default("\t"), 4);
838        assert_eq!(ElementCache::calculate_indentation_width_default("\t\t"), 8);
839        assert_eq!(ElementCache::calculate_indentation_width_default("\t  "), 6); // tab to 4, then 2 spaces = 6
840        assert_eq!(ElementCache::calculate_indentation_width_default("  \t"), 4); // 2 spaces, then tab to next stop (4)
841        assert_eq!(ElementCache::calculate_indentation_width_default("\t\t  "), 10);
842        // 2 tabs = 8, then 2 spaces = 10
843    }
844
845    #[test]
846    fn test_mixed_tabs_debug() {
847        // Debug the specific failing case
848        let content = "* Level 0\n\t  * Tab + 2 spaces (should be level 1)\n  \t* 2 spaces + tab (should be level 1)\n\t\t  * 2 tabs + 2 spaces (should be level 2)\n";
849        let cache = ElementCache::new(content);
850
851        println!("Number of list items: {}", cache.list_items.len());
852        for (i, item) in cache.list_items.iter().enumerate() {
853            println!(
854                "Item {}: indent_str={:?}, indentation={}, content={:?}",
855                i, item.indent_str, item.indentation, item.content
856            );
857        }
858
859        // Test the specific indentation strings
860        assert_eq!(ElementCache::calculate_indentation_width_default("\t  "), 6); // tab + 2 spaces
861        assert_eq!(ElementCache::calculate_indentation_width_default("  \t"), 4); // 2 spaces + tab
862        assert_eq!(ElementCache::calculate_indentation_width_default("\t\t  "), 10);
863        // 2 tabs + 2 spaces
864    }
865}
rumdl_lib/utils/element_cache.rs

rumdl_lib/utils/
element_cache.rs