// rumdl_lib/utils/element_cache.rs

1use fancy_regex::Regex as FancyRegex;
2use regex::Regex;
3use std::sync::LazyLock;
4use std::sync::{Arc, Mutex};
5
// Code block detection patterns, compiled once on first use.

// A fence opener: optional leading whitespace (group 1), a three-character
// "```" or "~~~" marker (group 2), then the remainder of the line (group 3).
static CODE_BLOCK_START_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(```|~~~)(.*)$").unwrap());
// A line that starts with at least four whitespace characters and has at least
// one more character after them.
static INDENTED_CODE_BLOCK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s{4,})(.+)$").unwrap());

// List detection patterns.
// Both expose the named groups `indent`, `marker`, `after` and `content`.
// The whitespace after the marker is optional (`[ \t]*`), so a line such as
// `-text` also matches.

// Unordered items: marker is one of `*`, `+` or `-`.
static UNORDERED_LIST_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| {
    FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>[*+-])(?P<after>[ \t]*)(?P<content>.*)$").unwrap()
});
// Ordered items: marker is one or more digits followed by `.`.
static ORDERED_LIST_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| {
    FancyRegex::new(r"^(?P<indent>[ \t]*)(?P<marker>\d+\.)(?P<after>[ \t]*)(?P<content>.*)$").unwrap()
});

// Inline code span delimiter: a run of one or more backticks.
static CODE_SPAN_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"`+").unwrap());
20
/// Represents a range in the document with start and end positions.
///
/// Within this file the positions are offsets into the document string;
/// `is_in_code_span` treats `start` as inclusive and `end` as exclusive.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Range {
    /// Offset of the first position covered by the range.
    pub start: usize,
    /// Offset just past the last position covered by the range.
    pub end: usize,
}
27
/// Represents the type of code block
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CodeBlockType {
    /// A block delimited by "```" or "~~~" fence lines.
    Fenced,
    /// A line recognised by its leading indentation (at least four whitespace characters).
    Indented,
}
34
/// Represents a code block in the document
#[derive(Debug, Clone)]
pub struct CodeBlock {
    /// Span of the block in the document, from the start of its first line
    /// to the end of its last line.
    pub range: Range,
    /// Whether the block is fenced or indented.
    pub block_type: CodeBlockType,
    /// First line of the block (1-indexed); for fenced blocks this is the opening fence line.
    pub start_line: usize,
    /// Last line of the block (1-indexed); for fenced blocks this is the closing
    /// fence line, or the last line of the document when the fence is never closed.
    pub end_line: usize,
    /// Trimmed text following the opening fence, or `None` when there is none
    /// (always `None` for indented blocks).
    pub language: Option<String>,
}
44
/// Represents the type of list marker
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ListMarkerType {
    /// Unordered item introduced by `*`.
    Asterisk,
    /// Unordered item introduced by `+`.
    Plus,
    /// Unordered item introduced by `-`.
    Minus,
    /// Ordered item introduced by digits followed by `.` (for example `1.`).
    Ordered,
}
53
/// Represents a list item in the document
#[derive(Debug, Clone)]
pub struct ListItem {
    /// Line on which the item starts (1-indexed).
    pub line_number: usize,
    /// Visual width of the leading whitespace, with tabs expanded to 4-column tab stops.
    pub indentation: usize,
    /// Actual leading whitespace (after any blockquote prefix has been removed).
    pub indent_str: String,
    /// Kind of marker that introduced the item.
    pub marker_type: ListMarkerType,
    /// Marker text as written, e.g. `-` or `1.`.
    pub marker: String,
    /// Text after the marker, with leading whitespace removed.
    pub content: String,
    /// Length of the run of spaces/tabs between the marker and the content.
    pub spaces_after_marker: usize,
    /// Nesting depth of the item; top-level items are at level 0.
    pub nesting_level: usize,
    /// Intended to identify the line of the enclosing (parent) list item, if any.
    /// NOTE(review): confirm against `ElementCache::parse_list_item`, which fills this in.
    pub parent_line_number: Option<usize>,
    /// Number of leading blockquote markers (`>`) in front of the item.
    pub blockquote_depth: usize,
    /// The blockquote markers that were stripped from the line (e.g., "> > ").
    pub blockquote_prefix: String,
}
69
/// Cache for Markdown document structural elements
/// This allows sharing computed data across multiple rule checks
///
/// All data is computed eagerly by [`ElementCache::new`]; a value produced by
/// `Default` holds no content and empty collections.
#[derive(Debug, Default, Clone)]
pub struct ElementCache {
    // Document content and metadata
    /// Owned copy of the document text the cache was built from.
    content: Option<String>,
    /// Number of lines in `content`, as counted by `str::lines`.
    line_count: usize,

    // Code blocks
    /// Every detected code block, in document order.
    code_blocks: Vec<CodeBlock>,
    /// Per-line flag: index `i` (0-indexed line) is `true` when that line is in a code block.
    code_block_line_map: Vec<bool>,

    // Code spans (inline code)
    /// Ranges of inline code spans, delimiters included.
    code_spans: Vec<Range>,

    // Lists
    /// Every detected list item, in document order.
    list_items: Vec<ListItem>,
    /// Per-line flag: index `i` (0-indexed line) is `true` when that line starts a list item.
    list_line_map: Vec<bool>,
}
89
90impl ElementCache {
91    /// Create a new cache from document content
92    pub fn new(content: &str) -> Self {
93        let mut cache = ElementCache {
94            content: Some(content.to_string()),
95            line_count: content.lines().count(),
96            code_blocks: Vec::new(),
97            code_block_line_map: Vec::new(),
98            code_spans: Vec::new(),
99            list_items: Vec::new(),
100            list_line_map: Vec::new(),
101        };
102
103        // Initialize maps
104        cache.code_block_line_map = vec![false; cache.line_count];
105        cache.list_line_map = vec![false; cache.line_count];
106
107        // Populate the cache
108        cache.populate_code_blocks();
109        cache.populate_code_spans();
110        cache.populate_list_items();
111
112        cache
113    }
114
115    /// Calculate the visual indentation width of a string, expanding tabs to spaces
116    /// Default tab width is 4 spaces
117    fn calculate_indentation_width(indent_str: &str, tab_width: usize) -> usize {
118        let mut width = 0;
119        for ch in indent_str.chars() {
120            if ch == '\t' {
121                // Round up to next tab stop
122                width = ((width / tab_width) + 1) * tab_width;
123            } else if ch == ' ' {
124                width += 1;
125            } else {
126                // Non-whitespace character, stop counting
127                break;
128            }
129        }
130        width
131    }
132
133    /// Calculate the visual indentation width using default tab width of 4
134    fn calculate_indentation_width_default(indent_str: &str) -> usize {
135        Self::calculate_indentation_width(indent_str, 4)
136    }
137
138    /// Check if a line is within a code block
139    pub fn is_in_code_block(&self, line_num: usize) -> bool {
140        if line_num == 0 || line_num > self.code_block_line_map.len() {
141            return false;
142        }
143        self.code_block_line_map[line_num - 1] // Convert 1-indexed to 0-indexed
144    }
145
146    /// Check if a position is within a code span
147    pub fn is_in_code_span(&self, position: usize) -> bool {
148        self.code_spans
149            .iter()
150            .any(|span| position >= span.start && position < span.end)
151    }
152
153    /// Check if a line is a list item
154    pub fn is_list_item(&self, line_num: usize) -> bool {
155        if line_num == 0 || line_num > self.list_line_map.len() {
156            return false;
157        }
158        self.list_line_map[line_num - 1] // Convert 1-indexed to 0-indexed
159    }
160
161    /// Get list item at line
162    pub fn get_list_item(&self, line_num: usize) -> Option<&ListItem> {
163        self.list_items.iter().find(|item| item.line_number == line_num)
164    }
165
166    /// Get all list items
167    pub fn get_list_items(&self) -> &[ListItem] {
168        &self.list_items
169    }
170
171    /// Get all code blocks
172    pub fn get_code_blocks(&self) -> &[CodeBlock] {
173        &self.code_blocks
174    }
175
176    /// Get all code spans
177    pub fn get_code_spans(&self) -> &[Range] {
178        &self.code_spans
179    }
180
181    /// Detect and populate code blocks
182    fn populate_code_blocks(&mut self) {
183        if let Some(content) = &self.content {
184            let lines: Vec<&str> = content.lines().collect();
185            let mut in_fenced_block = false;
186            let mut fence_marker = String::new();
187            let mut block_start_line = 0;
188            let mut block_language = String::new();
189
190            for (i, line) in lines.iter().enumerate() {
191                if in_fenced_block {
192                    // Already in a fenced code block, look for the end
193                    self.code_block_line_map[i] = true;
194
195                    if line.trim().starts_with(&fence_marker) {
196                        // End of code block
197                        let start_pos =
198                            lines[0..block_start_line].join("\n").len() + if block_start_line > 0 { 1 } else { 0 };
199                        let end_pos = lines[0..=i].join("\n").len();
200
201                        self.code_blocks.push(CodeBlock {
202                            range: Range {
203                                start: start_pos,
204                                end: end_pos,
205                            },
206                            block_type: CodeBlockType::Fenced,
207                            start_line: block_start_line + 1, // 1-indexed
208                            end_line: i + 1,                  // 1-indexed
209                            language: if !block_language.is_empty() {
210                                Some(block_language.clone())
211                            } else {
212                                None
213                            },
214                        });
215
216                        in_fenced_block = false;
217                        fence_marker.clear();
218                        block_language.clear();
219                    }
220                } else if let Some(caps) = CODE_BLOCK_START_REGEX.captures(line) {
221                    // Start of a new code block
222                    fence_marker = caps.get(2).map_or("```", |m| m.as_str()).to_string();
223                    in_fenced_block = true;
224                    block_start_line = i;
225                    block_language = caps.get(3).map_or("", |m| m.as_str().trim()).to_string();
226                    self.code_block_line_map[i] = true;
227                } else if INDENTED_CODE_BLOCK_REGEX.is_match(line) {
228                    // Only mark as indented code block if not a list item
229                    let is_unordered_list = UNORDERED_LIST_REGEX.is_match(line).unwrap_or(false);
230                    let is_ordered_list = ORDERED_LIST_REGEX.is_match(line).unwrap_or(false);
231                    if !is_unordered_list && !is_ordered_list {
232                        // Indented code block
233                        self.code_block_line_map[i] = true;
234                        // For indented code blocks, we handle them as individual lines
235                        // We don't track them as blocks with start/end because they can be
236                        // interrupted by blank lines, etc.
237                        let start_pos = lines[0..i].join("\n").len() + if i > 0 { 1 } else { 0 };
238                        let end_pos = start_pos + line.len();
239                        self.code_blocks.push(CodeBlock {
240                            range: Range {
241                                start: start_pos,
242                                end: end_pos,
243                            },
244                            block_type: CodeBlockType::Indented,
245                            start_line: i + 1, // 1-indexed
246                            end_line: i + 1,   // 1-indexed
247                            language: None,
248                        });
249                    }
250                }
251            }
252
253            // Handle unclosed code block
254            if in_fenced_block {
255                let start_pos = lines[0..block_start_line].join("\n").len() + if block_start_line > 0 { 1 } else { 0 };
256                let end_pos = content.len();
257
258                self.code_blocks.push(CodeBlock {
259                    range: Range {
260                        start: start_pos,
261                        end: end_pos,
262                    },
263                    block_type: CodeBlockType::Fenced,
264                    start_line: block_start_line + 1, // 1-indexed
265                    end_line: lines.len(),            // 1-indexed
266                    language: if !block_language.is_empty() {
267                        Some(block_language)
268                    } else {
269                        None
270                    },
271                });
272            }
273        }
274    }
275
276    /// Detect and populate code spans
277    fn populate_code_spans(&mut self) {
278        if let Some(content) = &self.content {
279            // Find inline code spans using regex for backticks
280            let mut i = 0;
281            while i < content.len() {
282                if let Some(m) = CODE_SPAN_REGEX.find_at(content, i) {
283                    let backtick_length = m.end() - m.start();
284                    let start = m.start();
285
286                    // Find matching closing backticks
287                    if let Some(end_pos) = content[m.end()..].find(&"`".repeat(backtick_length)) {
288                        let end = m.end() + end_pos + backtick_length;
289                        self.code_spans.push(Range { start, end });
290                        i = end;
291                    } else {
292                        i = m.end();
293                    }
294                } else {
295                    break;
296                }
297            }
298        }
299    }
300
301    /// Detect and populate list items
302    fn populate_list_items(&mut self) {
303        if let Some(content) = &self.content {
304            let lines: Vec<&str> = content.lines().collect();
305            let mut prev_items: Vec<(usize, usize, usize)> = Vec::new(); // (blockquote_depth, nesting_level, line_number)
306            for (i, line) in lines.iter().enumerate() {
307                // Skip blank lines but don't reset nesting context
308                if line.trim().is_empty() {
309                    continue;
310                }
311                // Parse and strip blockquote prefix
312                let (blockquote_depth, blockquote_prefix, rest) = Self::parse_blockquote_prefix(line);
313                // Always call parse_list_item and always push if Some
314                if let Some(item) = self.parse_list_item(
315                    rest,
316                    i + 1,
317                    &mut prev_items,
318                    blockquote_depth,
319                    blockquote_prefix.clone(),
320                ) {
321                    self.list_items.push(item);
322                    self.list_line_map[i] = true;
323                }
324            }
325        }
326    }
327
328    /// Parse and strip all leading blockquote markers, returning (depth, prefix, rest_of_line)
329    fn parse_blockquote_prefix(line: &str) -> (usize, String, &str) {
330        let mut rest = line;
331        let mut prefix = String::new();
332        let mut depth = 0;
333        loop {
334            let trimmed = rest.trim_start();
335            if let Some(after) = trimmed.strip_prefix('>') {
336                // Find the '>' and a single optional space
337                let mut chars = after.chars();
338                let mut space_count = 0;
339                if let Some(' ') = chars.next() {
340                    space_count = 1;
341                }
342                let (spaces, after_marker) = after.split_at(space_count);
343                prefix.push('>');
344                prefix.push_str(spaces);
345                rest = after_marker;
346                depth += 1;
347            } else {
348                break;
349            }
350        }
351        (depth, prefix, rest)
352    }
353
    /// Calculate the nesting level for a list item, considering blockquote depth
    ///
    /// `prev_items` is a stack of `(blockquote_depth, indentation, nesting_level)`
    /// entries describing earlier list items. The new item is compared against
    /// the most recent entry with the same blockquote depth:
    /// - more indented: one level deeper than that entry;
    /// - equally indented: the same level;
    /// - less indented: the level of an earlier entry, chosen by the fallbacks below.
    ///
    /// With no earlier entry at this blockquote depth the level is 0.
    ///
    /// Before returning, trailing entries at the same blockquote depth whose
    /// indentation is not smaller than `indent` are popped and an entry for the
    /// new item is pushed.
    fn calculate_nesting_level(
        &self,
        indent: usize,
        blockquote_depth: usize,
        prev_items: &mut Vec<(usize, usize, usize)>,
    ) -> usize {
        let mut nesting_level = 0;

        // Only consider previous items with the same blockquote depth
        if let Some(&(_last_bq, last_indent, last_level)) =
            prev_items.iter().rev().find(|(bq, _, _)| *bq == blockquote_depth)
        {
            use std::cmp::Ordering;
            match indent.cmp(&last_indent) {
                Ordering::Greater => {
                    // More indented - increase nesting level
                    nesting_level = last_level + 1;
                }
                Ordering::Equal => {
                    // Same indentation - same level
                    nesting_level = last_level;
                }
                Ordering::Less => {
                    // Less indented - find the appropriate level
                    let mut found_level = None;

                    // First look for exact match: the most recent entry with the
                    // same blockquote depth and exactly this indentation
                    for &(prev_bq, prev_indent, prev_level) in prev_items.iter().rev() {
                        if prev_bq == blockquote_depth && prev_indent == indent {
                            found_level = Some(prev_level);
                            break;
                        }
                    }

                    // If no exact match, check if this is a case where we should treat similar indentations as same level
                    // This handles mixed tab/space scenarios where 4 and 6 spaces should be at the same level
                    if found_level.is_none() && indent > 0 && last_indent > 0 {
                        // Only apply similar indentation logic if the difference is small and we're dealing with small indentations
                        let diff = (indent as i32 - last_indent as i32).abs();
                        if diff <= 2 && indent <= 8 && last_indent <= 8 {
                            // Check if there's a recent item at a lower indentation level
                            // (only the three most recent stack entries are inspected)
                            let has_lower_indent = prev_items.iter().rev().take(3).any(|(bq, prev_indent, _)| {
                                *bq == blockquote_depth && *prev_indent < indent.min(last_indent)
                            });
                            if has_lower_indent {
                                found_level = Some(last_level);
                            }
                        }
                    }

                    // If still no match, look for the most recent less indented item
                    // and reuse its level
                    if found_level.is_none() {
                        for &(prev_bq, prev_indent, prev_level) in prev_items.iter().rev() {
                            if prev_bq == blockquote_depth && prev_indent < indent {
                                found_level = Some(prev_level);
                                break;
                            }
                        }
                    }

                    // Nothing suitable found: treat the item as top-level
                    nesting_level = found_level.unwrap_or(0);
                }
            }
        }

        // Remove stack entries with indent >= current indent and same blockquote depth
        // (popping stops at the first entry from a different blockquote depth)
        while let Some(&(prev_bq, prev_indent, _)) = prev_items.last() {
            if prev_bq != blockquote_depth || prev_indent < indent {
                break;
            }
            prev_items.pop();
        }
        prev_items.push((blockquote_depth, indent, nesting_level));
        nesting_level
    }
430
431    /// Parse a line as a list item and determine its nesting level
432    fn parse_list_item(
433        &self,
434        line: &str,
435        line_num: usize,
436        prev_items: &mut Vec<(usize, usize, usize)>,
437        blockquote_depth: usize,
438        blockquote_prefix: String,
439    ) -> Option<ListItem> {
440        match UNORDERED_LIST_REGEX.captures(line) {
441            Ok(Some(captures)) => {
442                let indent_str = captures.name("indent").map_or("", |m| m.as_str()).to_string();
443                let indentation = Self::calculate_indentation_width_default(&indent_str);
444                let marker = captures.name("marker").unwrap().as_str();
445                let after = captures.name("after").map_or("", |m| m.as_str());
446                let spaces = after.len();
447                let raw_content = captures.name("content").map_or("", |m| m.as_str());
448                let content = raw_content.trim_start().to_string();
449                let marker_type = match marker {
450                    "*" => ListMarkerType::Asterisk,
451                    "+" => ListMarkerType::Plus,
452                    "-" => ListMarkerType::Minus,
453                    other => {
454                        // This should never happen due to regex validation,
455                        // but default to dash if it does
456                        eprintln!("Warning: Unexpected list marker '{other}', defaulting to dash");
457                        ListMarkerType::Minus
458                    }
459                };
460                let nesting_level = self.calculate_nesting_level(indentation, blockquote_depth, prev_items);
461                // Find parent: most recent previous item with lower nesting_level and same blockquote depth
462                let parent_line_number = prev_items
463                    .iter()
464                    .rev()
465                    .find(|(bq, _, level)| *bq == blockquote_depth && *level < nesting_level)
466                    .map(|(_, _, line_num)| *line_num);
467                return Some(ListItem {
468                    line_number: line_num,
469                    indentation,
470                    indent_str,
471                    marker_type,
472                    marker: marker.to_string(),
473                    content,
474                    spaces_after_marker: spaces,
475                    nesting_level,
476                    parent_line_number,
477                    blockquote_depth,
478                    blockquote_prefix,
479                });
480            }
481            Ok(None) => {
482                // No debug output
483            }
484            Err(_) => {}
485        }
486        match ORDERED_LIST_REGEX.captures(line) {
487            Ok(Some(captures)) => {
488                let indent_str = captures.name("indent").map_or("", |m| m.as_str()).to_string();
489                let indentation = Self::calculate_indentation_width_default(&indent_str);
490                let marker = captures.name("marker").unwrap().as_str();
491                let spaces = captures.name("after").map_or(0, |m| m.as_str().len());
492                let content = captures
493                    .name("content")
494                    .map_or("", |m| m.as_str())
495                    .trim_start()
496                    .to_string();
497                let nesting_level = self.calculate_nesting_level(indentation, blockquote_depth, prev_items);
498                // Find parent: most recent previous item with lower nesting_level and same blockquote depth
499                let parent_line_number = prev_items
500                    .iter()
501                    .rev()
502                    .find(|(bq, _, level)| *bq == blockquote_depth && *level < nesting_level)
503                    .map(|(_, _, line_num)| *line_num);
504                return Some(ListItem {
505                    line_number: line_num,
506                    indentation,
507                    indent_str,
508                    marker_type: ListMarkerType::Ordered,
509                    marker: marker.to_string(),
510                    content,
511                    spaces_after_marker: spaces,
512                    nesting_level,
513                    parent_line_number,
514                    blockquote_depth,
515                    blockquote_prefix,
516                });
517            }
518            Ok(None) => {}
519            Err(_) => {}
520        }
521        None
522    }
523}
524
// Global cache for sharing across threads.
// Holds at most one `ElementCache` (the one most recently stored by
// `get_element_cache`), or `None` before first use and after
// `reset_element_cache`.
static ELEMENT_CACHE: LazyLock<Arc<Mutex<Option<ElementCache>>>> = LazyLock::new(|| Arc::new(Mutex::new(None)));
527
528/// Get or create element cache for document content
529pub fn get_element_cache(content: &str) -> ElementCache {
530    // Try to get existing cache
531    {
532        let cache_guard = ELEMENT_CACHE.lock().expect("Element cache mutex poisoned");
533
534        // If cache exists and content matches, return it
535        if let Some(existing_cache) = &*cache_guard
536            && let Some(cached_content) = &existing_cache.content
537            && cached_content == content
538        {
539            return existing_cache.clone(); // Keep existing cache
540        }
541    }
542
543    // Content doesn't match, create new cache
544    let new_cache = ElementCache::new(content);
545
546    // Store in global cache
547    {
548        let mut cache_guard = ELEMENT_CACHE.lock().expect("Element cache mutex poisoned");
549        *cache_guard = Some(new_cache.clone());
550    }
551
552    new_cache
553}
554
555/// Reset the element cache
556pub fn reset_element_cache() {
557    let mut cache_guard = ELEMENT_CACHE.lock().expect("Element cache mutex poisoned");
558    *cache_guard = None;
559}
560
561#[cfg(test)]
562mod tests {
563    use super::*;
564
565    #[test]
566    fn test_code_block_detection() {
567        let content = "Regular text\n\n```rust\nfn main() {\n    println!(\"Hello\");\n}\n```\n\nMore text";
568        let cache = ElementCache::new(content);
569
570        assert_eq!(cache.code_blocks.len(), 1);
571        assert_eq!(cache.code_blocks[0].start_line, 3);
572        assert_eq!(cache.code_blocks[0].end_line, 7);
573        assert_eq!(cache.code_blocks[0].block_type, CodeBlockType::Fenced);
574        assert_eq!(cache.code_blocks[0].language, Some("rust".to_string()));
575
576        assert!(!cache.is_in_code_block(1));
577        assert!(!cache.is_in_code_block(2));
578        assert!(cache.is_in_code_block(3));
579        assert!(cache.is_in_code_block(4));
580        assert!(cache.is_in_code_block(5));
581        assert!(cache.is_in_code_block(6));
582        assert!(cache.is_in_code_block(7));
583        assert!(!cache.is_in_code_block(8));
584        assert!(!cache.is_in_code_block(9));
585    }
586
587    #[test]
588    fn test_list_item_detection_simple() {
589        let content =
590            "# Heading\n\n- First item\n  - Nested item\n- Second item\n\n1. Ordered item\n   1. Nested ordered\n";
591        let cache = ElementCache::new(content);
592        assert_eq!(cache.list_items.len(), 5);
593        // Check the first item
594        assert_eq!(cache.list_items[0].line_number, 3);
595        assert_eq!(cache.list_items[0].marker, "-");
596        assert_eq!(cache.list_items[0].nesting_level, 0);
597        // Check the nested item
598        assert_eq!(cache.list_items[1].line_number, 4);
599        assert_eq!(cache.list_items[1].marker, "-");
600        assert_eq!(cache.list_items[1].nesting_level, 1);
601        // Check the second list item
602        assert_eq!(cache.list_items[2].line_number, 5);
603        assert_eq!(cache.list_items[2].marker, "-");
604        assert_eq!(cache.list_items[2].nesting_level, 0);
605        // Check ordered list item
606        assert_eq!(cache.list_items[3].line_number, 7);
607        assert_eq!(cache.list_items[3].marker, "1.");
608        assert_eq!(cache.list_items[3].nesting_level, 0);
609        // Check nested ordered list item
610        assert_eq!(cache.list_items[4].line_number, 8);
611        assert_eq!(cache.list_items[4].marker, "1.");
612        assert_eq!(cache.list_items[4].nesting_level, 1);
613    }
614
615    #[test]
616    fn test_list_item_detection_complex() {
617        let complex = "  * Level 1 item 1\n    - Level 2 item 1\n      + Level 3 item 1\n    - Level 2 item 2\n  * Level 1 item 2\n\n* Top\n  + Nested\n    - Deep\n      * Deeper\n        + Deepest\n";
618        let cache = ElementCache::new(complex);
619
620        // Should detect all 10 list items
621        assert_eq!(cache.list_items.len(), 10);
622        // Check markers and nesting levels
623        assert_eq!(cache.list_items[0].marker, "*");
624        assert_eq!(cache.list_items[0].nesting_level, 0);
625        assert_eq!(cache.list_items[1].marker, "-");
626        assert_eq!(cache.list_items[1].nesting_level, 1);
627        assert_eq!(cache.list_items[2].marker, "+");
628        assert_eq!(cache.list_items[2].nesting_level, 2);
629        assert_eq!(cache.list_items[3].marker, "-");
630        assert_eq!(cache.list_items[3].nesting_level, 1);
631        assert_eq!(cache.list_items[4].marker, "*");
632        assert_eq!(cache.list_items[4].nesting_level, 0);
633        assert_eq!(cache.list_items[5].marker, "*");
634        assert_eq!(cache.list_items[5].nesting_level, 0);
635        assert_eq!(cache.list_items[6].marker, "+");
636        assert_eq!(cache.list_items[6].nesting_level, 1);
637        assert_eq!(cache.list_items[7].marker, "-");
638        assert_eq!(cache.list_items[7].nesting_level, 2);
639        assert_eq!(cache.list_items[8].marker, "*");
640        assert_eq!(cache.list_items[8].nesting_level, 3);
641        assert_eq!(cache.list_items[9].marker, "+");
642        assert_eq!(cache.list_items[9].nesting_level, 4);
643        let expected_nesting = vec![0, 1, 2, 1, 0, 0, 1, 2, 3, 4];
644        let actual_nesting: Vec<_> = cache.list_items.iter().map(|item| item.nesting_level).collect();
645        assert_eq!(
646            actual_nesting, expected_nesting,
647            "Nesting levels should match expected values"
648        );
649    }
650
651    #[test]
652    fn test_list_item_detection_edge() {
653        let edge = "* Item 1\n\n    - Nested 1\n  + Nested 2\n\n* Item 2\n";
654        let cache = ElementCache::new(edge);
655        assert_eq!(cache.list_items.len(), 4);
656
657        // Check correct nesting levels according to CommonMark:
658        // * Item 1 (indent=0) -> level 0
659        // - Nested 1 (indent=4) -> level 1 (nested under Item 1)
660        // + Nested 2 (indent=2) -> level 1 (nested under Item 1)
661        // * Item 2 (indent=0) -> level 0
662        let expected_nesting = vec![0, 1, 1, 0];
663        let actual_nesting: Vec<_> = cache.list_items.iter().map(|item| item.nesting_level).collect();
664        assert_eq!(
665            actual_nesting, expected_nesting,
666            "Nesting levels should be calculated based on indentation, not reset by blank lines"
667        );
668    }
669
670    #[test]
671    fn test_code_span_detection() {
672        let content = "Here is some `inline code` and here are ``nested `code` spans``";
673        let cache = ElementCache::new(content);
674
675        // Should have two code spans
676        assert_eq!(cache.code_spans.len(), 2);
677
678        // Check spans
679        let span1_content = &content[cache.code_spans[0].start..cache.code_spans[0].end];
680        assert_eq!(span1_content, "`inline code`");
681
682        let span2_content = &content[cache.code_spans[1].start..cache.code_spans[1].end];
683        assert_eq!(span2_content, "``nested `code` spans``");
684    }
685
686    #[test]
687    fn test_get_element_cache() {
688        let content1 = "Test content";
689        let content2 = "Different content";
690
691        // First call should create a new cache
692        let cache1 = get_element_cache(content1);
693
694        // Second call with same content should return the same cache
695        let cache2 = get_element_cache(content1);
696
697        // Third call with different content should create new cache
698        let cache3 = get_element_cache(content2);
699
700        assert_eq!(cache1.content.as_ref().unwrap(), content1);
701        assert_eq!(cache2.content.as_ref().unwrap(), content1);
702        assert_eq!(cache3.content.as_ref().unwrap(), content2);
703    }
704
705    #[test]
706    fn test_list_item_detection_deep_nesting_and_edge_cases() {
707        // Deeply nested unordered lists, mixed markers, excessive indentation, tabs, and blank lines
708        let content = "\
709* Level 1
710  - Level 2
711    + Level 3
712      * Level 4
713        - Level 5
714          + Level 6
715* Sibling 1
716    * Sibling 2
717\n    - After blank line, not nested\n\n\t* Tab indented\n        * 8 spaces indented\n* After excessive indent\n";
718        let cache = ElementCache::new(content);
719        // Should detect all lines that start with a valid unordered list marker
720        let _expected_markers = ["*", "-", "+", "*", "-", "+", "*", "*", "-", "*", "*", "*"];
721        let _expected_indents = [0, 4, 8, 0, 4, 8, 0, 4, 8, 12, 16, 20];
722        let expected_content = vec![
723            "Level 1",
724            "Level 2",
725            "Level 3",
726            "Level 4",
727            "Level 5",
728            "Level 6",
729            "Sibling 1",
730            "Sibling 2",
731            "After blank line, not nested",
732            "Tab indented",      // Content after marker
733            "8 spaces indented", // Content after marker
734            "After excessive indent",
735        ];
736        let actual_content: Vec<_> = cache.list_items.iter().map(|item| item.content.clone()).collect();
737        assert_eq!(
738            actual_content, expected_content,
739            "List item contents should match expected values"
740        );
741        // Updated expected nesting levels based on correct CommonMark behavior:
742        // Blank lines should NOT reset nesting context
743        let expected_nesting = vec![0, 1, 2, 3, 4, 5, 0, 1, 1, 1, 2, 0];
744        let actual_nesting: Vec<_> = cache.list_items.iter().map(|item| item.nesting_level).collect();
745        assert_eq!(
746            actual_nesting, expected_nesting,
747            "Nesting levels should match expected values"
748        );
749        // Check that tab-indented and 8-space-indented items are detected
750        assert!(
751            cache
752                .list_items
753                .iter()
754                .any(|item| item.marker == "*" && item.indentation >= 1),
755            "Tab or 8-space indented item not detected"
756        );
757        // Check that after blank lines, items maintain correct nesting based on indentation
758        let after_blank = cache
759            .list_items
760            .iter()
761            .find(|item| item.content.contains("After blank line"));
762        assert!(after_blank.is_some());
763        assert_eq!(
764            after_blank.unwrap().nesting_level,
765            1,
766            "Item after blank line should maintain nesting based on indentation"
767        );
768    }
769
770    #[test]
771    fn test_tab_indentation_calculation() {
772        // Test that tabs are properly converted to spaces for indentation calculation
773        let content = "* Level 0\n\t* Tab indented (should be level 1)\n\t\t* Double tab (should be level 2)\n    * 4 spaces (should be level 1)\n        * 8 spaces (should be level 2)\n";
774        let cache = ElementCache::new(content);
775
776        assert_eq!(cache.list_items.len(), 5);
777
778        // Check indentation values (tabs should be converted to spaces)
779        assert_eq!(cache.list_items[0].indentation, 0); // "* Level 0"
780        assert_eq!(cache.list_items[1].indentation, 4); // "\t* Tab indented" (tab = 4 spaces)
781        assert_eq!(cache.list_items[2].indentation, 8); // "\t\t* Double tab" (2 tabs = 8 spaces)
782        assert_eq!(cache.list_items[3].indentation, 4); // "    * 4 spaces"
783        assert_eq!(cache.list_items[4].indentation, 8); // "        * 8 spaces"
784
785        // Check nesting levels
786        assert_eq!(cache.list_items[0].nesting_level, 0);
787        assert_eq!(cache.list_items[1].nesting_level, 1);
788        assert_eq!(cache.list_items[2].nesting_level, 2);
789        assert_eq!(cache.list_items[3].nesting_level, 1);
790        assert_eq!(cache.list_items[4].nesting_level, 2);
791    }
792
793    #[test]
794    fn test_mixed_tabs_and_spaces_indentation() {
795        // Test mixed tabs and spaces
796        let content = "* Level 0\n\t  * Tab + 2 spaces (should be level 1)\n  \t* 2 spaces + tab (should be level 1)\n\t\t  * 2 tabs + 2 spaces (should be level 2)\n";
797
798        // Clear any cached data to ensure fresh parsing
799        reset_element_cache();
800        let cache = ElementCache::new(content);
801
802        assert_eq!(cache.list_items.len(), 4);
803
804        // Check indentation values
805        assert_eq!(cache.list_items[0].indentation, 0); // "* Level 0"
806        assert_eq!(cache.list_items[1].indentation, 6); // "\t  * Tab + 2 spaces" (tab to 4 + 2 spaces = 6)
807        assert_eq!(cache.list_items[2].indentation, 4); // "  \t* 2 spaces + tab" (2 spaces, then tab to next stop = 4)
808        assert_eq!(cache.list_items[3].indentation, 10); // "\t\t  * 2 tabs + 2 spaces" (2 tabs = 8 + 2 spaces = 10)
809
810        // Check nesting levels
811        assert_eq!(cache.list_items[0].nesting_level, 0);
812        assert_eq!(cache.list_items[1].nesting_level, 1);
813        assert_eq!(cache.list_items[2].nesting_level, 1);
814        assert_eq!(cache.list_items[3].nesting_level, 2);
815    }
816
817    #[test]
818    fn test_tab_width_configuration() {
819        // Test with different tab widths (default should be 4)
820        let content = "\t* Single tab\n\t\t* Double tab\n";
821        let cache = ElementCache::new(content);
822
823        assert_eq!(cache.list_items.len(), 2);
824
825        // With default tab width of 4
826        assert_eq!(cache.list_items[0].indentation, 4); // "\t*" = 4 spaces
827        assert_eq!(cache.list_items[1].indentation, 8); // "\t\t*" = 8 spaces
828
829        // Check nesting levels
830        assert_eq!(cache.list_items[0].nesting_level, 0);
831        assert_eq!(cache.list_items[1].nesting_level, 1);
832    }
833
834    #[test]
835    fn test_tab_expansion_debug() {
836        // Debug the tab expansion logic
837        assert_eq!(ElementCache::calculate_indentation_width_default(""), 0);
838        assert_eq!(ElementCache::calculate_indentation_width_default(" "), 1);
839        assert_eq!(ElementCache::calculate_indentation_width_default("  "), 2);
840        assert_eq!(ElementCache::calculate_indentation_width_default("    "), 4);
841        assert_eq!(ElementCache::calculate_indentation_width_default("\t"), 4);
842        assert_eq!(ElementCache::calculate_indentation_width_default("\t\t"), 8);
843        assert_eq!(ElementCache::calculate_indentation_width_default("\t  "), 6); // tab to 4, then 2 spaces = 6
844        assert_eq!(ElementCache::calculate_indentation_width_default("  \t"), 4); // 2 spaces, then tab to next stop (4)
845        assert_eq!(ElementCache::calculate_indentation_width_default("\t\t  "), 10);
846        // 2 tabs = 8, then 2 spaces = 10
847    }
848
849    #[test]
850    fn test_mixed_tabs_debug() {
851        // Debug the specific failing case
852        let content = "* Level 0\n\t  * Tab + 2 spaces (should be level 1)\n  \t* 2 spaces + tab (should be level 1)\n\t\t  * 2 tabs + 2 spaces (should be level 2)\n";
853        let cache = ElementCache::new(content);
854
855        println!("Number of list items: {}", cache.list_items.len());
856        for (i, item) in cache.list_items.iter().enumerate() {
857            println!(
858                "Item {}: indent_str={:?}, indentation={}, content={:?}",
859                i, item.indent_str, item.indentation, item.content
860            );
861        }
862
863        // Test the specific indentation strings
864        assert_eq!(ElementCache::calculate_indentation_width_default("\t  "), 6); // tab + 2 spaces
865        assert_eq!(ElementCache::calculate_indentation_width_default("  \t"), 4); // 2 spaces + tab
866        assert_eq!(ElementCache::calculate_indentation_width_default("\t\t  "), 10);
867        // 2 tabs + 2 spaces
868    }
869}
rumdl_lib/utils/element_cache.rs

rumdl_lib/utils/
element_cache.rs