rumdl_lib/utils/
range_utils.rs

1//! Utilities for position/range conversions
2
3use crate::utils::element_cache::ElementCache;
4use std::collections::HashSet;
5use std::ops::Range;
6
/// Round `byte_idx` down to the closest UTF-8 character boundary of `s`.
///
/// Slicing a `str` at an offset that falls inside a multi-byte character
/// panics, so callers use this to obtain an offset that is always safe to
/// slice at.
///
/// Returns `s.len()` when `byte_idx` is at or beyond the end of the string;
/// otherwise returns the largest boundary that is `<= byte_idx`.
fn find_char_boundary(s: &str, byte_idx: usize) -> usize {
    let len = s.len();
    if byte_idx >= len {
        return len;
    }

    // Walk backwards from the requested offset. Offset 0 is always a
    // boundary, so the search cannot come up empty.
    (0..=byte_idx)
        .rev()
        .find(|&candidate| s.is_char_boundary(candidate))
        .unwrap_or(0)
}
31
/// Translate a byte offset into a 1-indexed character position.
///
/// An offset that falls inside a multi-byte character is treated as if it
/// pointed at the start of that character, and an offset at or past the end
/// of `s` maps to one past the last character.
fn byte_to_char_count(s: &str, byte_idx: usize) -> usize {
    // Count the characters that end at or before the offset; this is the
    // number of whole characters preceding the (rounded-down) position.
    let preceding = s
        .char_indices()
        .take_while(|&(start, ch)| start + ch.len_utf8() <= byte_idx)
        .count();
    preceding + 1 // 1-indexed
}
38
39#[derive(Debug)]
40pub struct LineIndex<'a> {
41    line_starts: Vec<usize>,
42    content: &'a str,
43    code_block_lines: Option<HashSet<usize>>,
44}
45
46impl<'a> LineIndex<'a> {
47    pub fn new(content: &'a str) -> Self {
48        let mut line_starts = vec![0];
49        let mut pos = 0;
50
51        for c in content.chars() {
52            pos += c.len_utf8();
53            if c == '\n' {
54                line_starts.push(pos);
55            }
56        }
57
58        let mut index = Self {
59            line_starts,
60            content,
61            code_block_lines: None,
62        };
63
64        // Pre-compute code block lines for better performance
65        index.compute_code_block_lines();
66
67        index
68    }
69
70    /// Create a `LineIndex` from pre-computed line start byte offsets.
71    /// Each entry is the byte offset of the first character on that line.
72    /// The first entry must be 0 (start of content).
73    pub fn with_line_starts(content: &'a str, line_starts: Vec<usize>) -> Self {
74        let mut index = Self {
75            line_starts,
76            content,
77            code_block_lines: None,
78        };
79
80        // Pre-compute code block lines for better performance
81        index.compute_code_block_lines();
82
83        index
84    }
85
86    /// Create a `LineIndex` from pre-computed line starts and code block byte ranges.
87    ///
88    /// Instead of re-scanning content to find code blocks, this converts
89    /// the already-detected byte ranges into line-level information.
90    pub fn with_line_starts_and_code_blocks(
91        content: &'a str,
92        line_starts: Vec<usize>,
93        code_block_byte_ranges: &[(usize, usize)],
94    ) -> Self {
95        let mut code_block_lines = HashSet::new();
96
97        for &(block_start, block_end) in code_block_byte_ranges {
98            let start_line = match line_starts.binary_search(&block_start) {
99                Ok(idx) => idx,
100                Err(idx) => idx.saturating_sub(1),
101            };
102            let end_line = if block_end == 0 {
103                0
104            } else {
105                match line_starts.binary_search(&block_end) {
106                    // block_end exactly at a line start means the block ended on the previous line
107                    Ok(idx) => idx.saturating_sub(1),
108                    Err(idx) => idx.saturating_sub(1),
109                }
110            };
111            for line_idx in start_line..=end_line {
112                code_block_lines.insert(line_idx);
113            }
114        }
115
116        Self {
117            line_starts,
118            content,
119            code_block_lines: Some(code_block_lines),
120        }
121    }
122
123    /// Get the content of a line by 0-based index using pre-computed byte offsets.
124    /// Returns the line content without the trailing newline character.
125    fn get_line(&self, line_idx: usize) -> Option<&'a str> {
126        let start = *self.line_starts.get(line_idx)?;
127        let end = self
128            .line_starts
129            .get(line_idx + 1)
130            .copied()
131            .unwrap_or(self.content.len());
132        let line = &self.content[start..end];
133        // Strip trailing newline (and optional \r before it)
134        let line = line.strip_suffix('\n').unwrap_or(line);
135        let line = line.strip_suffix('\r').unwrap_or(line);
136        Some(line)
137    }
138
139    pub fn line_col_to_byte_range(&self, line: usize, column: usize) -> Range<usize> {
140        let line = line.saturating_sub(1);
141        let line_start = *self.line_starts.get(line).unwrap_or(&self.content.len());
142
143        let current_line = self.get_line(line).unwrap_or("");
144        // Column is 1-indexed character position, not byte position
145        let char_col = column.saturating_sub(1);
146        let char_count = current_line.chars().count();
147        let safe_char_col = char_col.min(char_count);
148
149        // Convert character position to byte position
150        let byte_offset = current_line
151            .char_indices()
152            .nth(safe_char_col)
153            .map(|(idx, _)| idx)
154            .unwrap_or(current_line.len());
155
156        let start = line_start + byte_offset;
157        start..start
158    }
159
160    /// Calculate a proper byte range for replacing text with a specific length
161    /// This is the correct function to use for LSP fixes
162    ///
163    /// # Safety
164    /// This function correctly handles multi-byte UTF-8 characters by converting
165    /// character positions (columns) to byte positions.
166    pub fn line_col_to_byte_range_with_length(&self, line: usize, column: usize, length: usize) -> Range<usize> {
167        let line = line.saturating_sub(1);
168        let line_start = *self.line_starts.get(line).unwrap_or(&self.content.len());
169        let line_end = self.line_starts.get(line + 1).copied().unwrap_or(self.content.len());
170        let mut current_line = &self.content[line_start..line_end];
171        if let Some(stripped) = current_line.strip_suffix('\n') {
172            current_line = stripped.strip_suffix('\r').unwrap_or(stripped);
173        }
174        if current_line.is_ascii() {
175            let line_len = current_line.len();
176            let start_byte = column.saturating_sub(1).min(line_len);
177            let end_byte = start_byte.saturating_add(length).min(line_len);
178            let start = line_start + start_byte;
179            let end = line_start + end_byte;
180            return start..end;
181        }
182        // Column is 1-indexed character position, not byte position
183        let char_col = column.saturating_sub(1);
184        let char_count = current_line.chars().count();
185        let safe_char_col = char_col.min(char_count);
186
187        // Convert character positions to byte positions
188        let mut char_indices = current_line.char_indices();
189        let start_byte = char_indices
190            .nth(safe_char_col)
191            .map(|(idx, _)| idx)
192            .unwrap_or(current_line.len());
193
194        // Calculate end position (start + length in characters)
195        let end_char_col = (safe_char_col + length).min(char_count);
196        let end_byte = current_line
197            .char_indices()
198            .nth(end_char_col)
199            .map(|(idx, _)| idx)
200            .unwrap_or(current_line.len());
201
202        let start = line_start + start_byte;
203        let end = line_start + end_byte;
204        start..end
205    }
206
207    /// Calculate byte range for entire line replacement (including newline)
208    /// This is ideal for rules that need to replace complete lines
209    pub fn whole_line_range(&self, line: usize) -> Range<usize> {
210        let line_idx = line.saturating_sub(1);
211        let start = *self.line_starts.get(line_idx).unwrap_or(&self.content.len());
212        let end = self
213            .line_starts
214            .get(line_idx + 1)
215            .copied()
216            .unwrap_or(self.content.len());
217        start..end
218    }
219
220    /// Calculate byte range spanning multiple lines (from start_line to end_line inclusive)
221    /// Both lines are 1-indexed. This is useful for replacing entire blocks like tables.
222    pub fn multi_line_range(&self, start_line: usize, end_line: usize) -> Range<usize> {
223        let start_idx = start_line.saturating_sub(1);
224        let end_idx = end_line.saturating_sub(1);
225
226        let start = *self.line_starts.get(start_idx).unwrap_or(&self.content.len());
227        let end = self.line_starts.get(end_idx + 1).copied().unwrap_or(self.content.len());
228        start..end
229    }
230
231    /// Calculate byte range for text within a line (excluding newline)
232    /// Useful for replacing specific parts of a line
233    ///
234    /// # Safety
235    /// This function correctly handles multi-byte UTF-8 characters by converting
236    /// character positions (columns) to byte positions.
237    pub fn line_text_range(&self, line: usize, start_col: usize, end_col: usize) -> Range<usize> {
238        let line_idx = line.saturating_sub(1);
239        let line_start = *self.line_starts.get(line_idx).unwrap_or(&self.content.len());
240
241        // Get the actual line content to ensure we don't exceed bounds
242        let current_line = self.get_line(line_idx).unwrap_or("");
243        let char_count = current_line.chars().count();
244
245        // Convert character positions to byte positions
246        let start_char_col = start_col.saturating_sub(1).min(char_count);
247        let end_char_col = end_col.saturating_sub(1).min(char_count);
248
249        let mut char_indices = current_line.char_indices();
250        let start_byte = char_indices
251            .nth(start_char_col)
252            .map(|(idx, _)| idx)
253            .unwrap_or(current_line.len());
254
255        let end_byte = current_line
256            .char_indices()
257            .nth(end_char_col)
258            .map(|(idx, _)| idx)
259            .unwrap_or(current_line.len());
260
261        let start = line_start + start_byte;
262        let end = line_start + end_byte.max(start_byte);
263        start..end
264    }
265
266    /// Calculate byte range from start of line to end of line content (excluding newline)
267    /// Useful for replacing line content while preserving line structure
268    pub fn line_content_range(&self, line: usize) -> Range<usize> {
269        let line_idx = line.saturating_sub(1);
270        let line_start = *self.line_starts.get(line_idx).unwrap_or(&self.content.len());
271
272        let current_line = self.get_line(line_idx).unwrap_or("");
273        let line_end = line_start + current_line.len();
274        line_start..line_end
275    }
276
277    /// Get the global start byte offset for a given 1-based line number.
278    pub fn get_line_start_byte(&self, line_num: usize) -> Option<usize> {
279        if line_num == 0 {
280            return None; // Lines are 1-based
281        }
282        // line_num is 1-based, line_starts index is 0-based
283        self.line_starts.get(line_num - 1).cloned()
284    }
285
286    /// Check if the line at the given index is within a code block
287    pub fn is_code_block(&self, line: usize) -> bool {
288        if let Some(ref code_block_lines) = self.code_block_lines {
289            code_block_lines.contains(&line)
290        } else {
291            // Fallback to a simpler check if pre-computation wasn't done
292            self.is_code_fence(line)
293        }
294    }
295
296    /// Check if the line is a code fence marker (``` or ~~~)
297    pub fn is_code_fence(&self, line: usize) -> bool {
298        self.get_line(line).is_some_and(|l| {
299            let trimmed = l.trim();
300            trimmed.starts_with("```") || trimmed.starts_with("~~~")
301        })
302    }
303
304    /// Check if the line is a tilde code fence marker (~~~)
305    pub fn is_tilde_code_block(&self, line: usize) -> bool {
306        self.get_line(line).is_some_and(|l| l.trim().starts_with("~~~"))
307    }
308
309    /// Get a reference to the content
310    pub fn get_content(&self) -> &str {
311        self.content
312    }
313
314    /// Pre-compute which lines are within code blocks for faster lookup
315    fn compute_code_block_lines(&mut self) {
316        let mut code_block_lines = HashSet::new();
317        let lines: Vec<&str> = self.content.lines().collect();
318
319        // Initialize block tracking
320        let mut in_block = false;
321        let mut active_fence_type = ' '; // '`' or '~'
322        let mut block_indent = 0;
323        let mut block_fence_length = 0;
324        let mut in_markdown_block = false;
325        let mut nested_fence_start = None;
326        let mut nested_fence_end = None;
327
328        // Process each line
329        for (i, line) in lines.iter().enumerate() {
330            let trimmed = line.trim();
331            let indent = line.len() - trimmed.len();
332
333            // 1. Detect indented code blocks (4+ columns accounting for tab expansion)
334            if ElementCache::calculate_indentation_width_default(line) >= 4 {
335                code_block_lines.insert(i);
336                continue; // Skip further processing for indented code blocks
337            }
338
339            // 2. Handle fenced code blocks (backticks and tildes)
340            if !in_block {
341                // Check for opening fences
342                if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
343                    let char_type = if trimmed.starts_with("```") { '`' } else { '~' };
344                    let count = trimmed.chars().take_while(|&c| c == char_type).count();
345                    let info_string = if trimmed.len() > count {
346                        trimmed[count..].trim()
347                    } else {
348                        ""
349                    };
350
351                    // Mark the start of a new code block
352                    in_block = true;
353                    active_fence_type = char_type;
354                    block_indent = indent;
355                    block_fence_length = count;
356                    in_markdown_block = info_string == "markdown";
357                    nested_fence_start = None;
358                    nested_fence_end = None;
359
360                    code_block_lines.insert(i);
361                }
362            } else {
363                // We're inside a code block
364                code_block_lines.insert(i);
365
366                // Detection of nested fences in markdown blocks
367                if in_markdown_block && nested_fence_start.is_none() && trimmed.starts_with("```") {
368                    // Check if this looks like a nested fence opening (has content after the backticks)
369                    let count = trimmed.chars().take_while(|&c| c == '`').count();
370                    let remaining = if trimmed.len() > count {
371                        trimmed[count..].trim()
372                    } else {
373                        ""
374                    };
375
376                    if !remaining.is_empty() {
377                        nested_fence_start = Some(i);
378                    }
379                }
380
381                // Check if we've found a nested fence end (only if we have a start)
382                if in_markdown_block
383                    && nested_fence_start.is_some()
384                    && nested_fence_end.is_none()
385                    && trimmed.starts_with("```")
386                    && trimmed.trim_start_matches('`').trim().is_empty()
387                {
388                    nested_fence_end = Some(i);
389                }
390
391                // Check if this line matches the closing fence pattern for the outer block
392                if trimmed.starts_with(&active_fence_type.to_string().repeat(3)) {
393                    let count = trimmed.chars().take_while(|&c| c == active_fence_type).count();
394                    let remaining = if trimmed.len() > count {
395                        trimmed[count..].trim()
396                    } else {
397                        ""
398                    };
399
400                    // A line is a closing fence if:
401                    // 1. It uses the same fence character as the opening fence
402                    // 2. It has at least as many fence characters as the opening fence
403                    // 3. It has no content after the fence characters (except for whitespace)
404                    // 4. Its indentation level is less than or equal to the opening fence
405                    let is_valid_closing_fence =
406                        count >= block_fence_length && remaining.is_empty() && indent <= block_indent;
407
408                    // For nested code blocks in markdown, the first backtick fence after the nested content
409                    // should be recognized as the closing fence for the outer block
410                    let is_nested_closing = nested_fence_end.is_some() && i == nested_fence_end.unwrap();
411
412                    // Skip nested closing fences
413                    if is_valid_closing_fence && !is_nested_closing {
414                        in_block = false;
415                        in_markdown_block = false;
416                    }
417                }
418            }
419        }
420
421        self.code_block_lines = Some(code_block_lines);
422    }
423}
424
/// Build a `(start_line, start_col, end_line, end_col)` tuple for a range
/// that stays on one line: it begins at `start_col` and its end column is
/// `start_col + length`.
pub fn calculate_single_line_range(line: usize, start_col: usize, length: usize) -> (usize, usize, usize, usize) {
    let end_col = start_col + length;
    (line, start_col, line, end_col)
}
429
/// Calculate the range covering an entire line, ignoring trailing whitespace.
///
/// Returns `(line, 1, line, end_col)` where `end_col` is one past the last
/// non-whitespace character. Columns are 1-indexed character positions (not
/// byte offsets), so lines containing multi-byte UTF-8 characters get the
/// same kind of column as the other range helpers in this module.
pub fn calculate_line_range(line: usize, line_content: &str) -> (usize, usize, usize, usize) {
    // Count characters, not bytes: a byte length would overshoot the end
    // column for any non-ASCII text.
    let trimmed_chars = line_content.trim_end().chars().count();
    (line, 1, line, trimmed_chars + 1)
}
435
436/// Calculate range from regex match on a line
437///
438/// # Safety
439/// This function safely handles multi-byte UTF-8 characters by ensuring all
440/// string slicing operations occur at valid character boundaries.
441pub fn calculate_match_range(
442    line: usize,
443    line_content: &str,
444    match_start: usize,
445    match_len: usize,
446) -> (usize, usize, usize, usize) {
447    // Bounds check to prevent panic
448    let line_len = line_content.len();
449    if match_start > line_len {
450        // If match_start is beyond line bounds, return a safe range at end of line
451        let char_count = line_content.chars().count();
452        return (line, char_count + 1, line, char_count + 1);
453    }
454
455    // Find safe character boundaries for the match range
456    let safe_match_start = find_char_boundary(line_content, match_start);
457    let safe_match_end_byte = find_char_boundary(line_content, (match_start + match_len).min(line_len));
458
459    // Convert byte positions to character positions safely
460    let char_start = byte_to_char_count(line_content, safe_match_start);
461    let char_len = if safe_match_end_byte > safe_match_start {
462        // Count characters in the safe range
463        line_content[safe_match_start..safe_match_end_byte].chars().count()
464    } else {
465        0
466    };
467    (line, char_start, line, char_start + char_len)
468}
469
470/// Calculate range for trailing content (like trailing spaces)
471///
472/// # Safety
473/// This function safely handles multi-byte UTF-8 characters by ensuring all
474/// string slicing operations occur at valid character boundaries.
475pub fn calculate_trailing_range(line: usize, line_content: &str, content_end: usize) -> (usize, usize, usize, usize) {
476    // Find safe character boundary for content_end
477    let safe_content_end = find_char_boundary(line_content, content_end);
478    let char_content_end = byte_to_char_count(line_content, safe_content_end);
479    let line_char_len = line_content.chars().count() + 1;
480    (line, char_content_end, line, line_char_len)
481}
482
483/// Calculate range for a heading (entire line)
484pub fn calculate_heading_range(line: usize, line_content: &str) -> (usize, usize, usize, usize) {
485    calculate_line_range(line, line_content)
486}
487
488/// Calculate range for emphasis markers and content
489///
490/// # Safety
491/// This function safely handles multi-byte UTF-8 characters by ensuring all
492/// string slicing operations occur at valid character boundaries.
493pub fn calculate_emphasis_range(
494    line: usize,
495    line_content: &str,
496    start_pos: usize,
497    end_pos: usize,
498) -> (usize, usize, usize, usize) {
499    // Find safe character boundaries for start and end positions
500    let safe_start_pos = find_char_boundary(line_content, start_pos);
501    let safe_end_pos = find_char_boundary(line_content, end_pos);
502    let char_start = byte_to_char_count(line_content, safe_start_pos);
503    let char_end = byte_to_char_count(line_content, safe_end_pos);
504    (line, char_start, line, char_end)
505}
506
507/// Calculate range for HTML tags
508pub fn calculate_html_tag_range(
509    line: usize,
510    line_content: &str,
511    tag_start: usize,
512    tag_len: usize,
513) -> (usize, usize, usize, usize) {
514    calculate_match_range(line, line_content, tag_start, tag_len)
515}
516
517/// Calculate range for URLs
518pub fn calculate_url_range(
519    line: usize,
520    line_content: &str,
521    url_start: usize,
522    url_len: usize,
523) -> (usize, usize, usize, usize) {
524    calculate_match_range(line, line_content, url_start, url_len)
525}
526
527/// Calculate range for list markers
528pub fn calculate_list_marker_range(
529    line: usize,
530    line_content: &str,
531    marker_start: usize,
532    marker_len: usize,
533) -> (usize, usize, usize, usize) {
534    calculate_match_range(line, line_content, marker_start, marker_len)
535}
536
/// Calculate the range of the part of a line that exceeds a limit (such as
/// a maximum line length).
///
/// `limit` is measured in characters. The range starts at the column right
/// after the limit (clamped to the end of the line) and ends one past the
/// last character; columns are 1-indexed character positions.
pub fn calculate_excess_range(line: usize, line_content: &str, limit: usize) -> (usize, usize, usize, usize) {
    let total_chars = line_content.chars().count();
    let start_col = limit.min(total_chars) + 1;
    let end_col = total_chars + 1;
    (line, start_col, line, end_col)
}
543
544#[cfg(test)]
545mod tests {
546    use super::*;
547
548    #[test]
549    fn test_single_line_range() {
550        let (start_line, start_col, end_line, end_col) = calculate_single_line_range(5, 10, 3);
551        assert_eq!(start_line, 5);
552        assert_eq!(start_col, 10);
553        assert_eq!(end_line, 5);
554        assert_eq!(end_col, 13);
555    }
556
557    #[test]
558    fn test_line_range() {
559        let content = "# This is a heading  ";
560        let (start_line, start_col, end_line, end_col) = calculate_line_range(1, content);
561        assert_eq!(start_line, 1);
562        assert_eq!(start_col, 1);
563        assert_eq!(end_line, 1);
564        assert_eq!(end_col, 20); // Trimmed length + 1
565    }
566
567    #[test]
568    fn test_match_range() {
569        let content = "Text <div>content</div> more";
570        let tag_start = 5; // Position of '<'
571        let tag_len = 5; // Length of "<div>"
572        let (start_line, start_col, end_line, end_col) = calculate_match_range(1, content, tag_start, tag_len);
573        assert_eq!(start_line, 1);
574        assert_eq!(start_col, 6); // 1-indexed
575        assert_eq!(end_line, 1);
576        assert_eq!(end_col, 11); // 6 + 5
577    }
578
579    #[test]
580    fn test_trailing_range() {
581        let content = "Text content   "; // 3 trailing spaces
582        let content_end = 12; // End of "Text content"
583        let (start_line, start_col, end_line, end_col) = calculate_trailing_range(1, content, content_end);
584        assert_eq!(start_line, 1);
585        assert_eq!(start_col, 13); // content_end + 1 (1-indexed)
586        assert_eq!(end_line, 1);
587        assert_eq!(end_col, 16); // Total length + 1
588    }
589
590    #[test]
591    fn test_excess_range() {
592        let content = "This line is too long for the limit";
593        let limit = 20;
594        let (start_line, start_col, end_line, end_col) = calculate_excess_range(1, content, limit);
595        assert_eq!(start_line, 1);
596        assert_eq!(start_col, 21); // limit + 1
597        assert_eq!(end_line, 1);
598        assert_eq!(end_col, 36); // Total length + 1 (35 chars + 1 = 36)
599    }
600
601    #[test]
602    fn test_whole_line_range() {
603        let content = "Line 1\nLine 2\nLine 3";
604        let line_index = LineIndex::new(content);
605
606        // Test first line (includes newline)
607        let range = line_index.whole_line_range(1);
608        assert_eq!(range, 0..7); // "Line 1\n"
609
610        // Test middle line
611        let range = line_index.whole_line_range(2);
612        assert_eq!(range, 7..14); // "Line 2\n"
613
614        // Test last line (no newline)
615        let range = line_index.whole_line_range(3);
616        assert_eq!(range, 14..20); // "Line 3"
617    }
618
619    #[test]
620    fn test_line_content_range() {
621        let content = "Line 1\nLine 2\nLine 3";
622        let line_index = LineIndex::new(content);
623
624        // Test first line content (excludes newline)
625        let range = line_index.line_content_range(1);
626        assert_eq!(range, 0..6); // "Line 1"
627
628        // Test middle line content
629        let range = line_index.line_content_range(2);
630        assert_eq!(range, 7..13); // "Line 2"
631
632        // Test last line content
633        let range = line_index.line_content_range(3);
634        assert_eq!(range, 14..20); // "Line 3"
635    }
636
637    #[test]
638    fn test_line_text_range() {
639        let content = "Hello world\nAnother line";
640        let line_index = LineIndex::new(content);
641
642        // Test partial text in first line
643        let range = line_index.line_text_range(1, 1, 5); // "Hell"
644        assert_eq!(range, 0..4);
645
646        // Test partial text in second line
647        let range = line_index.line_text_range(2, 1, 7); // "Another"
648        assert_eq!(range, 12..18);
649
650        // Test bounds checking
651        let range = line_index.line_text_range(1, 1, 100); // Should clamp to line end
652        assert_eq!(range, 0..11); // "Hello world"
653    }
654
655    #[test]
656    fn test_calculate_match_range_bounds_checking() {
657        // Test case 1: match_start beyond line bounds
658        let line_content = "] not a link [";
659        let (line, start_col, end_line, end_col) = calculate_match_range(121, line_content, 57, 10);
660        assert_eq!(line, 121);
661        assert_eq!(start_col, 15); // line length + 1
662        assert_eq!(end_line, 121);
663        assert_eq!(end_col, 15); // same as start when out of bounds
664
665        // Test case 2: match extends beyond line end
666        let line_content = "short";
667        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 2, 10);
668        assert_eq!(line, 1);
669        assert_eq!(start_col, 3); // position 2 + 1
670        assert_eq!(end_line, 1);
671        assert_eq!(end_col, 6); // clamped to line length + 1
672
673        // Test case 3: normal case within bounds
674        let line_content = "normal text here";
675        let (line, start_col, end_line, end_col) = calculate_match_range(5, line_content, 7, 4);
676        assert_eq!(line, 5);
677        assert_eq!(start_col, 8); // position 7 + 1
678        assert_eq!(end_line, 5);
679        assert_eq!(end_col, 12); // position 7 + 4 + 1
680
681        // Test case 4: zero length match
682        let line_content = "test line";
683        let (line, start_col, end_line, end_col) = calculate_match_range(10, line_content, 5, 0);
684        assert_eq!(line, 10);
685        assert_eq!(start_col, 6); // position 5 + 1
686        assert_eq!(end_line, 10);
687        assert_eq!(end_col, 6); // same as start for zero length
688    }
689
690    // ============================================================================
691    // UTF-8 Multi-byte Character Tests (Issue #154)
692    // ============================================================================
693
694    #[test]
695    fn test_issue_154_korean_character_boundary() {
696        // Exact reproduction of issue #154: Korean character '후' (3 bytes: 18..21)
697        // The error was: "byte index 19 is not a char boundary; it is inside '후'"
698        let line_content = "- 2023 년 초 이후 주가 상승        +1,000% (10 배 상승)  ";
699
700        // Test match at byte 19 (middle of '후' character)
701        // This should not panic and should find the nearest character boundary
702        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 19, 1);
703
704        // Should successfully calculate without panicking
705        assert!(start_col > 0);
706        assert_eq!(line, 1);
707        assert_eq!(end_line, 1);
708        assert!(end_col >= start_col);
709    }
710
711    #[test]
712    fn test_calculate_match_range_korean() {
713        // Korean text: "안녕하세요" (Hello in Korean)
714        // Each character is 3 bytes
715        let line_content = "안녕하세요";
716        // Match at byte 3 (start of second character)
717        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 3, 3);
718        assert_eq!(line, 1);
719        assert_eq!(start_col, 2); // Second character (1-indexed)
720        assert_eq!(end_line, 1);
721        assert_eq!(end_col, 3); // End of second character
722
723        // Match at byte 4 (middle of second character - should round down)
724        let (line, start_col, end_line, _end_col) = calculate_match_range(1, line_content, 4, 3);
725        assert_eq!(line, 1);
726        assert_eq!(start_col, 2); // Should round to start of character
727        assert_eq!(end_line, 1);
728    }
729
730    #[test]
731    fn test_calculate_match_range_chinese() {
732        // Chinese text: "你好世界" (Hello World)
733        // Each character is 3 bytes
734        let line_content = "你好世界";
735        // Match at byte 6 (start of third character)
736        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 6, 3);
737        assert_eq!(line, 1);
738        assert_eq!(start_col, 3); // Third character (1-indexed)
739        assert_eq!(end_line, 1);
740        assert_eq!(end_col, 4); // End of third character
741    }
742
743    #[test]
744    fn test_calculate_match_range_japanese() {
745        // Japanese text: "こんにちは" (Hello)
746        // Each character is 3 bytes
747        let line_content = "こんにちは";
748        // Match at byte 9 (start of fourth character)
749        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 9, 3);
750        assert_eq!(line, 1);
751        assert_eq!(start_col, 4); // Fourth character (1-indexed)
752        assert_eq!(end_line, 1);
753        assert_eq!(end_col, 5); // End of fourth character
754    }
755
#[test]
fn test_calculate_match_range_mixed_unicode() {
    // ASCII prefix followed by CJK:
    //   "Hello " -> bytes 0..=5, characters 1..=6 (one byte each)
    //   "世"     -> bytes 6..=8, character 7
    //   "界"     -> bytes 9..=11, character 8
    let text = "Hello 世界";

    // One-byte match on the space at byte 5: column 6, ending at column 7.
    let on_space = calculate_match_range(1, text, 5, 1);
    assert_eq!(on_space, (1, 6, 1, 7));

    // Three-byte match on "世" at byte 6: column 7, ending at column 8.
    let on_first_cjk = calculate_match_range(1, text, 6, 3);
    assert_eq!(on_first_cjk, (1, 7, 1, 8));
}
778
#[test]
fn test_calculate_trailing_range_korean() {
    // Five Hangul syllables (3 bytes each in UTF-8) followed by three trailing spaces.
    let text = "안녕하세요   ";

    // Byte 15 = 5 * 3: the offset just past the last syllable, where the spaces begin.
    let (line_no, col_from, line_to, col_to) = calculate_trailing_range(1, text, 15);

    // The reported range has to stay on line 1 ...
    assert_eq!((line_no, line_to), (1, 1));
    // ... start at a positive column, and end strictly after it starts.
    assert!(col_from > 0);
    assert!(col_to > col_from);
}
790
#[test]
fn test_calculate_emphasis_range_chinese() {
    // Byte layout of the line (UTF-8):
    //   "这" 0..=2, "是" 3..=5, "**" 6..=7, "重" 8..=10, "要" 11..=13, "**" 14..=15, "的" 16..=18
    let text = "这是**重要**的";

    // Start at byte 6 (the opening "**"); end at byte 12, which lands inside "要"
    // rather than on a character boundary.
    let (line_no, col_from, line_to, col_to) = calculate_emphasis_range(1, text, 6, 12);

    // The reported range has to stay on line 1 ...
    assert_eq!((line_no, line_to), (1, 1));
    // ... start at a positive column, and end strictly after it starts.
    assert!(col_from > 0);
    assert!(col_to > col_from);
}
802
#[test]
fn test_line_col_to_byte_range_korean() {
    // Columns are character positions; each Hangul syllable on line 1 is 3 bytes wide,
    // so column N maps to byte offset 3 * (N - 1).
    let index = LineIndex::new("안녕하세요\nWorld");

    // (column, expected byte offset): each expected range is empty and sits at
    // the byte offset of that column.
    for (column, offset) in [(1, 0), (2, 3), (3, 6)] {
        assert_eq!(index.line_col_to_byte_range(1, column), offset..offset);
    }
}
821
#[test]
fn test_line_col_to_byte_range_with_length_chinese() {
    // Line 1 holds four CJK characters of 3 bytes each; the length argument is
    // counted in characters and the result is a byte range.
    let index = LineIndex::new("你好世界\nTest");

    // Two characters starting at column 1 -> the first six bytes.
    let first_two = index.line_col_to_byte_range_with_length(1, 1, 2);
    assert_eq!(first_two, 0..6);

    // One character starting at column 2 -> bytes 3 up to (excluding) 6.
    let second_only = index.line_col_to_byte_range_with_length(1, 2, 1);
    assert_eq!(second_only, 3..6);
}
836
#[test]
fn test_line_text_range_japanese() {
    // Line 1 holds five hiragana of 3 bytes each.
    let index = LineIndex::new("こんにちは\nHello");

    // Columns 2 and 4 on line 1 map to the byte range 3..9: six bytes, i.e. the
    // second and third characters, so the end column appears to be exclusive here.
    let span = index.line_text_range(1, 2, 4);
    assert_eq!(span, 3..9);
}
847
#[test]
fn test_find_char_boundary_edge_cases() {
    // Two Hangul syllables: bytes 0..=2 and 3..=5, so the only in-range
    // boundaries among indices 0..=4 are 0 and 3.
    let s = "안녕";

    // Indices 0 and 3 are already boundaries; 1, 2 and 4 sit inside a character
    // and must be rounded down to the start of that character.
    let rounded: Vec<usize> = (0..=4).map(|idx| find_char_boundary(s, idx)).collect();
    assert_eq!(rounded, [0, 0, 0, 3, 3]);

    // An index past the end of the string yields the string length.
    assert_eq!(find_char_boundary(s, 100), s.len());
}
871
#[test]
fn test_byte_to_char_count_unicode() {
    // Five Hangul syllables, 3 bytes each: character starts are bytes 0, 3, 6, 9, 12
    // and byte 15 is the end of the string.
    let s = "안녕하세요";

    // The function returns a 1-indexed count, so the start of the k-th character
    // gives k, and the end of the string gives 5 + 1 = 6.
    let counts: Vec<usize> = (0..=15)
        .step_by(3)
        .map(|byte_idx| byte_to_char_count(s, byte_idx))
        .collect();
    assert_eq!(counts, [1, 2, 3, 4, 5, 6]);
}
895
#[test]
fn test_all_range_functions_with_emoji() {
    // Emoji are 4-byte UTF-8 characters. Byte layout of the line:
    //   "Hello " 0..=5, "🎉" 6..=9, " " 10, "World" 11..=15, " " 16, "🌍" 17..=20
    let text = "Hello 🎉 World 🌍";

    // calculate_match_range: a 4-byte match covering the first emoji.
    let (line_no, col_from, line_to, col_to) = calculate_match_range(1, text, 6, 4);
    assert_eq!((line_no, line_to), (1, 1));
    assert!(col_from > 0);
    assert!(col_to > col_from);

    // calculate_trailing_range: content ends at byte 12, inside "World".
    let (line_no, col_from, line_to, col_to) = calculate_trailing_range(1, text, 12);
    assert_eq!((line_no, line_to), (1, 1));
    assert!(col_from > 0);
    assert!(col_to > col_from);

    // calculate_emphasis_range: bytes 0..5 are the ASCII word "Hello", so the
    // start column is exactly 1.
    let (line_no, col_from, line_to, col_to) = calculate_emphasis_range(1, text, 0, 5);
    assert_eq!((line_no, col_from, line_to), (1, 1, 1));
    assert!(col_to > col_from);
}
922}
rumdl_lib/utils/range_utils.rs

rumdl_lib/utils/
range_utils.rs