rumdl_lib/utils/
range_utils.rs

1//! Utilities for position/range conversions
2
3use crate::utils::element_cache::ElementCache;
4use std::collections::HashSet;
5use std::ops::Range;
6
/// Clamp `byte_idx` to an offset at which `s` can be sliced safely.
///
/// Returns `byte_idx` unchanged when it already sits on a UTF-8 character
/// boundary, the start of the enclosing character when it points into the
/// middle of a multi-byte sequence, and `s.len()` when it is at or past the
/// end of the string.
fn find_char_boundary(s: &str, byte_idx: usize) -> usize {
    match byte_idx {
        idx if idx >= s.len() => s.len(),
        idx => (0..=idx)
            .rev()
            .find(|&candidate| s.is_char_boundary(candidate))
            // Offset 0 is always a boundary, so the search cannot come up empty.
            .unwrap_or(0),
    }
}
31
32/// Convert a byte index to a character count (1-indexed).
33/// This safely handles multi-byte UTF-8 characters by finding the nearest character boundary.
34fn byte_to_char_count(s: &str, byte_idx: usize) -> usize {
35    let safe_byte_idx = find_char_boundary(s, byte_idx);
36    s[..safe_byte_idx].chars().count() + 1 // 1-indexed
37}
38
39#[derive(Debug)]
40pub struct LineIndex<'a> {
41    line_starts: Vec<usize>,
42    content: &'a str,
43    code_block_lines: Option<HashSet<usize>>,
44}
45
46impl<'a> LineIndex<'a> {
47    pub fn new(content: &'a str) -> Self {
48        let mut line_starts = vec![0];
49        let mut pos = 0;
50
51        for c in content.chars() {
52            pos += c.len_utf8();
53            if c == '\n' {
54                line_starts.push(pos);
55            }
56        }
57
58        let mut index = Self {
59            line_starts,
60            content,
61            code_block_lines: None,
62        };
63
64        // Pre-compute code block lines for better performance
65        index.compute_code_block_lines();
66
67        index
68    }
69
70    pub fn line_col_to_byte_range(&self, line: usize, column: usize) -> Range<usize> {
71        let line = line.saturating_sub(1);
72        let line_start = *self.line_starts.get(line).unwrap_or(&self.content.len());
73
74        let current_line = self.content.lines().nth(line).unwrap_or("");
75        // Column is 1-indexed character position, not byte position
76        let char_col = column.saturating_sub(1);
77        let char_count = current_line.chars().count();
78        let safe_char_col = char_col.min(char_count);
79
80        // Convert character position to byte position
81        let byte_offset = current_line
82            .char_indices()
83            .nth(safe_char_col)
84            .map(|(idx, _)| idx)
85            .unwrap_or(current_line.len());
86
87        let start = line_start + byte_offset;
88        start..start
89    }
90
91    /// Calculate a proper byte range for replacing text with a specific length
92    /// This is the correct function to use for LSP fixes
93    ///
94    /// # Safety
95    /// This function correctly handles multi-byte UTF-8 characters by converting
96    /// character positions (columns) to byte positions.
97    pub fn line_col_to_byte_range_with_length(&self, line: usize, column: usize, length: usize) -> Range<usize> {
98        let line = line.saturating_sub(1);
99        let line_start = *self.line_starts.get(line).unwrap_or(&self.content.len());
100        let line_end = self.line_starts.get(line + 1).copied().unwrap_or(self.content.len());
101        let mut current_line = &self.content[line_start..line_end];
102        if let Some(stripped) = current_line.strip_suffix('\n') {
103            current_line = stripped.strip_suffix('\r').unwrap_or(stripped);
104        }
105        if current_line.is_ascii() {
106            let line_len = current_line.len();
107            let start_byte = column.saturating_sub(1).min(line_len);
108            let end_byte = start_byte.saturating_add(length).min(line_len);
109            let start = line_start + start_byte;
110            let end = line_start + end_byte;
111            return start..end;
112        }
113        // Column is 1-indexed character position, not byte position
114        let char_col = column.saturating_sub(1);
115        let char_count = current_line.chars().count();
116        let safe_char_col = char_col.min(char_count);
117
118        // Convert character positions to byte positions
119        let mut char_indices = current_line.char_indices();
120        let start_byte = char_indices
121            .nth(safe_char_col)
122            .map(|(idx, _)| idx)
123            .unwrap_or(current_line.len());
124
125        // Calculate end position (start + length in characters)
126        let end_char_col = (safe_char_col + length).min(char_count);
127        let end_byte = current_line
128            .char_indices()
129            .nth(end_char_col)
130            .map(|(idx, _)| idx)
131            .unwrap_or(current_line.len());
132
133        let start = line_start + start_byte;
134        let end = line_start + end_byte;
135        start..end
136    }
137
138    /// Calculate byte range for entire line replacement (including newline)
139    /// This is ideal for rules that need to replace complete lines
140    pub fn whole_line_range(&self, line: usize) -> Range<usize> {
141        let line_idx = line.saturating_sub(1);
142        let start = *self.line_starts.get(line_idx).unwrap_or(&self.content.len());
143        let end = self
144            .line_starts
145            .get(line_idx + 1)
146            .copied()
147            .unwrap_or(self.content.len());
148        start..end
149    }
150
151    /// Calculate byte range spanning multiple lines (from start_line to end_line inclusive)
152    /// Both lines are 1-indexed. This is useful for replacing entire blocks like tables.
153    pub fn multi_line_range(&self, start_line: usize, end_line: usize) -> Range<usize> {
154        let start_idx = start_line.saturating_sub(1);
155        let end_idx = end_line.saturating_sub(1);
156
157        let start = *self.line_starts.get(start_idx).unwrap_or(&self.content.len());
158        let end = self.line_starts.get(end_idx + 1).copied().unwrap_or(self.content.len());
159        start..end
160    }
161
162    /// Calculate byte range for text within a line (excluding newline)
163    /// Useful for replacing specific parts of a line
164    ///
165    /// # Safety
166    /// This function correctly handles multi-byte UTF-8 characters by converting
167    /// character positions (columns) to byte positions.
168    pub fn line_text_range(&self, line: usize, start_col: usize, end_col: usize) -> Range<usize> {
169        let line_idx = line.saturating_sub(1);
170        let line_start = *self.line_starts.get(line_idx).unwrap_or(&self.content.len());
171
172        // Get the actual line content to ensure we don't exceed bounds
173        let current_line = self.content.lines().nth(line_idx).unwrap_or("");
174        let char_count = current_line.chars().count();
175
176        // Convert character positions to byte positions
177        let start_char_col = start_col.saturating_sub(1).min(char_count);
178        let end_char_col = end_col.saturating_sub(1).min(char_count);
179
180        let mut char_indices = current_line.char_indices();
181        let start_byte = char_indices
182            .nth(start_char_col)
183            .map(|(idx, _)| idx)
184            .unwrap_or(current_line.len());
185
186        let end_byte = current_line
187            .char_indices()
188            .nth(end_char_col)
189            .map(|(idx, _)| idx)
190            .unwrap_or(current_line.len());
191
192        let start = line_start + start_byte;
193        let end = line_start + end_byte.max(start_byte);
194        start..end
195    }
196
197    /// Calculate byte range from start of line to end of line content (excluding newline)
198    /// Useful for replacing line content while preserving line structure
199    pub fn line_content_range(&self, line: usize) -> Range<usize> {
200        let line_idx = line.saturating_sub(1);
201        let line_start = *self.line_starts.get(line_idx).unwrap_or(&self.content.len());
202
203        let current_line = self.content.lines().nth(line_idx).unwrap_or("");
204        let line_end = line_start + current_line.len();
205        line_start..line_end
206    }
207
208    /// Get the global start byte offset for a given 1-based line number.
209    pub fn get_line_start_byte(&self, line_num: usize) -> Option<usize> {
210        if line_num == 0 {
211            return None; // Lines are 1-based
212        }
213        // line_num is 1-based, line_starts index is 0-based
214        self.line_starts.get(line_num - 1).cloned()
215    }
216
217    /// Check if the line at the given index is within a code block
218    pub fn is_code_block(&self, line: usize) -> bool {
219        if let Some(ref code_block_lines) = self.code_block_lines {
220            code_block_lines.contains(&line)
221        } else {
222            // Fallback to a simpler check if pre-computation wasn't done
223            self.is_code_fence(line)
224        }
225    }
226
227    /// Check if the line is a code fence marker (``` or ~~~)
228    pub fn is_code_fence(&self, line: usize) -> bool {
229        self.content.lines().nth(line).is_some_and(|l| {
230            let trimmed = l.trim();
231            trimmed.starts_with("```") || trimmed.starts_with("~~~")
232        })
233    }
234
235    /// Check if the line is a tilde code fence marker (~~~)
236    pub fn is_tilde_code_block(&self, line: usize) -> bool {
237        self.content
238            .lines()
239            .nth(line)
240            .is_some_and(|l| l.trim().starts_with("~~~"))
241    }
242
243    /// Get a reference to the content
244    pub fn get_content(&self) -> &str {
245        self.content
246    }
247
248    /// Pre-compute which lines are within code blocks for faster lookup
249    fn compute_code_block_lines(&mut self) {
250        let mut code_block_lines = HashSet::new();
251        let lines: Vec<&str> = self.content.lines().collect();
252
253        // Initialize block tracking
254        let mut in_block = false;
255        let mut active_fence_type = ' '; // '`' or '~'
256        let mut block_indent = 0;
257        let mut block_fence_length = 0;
258        let mut in_markdown_block = false;
259        let mut nested_fence_start = None;
260        let mut nested_fence_end = None;
261
262        // Process each line
263        for (i, line) in lines.iter().enumerate() {
264            let trimmed = line.trim();
265            let indent = line.len() - trimmed.len();
266
267            // 1. Detect indented code blocks (4+ columns accounting for tab expansion)
268            if ElementCache::calculate_indentation_width_default(line) >= 4 {
269                code_block_lines.insert(i);
270                continue; // Skip further processing for indented code blocks
271            }
272
273            // 2. Handle fenced code blocks (backticks and tildes)
274            if !in_block {
275                // Check for opening fences
276                if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
277                    let char_type = if trimmed.starts_with("```") { '`' } else { '~' };
278                    let count = trimmed.chars().take_while(|&c| c == char_type).count();
279                    let info_string = if trimmed.len() > count {
280                        trimmed[count..].trim()
281                    } else {
282                        ""
283                    };
284
285                    // Mark the start of a new code block
286                    in_block = true;
287                    active_fence_type = char_type;
288                    block_indent = indent;
289                    block_fence_length = count;
290                    in_markdown_block = info_string == "markdown";
291                    nested_fence_start = None;
292                    nested_fence_end = None;
293
294                    code_block_lines.insert(i);
295                }
296            } else {
297                // We're inside a code block
298                code_block_lines.insert(i);
299
300                // Detection of nested fences in markdown blocks
301                if in_markdown_block && nested_fence_start.is_none() && trimmed.starts_with("```") {
302                    // Check if this looks like a nested fence opening (has content after the backticks)
303                    let count = trimmed.chars().take_while(|&c| c == '`').count();
304                    let remaining = if trimmed.len() > count {
305                        trimmed[count..].trim()
306                    } else {
307                        ""
308                    };
309
310                    if !remaining.is_empty() {
311                        nested_fence_start = Some(i);
312                    }
313                }
314
315                // Check if we've found a nested fence end (only if we have a start)
316                if in_markdown_block
317                    && nested_fence_start.is_some()
318                    && nested_fence_end.is_none()
319                    && trimmed.starts_with("```")
320                    && trimmed.trim_start_matches('`').trim().is_empty()
321                {
322                    nested_fence_end = Some(i);
323                }
324
325                // Check if this line matches the closing fence pattern for the outer block
326                if trimmed.starts_with(&active_fence_type.to_string().repeat(3)) {
327                    let count = trimmed.chars().take_while(|&c| c == active_fence_type).count();
328                    let remaining = if trimmed.len() > count {
329                        trimmed[count..].trim()
330                    } else {
331                        ""
332                    };
333
334                    // A line is a closing fence if:
335                    // 1. It uses the same fence character as the opening fence
336                    // 2. It has at least as many fence characters as the opening fence
337                    // 3. It has no content after the fence characters (except for whitespace)
338                    // 4. Its indentation level is less than or equal to the opening fence
339                    let is_valid_closing_fence =
340                        count >= block_fence_length && remaining.is_empty() && indent <= block_indent;
341
342                    // For nested code blocks in markdown, the first backtick fence after the nested content
343                    // should be recognized as the closing fence for the outer block
344                    let is_nested_closing = nested_fence_end.is_some() && i == nested_fence_end.unwrap();
345
346                    // Skip nested closing fences
347                    if is_valid_closing_fence && !is_nested_closing {
348                        in_block = false;
349                        in_markdown_block = false;
350                    }
351                }
352            }
353        }
354
355        self.code_block_lines = Some(code_block_lines);
356    }
357}
358
/// Build a `(start_line, start_col, end_line, end_col)` tuple for a span that
/// stays on one line and covers `length` columns starting at `start_col`.
pub fn calculate_single_line_range(line: usize, start_col: usize, length: usize) -> (usize, usize, usize, usize) {
    let end_col = start_col + length;
    (line, start_col, line, end_col)
}
363
/// Calculate the range covering a whole line, ignoring trailing whitespace.
///
/// Returns `(line, 1, line, end_col)` where `end_col` is one past the last
/// non-whitespace character. Columns are 1-indexed character positions, so the
/// end column is derived from the character count rather than the byte length;
/// this keeps it correct for lines containing multi-byte UTF-8 characters.
pub fn calculate_line_range(line: usize, line_content: &str) -> (usize, usize, usize, usize) {
    let visible_chars = line_content.trim_end().chars().count();
    (line, 1, line, visible_chars + 1)
}
369
370/// Calculate range from regex match on a line
371///
372/// # Safety
373/// This function safely handles multi-byte UTF-8 characters by ensuring all
374/// string slicing operations occur at valid character boundaries.
375pub fn calculate_match_range(
376    line: usize,
377    line_content: &str,
378    match_start: usize,
379    match_len: usize,
380) -> (usize, usize, usize, usize) {
381    // Bounds check to prevent panic
382    let line_len = line_content.len();
383    if match_start > line_len {
384        // If match_start is beyond line bounds, return a safe range at end of line
385        let char_count = line_content.chars().count();
386        return (line, char_count + 1, line, char_count + 1);
387    }
388
389    // Find safe character boundaries for the match range
390    let safe_match_start = find_char_boundary(line_content, match_start);
391    let safe_match_end_byte = find_char_boundary(line_content, (match_start + match_len).min(line_len));
392
393    // Convert byte positions to character positions safely
394    let char_start = byte_to_char_count(line_content, safe_match_start);
395    let char_len = if safe_match_end_byte > safe_match_start {
396        // Count characters in the safe range
397        line_content[safe_match_start..safe_match_end_byte].chars().count()
398    } else {
399        0
400    };
401    (line, char_start, line, char_start + char_len)
402}
403
404/// Calculate range for trailing content (like trailing spaces)
405///
406/// # Safety
407/// This function safely handles multi-byte UTF-8 characters by ensuring all
408/// string slicing operations occur at valid character boundaries.
409pub fn calculate_trailing_range(line: usize, line_content: &str, content_end: usize) -> (usize, usize, usize, usize) {
410    // Find safe character boundary for content_end
411    let safe_content_end = find_char_boundary(line_content, content_end);
412    let char_content_end = byte_to_char_count(line_content, safe_content_end);
413    let line_char_len = line_content.chars().count() + 1;
414    (line, char_content_end, line, line_char_len)
415}
416
417/// Calculate range for a heading (entire line)
418pub fn calculate_heading_range(line: usize, line_content: &str) -> (usize, usize, usize, usize) {
419    calculate_line_range(line, line_content)
420}
421
422/// Calculate range for emphasis markers and content
423///
424/// # Safety
425/// This function safely handles multi-byte UTF-8 characters by ensuring all
426/// string slicing operations occur at valid character boundaries.
427pub fn calculate_emphasis_range(
428    line: usize,
429    line_content: &str,
430    start_pos: usize,
431    end_pos: usize,
432) -> (usize, usize, usize, usize) {
433    // Find safe character boundaries for start and end positions
434    let safe_start_pos = find_char_boundary(line_content, start_pos);
435    let safe_end_pos = find_char_boundary(line_content, end_pos);
436    let char_start = byte_to_char_count(line_content, safe_start_pos);
437    let char_end = byte_to_char_count(line_content, safe_end_pos);
438    (line, char_start, line, char_end)
439}
440
441/// Calculate range for HTML tags
442pub fn calculate_html_tag_range(
443    line: usize,
444    line_content: &str,
445    tag_start: usize,
446    tag_len: usize,
447) -> (usize, usize, usize, usize) {
448    calculate_match_range(line, line_content, tag_start, tag_len)
449}
450
451/// Calculate range for URLs
452pub fn calculate_url_range(
453    line: usize,
454    line_content: &str,
455    url_start: usize,
456    url_len: usize,
457) -> (usize, usize, usize, usize) {
458    calculate_match_range(line, line_content, url_start, url_len)
459}
460
461/// Calculate range for list markers
462pub fn calculate_list_marker_range(
463    line: usize,
464    line_content: &str,
465    marker_start: usize,
466    marker_len: usize,
467) -> (usize, usize, usize, usize) {
468    calculate_match_range(line, line_content, marker_start, marker_len)
469}
470
/// Calculate the range of the part of a line that exceeds `limit` characters
/// (for example, an over-long line).
///
/// Returns `(line, start_col, line, end_col)` in 1-indexed character columns:
/// the range starts at the first character past the limit and ends one past
/// the last character. When the line is within the limit the range is empty
/// and sits at the end of the line.
pub fn calculate_excess_range(line: usize, line_content: &str, limit: usize) -> (usize, usize, usize, usize) {
    let total_chars = line_content.chars().count();
    let first_excess_col = limit.min(total_chars) + 1;
    (line, first_excess_col, line, total_chars + 1)
}
477
#[cfg(test)]
mod tests {
    use super::*;

    // ------------------------------------------------------------------------
    // Free-standing range helpers (ASCII input)
    // ------------------------------------------------------------------------

    #[test]
    fn test_single_line_range() {
        assert_eq!(calculate_single_line_range(5, 10, 3), (5, 10, 5, 13));
    }

    #[test]
    fn test_line_range() {
        // The two trailing spaces are trimmed: 19 characters, end column 20.
        assert_eq!(calculate_line_range(1, "# This is a heading  "), (1, 1, 1, 20));
    }

    #[test]
    fn test_match_range() {
        // "<div>" starts at byte 5 and is 5 bytes long -> columns 6..11.
        let content = "Text <div>content</div> more";
        assert_eq!(calculate_match_range(1, content, 5, 5), (1, 6, 1, 11));
    }

    #[test]
    fn test_trailing_range() {
        // "Text content" ends at byte 12 and is followed by 3 spaces.
        assert_eq!(calculate_trailing_range(1, "Text content   ", 12), (1, 13, 1, 16));
    }

    #[test]
    fn test_excess_range() {
        // 35 characters with a limit of 20 -> columns 21..36.
        let content = "This line is too long for the limit";
        assert_eq!(calculate_excess_range(1, content, 20), (1, 21, 1, 36));
    }

    // ------------------------------------------------------------------------
    // LineIndex byte ranges (ASCII input)
    // ------------------------------------------------------------------------

    #[test]
    fn test_whole_line_range() {
        let line_index = LineIndex::new("Line 1\nLine 2\nLine 3");

        assert_eq!(line_index.whole_line_range(1), 0..7); // "Line 1\n"
        assert_eq!(line_index.whole_line_range(2), 7..14); // "Line 2\n"
        assert_eq!(line_index.whole_line_range(3), 14..20); // "Line 3" (no newline)
    }

    #[test]
    fn test_line_content_range() {
        let line_index = LineIndex::new("Line 1\nLine 2\nLine 3");

        // The newline is excluded from every range.
        assert_eq!(line_index.line_content_range(1), 0..6); // "Line 1"
        assert_eq!(line_index.line_content_range(2), 7..13); // "Line 2"
        assert_eq!(line_index.line_content_range(3), 14..20); // "Line 3"
    }

    #[test]
    fn test_line_text_range() {
        let line_index = LineIndex::new("Hello world\nAnother line");

        // The end column is exclusive.
        assert_eq!(line_index.line_text_range(1, 1, 5), 0..4); // "Hell"
        assert_eq!(line_index.line_text_range(2, 1, 7), 12..18); // "Anothe"

        // An end column past the line clamps to the end of the line.
        assert_eq!(line_index.line_text_range(1, 1, 100), 0..11); // "Hello world"
    }

    #[test]
    fn test_calculate_match_range_bounds_checking() {
        // Case 1: match_start beyond the line -> empty range at line length + 1.
        assert_eq!(
            calculate_match_range(121, "] not a link [", 57, 10),
            (121, 15, 121, 15)
        );

        // Case 2: match extends beyond the line end -> clamped to length + 1.
        assert_eq!(calculate_match_range(1, "short", 2, 10), (1, 3, 1, 6));

        // Case 3: ordinary match inside the line.
        assert_eq!(calculate_match_range(5, "normal text here", 7, 4), (5, 8, 5, 12));

        // Case 4: zero-length match -> start and end coincide.
        assert_eq!(calculate_match_range(10, "test line", 5, 0), (10, 6, 10, 6));
    }

    // ------------------------------------------------------------------------
    // UTF-8 multi-byte character tests (Issue #154)
    // ------------------------------------------------------------------------

    #[test]
    fn test_issue_154_korean_character_boundary() {
        // Reproduction of issue #154: '후' occupies bytes 18..21 and the
        // original panic was "byte index 19 is not a char boundary".
        let line_content = "- 2023 년 초 이후 주가 상승        +1,000% (10 배 상승)  ";

        // Byte 19 is inside '후'; the call must complete without panicking.
        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 19, 1);

        assert!(start_col > 0);
        assert_eq!((line, end_line), (1, 1));
        assert!(end_col >= start_col);
    }

    #[test]
    fn test_calculate_match_range_korean() {
        // "안녕하세요": five characters of 3 bytes each.
        let line_content = "안녕하세요";

        // Byte 3 is the start of the second character.
        assert_eq!(calculate_match_range(1, line_content, 3, 3), (1, 2, 1, 3));

        // Byte 4 is inside the second character: the start rounds down to it.
        let (line, start_col, end_line, _) = calculate_match_range(1, line_content, 4, 3);
        assert_eq!((line, start_col, end_line), (1, 2, 1));
    }

    #[test]
    fn test_calculate_match_range_chinese() {
        // "你好世界": byte 6 is the start of the third 3-byte character.
        assert_eq!(calculate_match_range(1, "你好世界", 6, 3), (1, 3, 1, 4));
    }

    #[test]
    fn test_calculate_match_range_japanese() {
        // "こんにちは": byte 9 is the start of the fourth 3-byte character.
        assert_eq!(calculate_match_range(1, "こんにちは", 9, 3), (1, 4, 1, 5));
    }

    #[test]
    fn test_calculate_match_range_mixed_unicode() {
        // "Hello " is 6 ASCII bytes; '世' is bytes 6..9 (character 7) and
        // '界' is bytes 9..12 (character 8).
        let line_content = "Hello 世界";

        // The space at byte 5 is character 6.
        assert_eq!(calculate_match_range(1, line_content, 5, 1), (1, 6, 1, 7));

        // '世' at byte 6 is character 7.
        assert_eq!(calculate_match_range(1, line_content, 6, 3), (1, 7, 1, 8));
    }

    #[test]
    fn test_calculate_trailing_range_korean() {
        // Byte 15 is where the five Korean characters end and the spaces begin.
        let (line, start_col, end_line, end_col) = calculate_trailing_range(1, "안녕하세요   ", 15);
        assert_eq!((line, end_line), (1, 1));
        assert!(start_col > 0);
        assert!(end_col > start_col);
    }

    #[test]
    fn test_calculate_emphasis_range_chinese() {
        // Byte 6 is the first '*'; byte 12 falls inside '要'.
        let (line, start_col, end_line, end_col) = calculate_emphasis_range(1, "这是**重要**的", 6, 12);
        assert_eq!((line, end_line), (1, 1));
        assert!(start_col > 0);
        assert!(end_col > start_col);
    }

    #[test]
    fn test_line_col_to_byte_range_korean() {
        // Columns are character positions; each Korean character is 3 bytes.
        let line_index = LineIndex::new("안녕하세요\nWorld");

        assert_eq!(line_index.line_col_to_byte_range(1, 1), 0..0);
        assert_eq!(line_index.line_col_to_byte_range(1, 2), 3..3);
        assert_eq!(line_index.line_col_to_byte_range(1, 3), 6..6);
    }

    #[test]
    fn test_line_col_to_byte_range_with_length_chinese() {
        let line_index = LineIndex::new("你好世界\nTest");

        // Two characters from column 1 -> 6 bytes.
        assert_eq!(line_index.line_col_to_byte_range_with_length(1, 1, 2), 0..6);

        // One character from column 2 -> bytes 3..6.
        assert_eq!(line_index.line_col_to_byte_range_with_length(1, 2, 1), 3..6);
    }

    #[test]
    fn test_line_text_range_japanese() {
        let line_index = LineIndex::new("こんにちは\nHello");

        // Columns 2..4 (end exclusive) cover two 3-byte characters.
        assert_eq!(line_index.line_text_range(1, 2, 4), 3..9);
    }

    #[test]
    fn test_find_char_boundary_edge_cases() {
        // "안녕": characters at bytes 0..3 and 3..6.
        let s = "안녕";

        assert_eq!(find_char_boundary(s, 0), 0); // already a boundary
        assert_eq!(find_char_boundary(s, 1), 0); // inside the first character
        assert_eq!(find_char_boundary(s, 2), 0); // inside the first character
        assert_eq!(find_char_boundary(s, 3), 3); // already a boundary
        assert_eq!(find_char_boundary(s, 4), 3); // inside the second character
        assert_eq!(find_char_boundary(s, 100), s.len()); // past the end
    }

    #[test]
    fn test_byte_to_char_count_unicode() {
        // Each character is 3 bytes, so byte 3k maps to 1-indexed position k + 1.
        let s = "안녕하세요";

        assert_eq!(byte_to_char_count(s, 0), 1);
        assert_eq!(byte_to_char_count(s, 3), 2);
        assert_eq!(byte_to_char_count(s, 6), 3);
        assert_eq!(byte_to_char_count(s, 9), 4);
        assert_eq!(byte_to_char_count(s, 12), 5);
        assert_eq!(byte_to_char_count(s, 15), 6); // end of string: 5 characters + 1
    }

    #[test]
    fn test_all_range_functions_with_emoji() {
        // Emoji are 4-byte UTF-8 characters.
        let line_content = "Hello 🎉 World 🌍";

        // calculate_match_range
        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 6, 4);
        assert_eq!((line, end_line), (1, 1));
        assert!(start_col > 0);
        assert!(end_col > start_col);

        // calculate_trailing_range
        let (line, start_col, end_line, end_col) = calculate_trailing_range(1, line_content, 12);
        assert_eq!((line, end_line), (1, 1));
        assert!(start_col > 0);
        assert!(end_col > start_col);

        // calculate_emphasis_range
        let (line, start_col, end_line, end_col) = calculate_emphasis_range(1, line_content, 0, 5);
        assert_eq!((line, start_col, end_line), (1, 1, 1));
        assert!(end_col > start_col);
    }
}
rumdl_lib/utils/range_utils.rs

rumdl_lib/utils/
range_utils.rs