rumdl_lib/utils/
range_utils.rs

1//! Utilities for position/range conversions
2
3use crate::utils::element_cache::ElementCache;
4use std::collections::HashSet;
5use std::ops::Range;
6
/// Clamp `byte_idx` to a position in `s` at which slicing is valid.
///
/// * An index at or beyond the end of `s` yields `s.len()`.
/// * An index that already sits on a UTF-8 character boundary is returned unchanged.
/// * An index inside a multi-byte character is moved back to the first byte of that
///   character.
///
/// The result is therefore always usable as a slice bound for `s` without panicking.
fn find_char_boundary(s: &str, byte_idx: usize) -> usize {
    let clamped = byte_idx.min(s.len());

    // Walk backwards from the clamped index. Offset 0 is always a boundary, so the
    // search cannot come up empty; the fallback only satisfies the type checker.
    (0..=clamped)
        .rev()
        .find(|&candidate| s.is_char_boundary(candidate))
        .unwrap_or(0)
}
31
32/// Convert a byte index to a character count (1-indexed).
33/// This safely handles multi-byte UTF-8 characters by finding the nearest character boundary.
34fn byte_to_char_count(s: &str, byte_idx: usize) -> usize {
35    let safe_byte_idx = find_char_boundary(s, byte_idx);
36    s[..safe_byte_idx].chars().count() + 1 // 1-indexed
37}
38
39#[derive(Debug)]
40pub struct LineIndex<'a> {
41    line_starts: Vec<usize>,
42    content: &'a str,
43    code_block_lines: Option<HashSet<usize>>,
44}
45
46impl<'a> LineIndex<'a> {
47    pub fn new(content: &'a str) -> Self {
48        let mut line_starts = vec![0];
49        let mut pos = 0;
50
51        for c in content.chars() {
52            pos += c.len_utf8();
53            if c == '\n' {
54                line_starts.push(pos);
55            }
56        }
57
58        let mut index = Self {
59            line_starts,
60            content,
61            code_block_lines: None,
62        };
63
64        // Pre-compute code block lines for better performance
65        index.compute_code_block_lines();
66
67        index
68    }
69
70    pub fn line_col_to_byte_range(&self, line: usize, column: usize) -> Range<usize> {
71        let line = line.saturating_sub(1);
72        let line_start = *self.line_starts.get(line).unwrap_or(&self.content.len());
73
74        let current_line = self.content.lines().nth(line).unwrap_or("");
75        // Column is 1-indexed character position, not byte position
76        let char_col = column.saturating_sub(1);
77        let char_count = current_line.chars().count();
78        let safe_char_col = char_col.min(char_count);
79
80        // Convert character position to byte position
81        let byte_offset = current_line
82            .char_indices()
83            .nth(safe_char_col)
84            .map(|(idx, _)| idx)
85            .unwrap_or(current_line.len());
86
87        let start = line_start + byte_offset;
88        start..start
89    }
90
91    /// Calculate a proper byte range for replacing text with a specific length
92    /// This is the correct function to use for LSP fixes
93    ///
94    /// # Safety
95    /// This function correctly handles multi-byte UTF-8 characters by converting
96    /// character positions (columns) to byte positions.
97    pub fn line_col_to_byte_range_with_length(&self, line: usize, column: usize, length: usize) -> Range<usize> {
98        let line = line.saturating_sub(1);
99        let line_start = *self.line_starts.get(line).unwrap_or(&self.content.len());
100
101        let current_line = self.content.lines().nth(line).unwrap_or("");
102        // Column is 1-indexed character position, not byte position
103        let char_col = column.saturating_sub(1);
104        let char_count = current_line.chars().count();
105        let safe_char_col = char_col.min(char_count);
106
107        // Convert character positions to byte positions
108        let mut char_indices = current_line.char_indices();
109        let start_byte = char_indices
110            .nth(safe_char_col)
111            .map(|(idx, _)| idx)
112            .unwrap_or(current_line.len());
113
114        // Calculate end position (start + length in characters)
115        let end_char_col = (safe_char_col + length).min(char_count);
116        let end_byte = current_line
117            .char_indices()
118            .nth(end_char_col)
119            .map(|(idx, _)| idx)
120            .unwrap_or(current_line.len());
121
122        let start = line_start + start_byte;
123        let end = line_start + end_byte;
124        start..end
125    }
126
127    /// Calculate byte range for entire line replacement (including newline)
128    /// This is ideal for rules that need to replace complete lines
129    pub fn whole_line_range(&self, line: usize) -> Range<usize> {
130        let line_idx = line.saturating_sub(1);
131        let start = *self.line_starts.get(line_idx).unwrap_or(&self.content.len());
132        let end = self
133            .line_starts
134            .get(line_idx + 1)
135            .copied()
136            .unwrap_or(self.content.len());
137        start..end
138    }
139
140    /// Calculate byte range spanning multiple lines (from start_line to end_line inclusive)
141    /// Both lines are 1-indexed. This is useful for replacing entire blocks like tables.
142    pub fn multi_line_range(&self, start_line: usize, end_line: usize) -> Range<usize> {
143        let start_idx = start_line.saturating_sub(1);
144        let end_idx = end_line.saturating_sub(1);
145
146        let start = *self.line_starts.get(start_idx).unwrap_or(&self.content.len());
147        let end = self.line_starts.get(end_idx + 1).copied().unwrap_or(self.content.len());
148        start..end
149    }
150
151    /// Calculate byte range for text within a line (excluding newline)
152    /// Useful for replacing specific parts of a line
153    ///
154    /// # Safety
155    /// This function correctly handles multi-byte UTF-8 characters by converting
156    /// character positions (columns) to byte positions.
157    pub fn line_text_range(&self, line: usize, start_col: usize, end_col: usize) -> Range<usize> {
158        let line_idx = line.saturating_sub(1);
159        let line_start = *self.line_starts.get(line_idx).unwrap_or(&self.content.len());
160
161        // Get the actual line content to ensure we don't exceed bounds
162        let current_line = self.content.lines().nth(line_idx).unwrap_or("");
163        let char_count = current_line.chars().count();
164
165        // Convert character positions to byte positions
166        let start_char_col = start_col.saturating_sub(1).min(char_count);
167        let end_char_col = end_col.saturating_sub(1).min(char_count);
168
169        let mut char_indices = current_line.char_indices();
170        let start_byte = char_indices
171            .nth(start_char_col)
172            .map(|(idx, _)| idx)
173            .unwrap_or(current_line.len());
174
175        let end_byte = current_line
176            .char_indices()
177            .nth(end_char_col)
178            .map(|(idx, _)| idx)
179            .unwrap_or(current_line.len());
180
181        let start = line_start + start_byte;
182        let end = line_start + end_byte.max(start_byte);
183        start..end
184    }
185
186    /// Calculate byte range from start of line to end of line content (excluding newline)
187    /// Useful for replacing line content while preserving line structure
188    pub fn line_content_range(&self, line: usize) -> Range<usize> {
189        let line_idx = line.saturating_sub(1);
190        let line_start = *self.line_starts.get(line_idx).unwrap_or(&self.content.len());
191
192        let current_line = self.content.lines().nth(line_idx).unwrap_or("");
193        let line_end = line_start + current_line.len();
194        line_start..line_end
195    }
196
197    /// Get the global start byte offset for a given 1-based line number.
198    pub fn get_line_start_byte(&self, line_num: usize) -> Option<usize> {
199        if line_num == 0 {
200            return None; // Lines are 1-based
201        }
202        // line_num is 1-based, line_starts index is 0-based
203        self.line_starts.get(line_num - 1).cloned()
204    }
205
206    /// Check if the line at the given index is within a code block
207    pub fn is_code_block(&self, line: usize) -> bool {
208        if let Some(ref code_block_lines) = self.code_block_lines {
209            code_block_lines.contains(&line)
210        } else {
211            // Fallback to a simpler check if pre-computation wasn't done
212            self.is_code_fence(line)
213        }
214    }
215
216    /// Check if the line is a code fence marker (``` or ~~~)
217    pub fn is_code_fence(&self, line: usize) -> bool {
218        self.content.lines().nth(line).is_some_and(|l| {
219            let trimmed = l.trim();
220            trimmed.starts_with("```") || trimmed.starts_with("~~~")
221        })
222    }
223
224    /// Check if the line is a tilde code fence marker (~~~)
225    pub fn is_tilde_code_block(&self, line: usize) -> bool {
226        self.content
227            .lines()
228            .nth(line)
229            .is_some_and(|l| l.trim().starts_with("~~~"))
230    }
231
232    /// Get a reference to the content
233    pub fn get_content(&self) -> &str {
234        self.content
235    }
236
237    /// Pre-compute which lines are within code blocks for faster lookup
238    fn compute_code_block_lines(&mut self) {
239        let mut code_block_lines = HashSet::new();
240        let lines: Vec<&str> = self.content.lines().collect();
241
242        // Initialize block tracking
243        let mut in_block = false;
244        let mut active_fence_type = ' '; // '`' or '~'
245        let mut block_indent = 0;
246        let mut block_fence_length = 0;
247        let mut in_markdown_block = false;
248        let mut nested_fence_start = None;
249        let mut nested_fence_end = None;
250
251        // Process each line
252        for (i, line) in lines.iter().enumerate() {
253            let trimmed = line.trim();
254            let indent = line.len() - trimmed.len();
255
256            // 1. Detect indented code blocks (4+ columns accounting for tab expansion)
257            if ElementCache::calculate_indentation_width_default(line) >= 4 {
258                code_block_lines.insert(i);
259                continue; // Skip further processing for indented code blocks
260            }
261
262            // 2. Handle fenced code blocks (backticks and tildes)
263            if !in_block {
264                // Check for opening fences
265                if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
266                    let char_type = if trimmed.starts_with("```") { '`' } else { '~' };
267                    let count = trimmed.chars().take_while(|&c| c == char_type).count();
268                    let info_string = if trimmed.len() > count {
269                        trimmed[count..].trim()
270                    } else {
271                        ""
272                    };
273
274                    // Mark the start of a new code block
275                    in_block = true;
276                    active_fence_type = char_type;
277                    block_indent = indent;
278                    block_fence_length = count;
279                    in_markdown_block = info_string == "markdown";
280                    nested_fence_start = None;
281                    nested_fence_end = None;
282
283                    code_block_lines.insert(i);
284                }
285            } else {
286                // We're inside a code block
287                code_block_lines.insert(i);
288
289                // Detection of nested fences in markdown blocks
290                if in_markdown_block && nested_fence_start.is_none() && trimmed.starts_with("```") {
291                    // Check if this looks like a nested fence opening (has content after the backticks)
292                    let count = trimmed.chars().take_while(|&c| c == '`').count();
293                    let remaining = if trimmed.len() > count {
294                        trimmed[count..].trim()
295                    } else {
296                        ""
297                    };
298
299                    if !remaining.is_empty() {
300                        nested_fence_start = Some(i);
301                    }
302                }
303
304                // Check if we've found a nested fence end (only if we have a start)
305                if in_markdown_block
306                    && nested_fence_start.is_some()
307                    && nested_fence_end.is_none()
308                    && trimmed.starts_with("```")
309                    && trimmed.trim_start_matches('`').trim().is_empty()
310                {
311                    nested_fence_end = Some(i);
312                }
313
314                // Check if this line matches the closing fence pattern for the outer block
315                if trimmed.starts_with(&active_fence_type.to_string().repeat(3)) {
316                    let count = trimmed.chars().take_while(|&c| c == active_fence_type).count();
317                    let remaining = if trimmed.len() > count {
318                        trimmed[count..].trim()
319                    } else {
320                        ""
321                    };
322
323                    // A line is a closing fence if:
324                    // 1. It uses the same fence character as the opening fence
325                    // 2. It has at least as many fence characters as the opening fence
326                    // 3. It has no content after the fence characters (except for whitespace)
327                    // 4. Its indentation level is less than or equal to the opening fence
328                    let is_valid_closing_fence =
329                        count >= block_fence_length && remaining.is_empty() && indent <= block_indent;
330
331                    // For nested code blocks in markdown, the first backtick fence after the nested content
332                    // should be recognized as the closing fence for the outer block
333                    let is_nested_closing = nested_fence_end.is_some() && i == nested_fence_end.unwrap();
334
335                    // Skip nested closing fences
336                    if is_valid_closing_fence && !is_nested_closing {
337                        in_block = false;
338                        in_markdown_block = false;
339                    }
340                }
341            }
342        }
343
344        self.code_block_lines = Some(code_block_lines);
345    }
346}
347
/// Build a `(start_line, start_col, end_line, end_col)` tuple for a span that stays on
/// a single line.
///
/// The span begins at `start_col` and its end column is `start_col + length`; both the
/// start and end line are `line`.
pub fn calculate_single_line_range(line: usize, start_col: usize, length: usize) -> (usize, usize, usize, usize) {
    let end_col = start_col + length;
    (line, start_col, line, end_col)
}
352
/// Calculate the range covering a whole line, ignoring trailing whitespace.
///
/// Returns `(line, 1, line, end_col)` where `end_col` is the 1-indexed character column
/// just past the last non-whitespace character. Columns are counted in characters, not
/// bytes, so lines containing multi-byte UTF-8 text produce the same kind of column as
/// the other range helpers in this module.
pub fn calculate_line_range(line: usize, line_content: &str) -> (usize, usize, usize, usize) {
    let visible_chars = line_content.trim_end().chars().count();
    (line, 1, line, visible_chars + 1)
}
358
359/// Calculate range from regex match on a line
360///
361/// # Safety
362/// This function safely handles multi-byte UTF-8 characters by ensuring all
363/// string slicing operations occur at valid character boundaries.
364pub fn calculate_match_range(
365    line: usize,
366    line_content: &str,
367    match_start: usize,
368    match_len: usize,
369) -> (usize, usize, usize, usize) {
370    // Bounds check to prevent panic
371    let line_len = line_content.len();
372    if match_start > line_len {
373        // If match_start is beyond line bounds, return a safe range at end of line
374        let char_count = line_content.chars().count();
375        return (line, char_count + 1, line, char_count + 1);
376    }
377
378    // Find safe character boundaries for the match range
379    let safe_match_start = find_char_boundary(line_content, match_start);
380    let safe_match_end_byte = find_char_boundary(line_content, (match_start + match_len).min(line_len));
381
382    // Convert byte positions to character positions safely
383    let char_start = byte_to_char_count(line_content, safe_match_start);
384    let char_len = if safe_match_end_byte > safe_match_start {
385        // Count characters in the safe range
386        line_content[safe_match_start..safe_match_end_byte].chars().count()
387    } else {
388        0
389    };
390    (line, char_start, line, char_start + char_len)
391}
392
393/// Calculate range for trailing content (like trailing spaces)
394///
395/// # Safety
396/// This function safely handles multi-byte UTF-8 characters by ensuring all
397/// string slicing operations occur at valid character boundaries.
398pub fn calculate_trailing_range(line: usize, line_content: &str, content_end: usize) -> (usize, usize, usize, usize) {
399    // Find safe character boundary for content_end
400    let safe_content_end = find_char_boundary(line_content, content_end);
401    let char_content_end = byte_to_char_count(line_content, safe_content_end);
402    let line_char_len = line_content.chars().count() + 1;
403    (line, char_content_end, line, line_char_len)
404}
405
406/// Calculate range for a heading (entire line)
407pub fn calculate_heading_range(line: usize, line_content: &str) -> (usize, usize, usize, usize) {
408    calculate_line_range(line, line_content)
409}
410
411/// Calculate range for emphasis markers and content
412///
413/// # Safety
414/// This function safely handles multi-byte UTF-8 characters by ensuring all
415/// string slicing operations occur at valid character boundaries.
416pub fn calculate_emphasis_range(
417    line: usize,
418    line_content: &str,
419    start_pos: usize,
420    end_pos: usize,
421) -> (usize, usize, usize, usize) {
422    // Find safe character boundaries for start and end positions
423    let safe_start_pos = find_char_boundary(line_content, start_pos);
424    let safe_end_pos = find_char_boundary(line_content, end_pos);
425    let char_start = byte_to_char_count(line_content, safe_start_pos);
426    let char_end = byte_to_char_count(line_content, safe_end_pos);
427    (line, char_start, line, char_end)
428}
429
430/// Calculate range for HTML tags
431pub fn calculate_html_tag_range(
432    line: usize,
433    line_content: &str,
434    tag_start: usize,
435    tag_len: usize,
436) -> (usize, usize, usize, usize) {
437    calculate_match_range(line, line_content, tag_start, tag_len)
438}
439
440/// Calculate range for URLs
441pub fn calculate_url_range(
442    line: usize,
443    line_content: &str,
444    url_start: usize,
445    url_len: usize,
446) -> (usize, usize, usize, usize) {
447    calculate_match_range(line, line_content, url_start, url_len)
448}
449
450/// Calculate range for list markers
451pub fn calculate_list_marker_range(
452    line: usize,
453    line_content: &str,
454    marker_start: usize,
455    marker_len: usize,
456) -> (usize, usize, usize, usize) {
457    calculate_match_range(line, line_content, marker_start, marker_len)
458}
459
/// Calculate the column range of the part of a line that exceeds `limit` characters.
///
/// Returns `(line, start_col, line, end_col)` in 1-indexed character columns, where
/// `start_col` is the first column past the limit and `end_col` is one past the last
/// character. When the line has at most `limit` characters the range is empty and sits
/// at the end of the line.
pub fn calculate_excess_range(line: usize, line_content: &str, limit: usize) -> (usize, usize, usize, usize) {
    let total_chars = line_content.chars().count();
    let first_excess_col = limit.min(total_chars) + 1;
    (line, first_excess_col, line, total_chars + 1)
}
466
#[cfg(test)]
mod tests {
    //! Unit tests for the range helpers and `LineIndex`.
    //!
    //! Columns in expected values are 1-indexed character positions; byte ranges are
    //! half-open (`start..end`).

    use super::*;

    #[test]
    fn test_single_line_range() {
        let (start_line, start_col, end_line, end_col) = calculate_single_line_range(5, 10, 3);
        assert_eq!(start_line, 5);
        assert_eq!(start_col, 10);
        assert_eq!(end_line, 5);
        assert_eq!(end_col, 13);
    }

    #[test]
    fn test_line_range() {
        let content = "# This is a heading  ";
        let (start_line, start_col, end_line, end_col) = calculate_line_range(1, content);
        assert_eq!(start_line, 1);
        assert_eq!(start_col, 1);
        assert_eq!(end_line, 1);
        assert_eq!(end_col, 20); // Trimmed length + 1
    }

    #[test]
    fn test_match_range() {
        let content = "Text <div>content</div> more";
        let tag_start = 5; // Position of '<'
        let tag_len = 5; // Length of "<div>"
        let (start_line, start_col, end_line, end_col) = calculate_match_range(1, content, tag_start, tag_len);
        assert_eq!(start_line, 1);
        assert_eq!(start_col, 6); // 1-indexed
        assert_eq!(end_line, 1);
        assert_eq!(end_col, 11); // 6 + 5
    }

    #[test]
    fn test_trailing_range() {
        let content = "Text content   "; // 3 trailing spaces
        let content_end = 12; // End of "Text content"
        let (start_line, start_col, end_line, end_col) = calculate_trailing_range(1, content, content_end);
        assert_eq!(start_line, 1);
        assert_eq!(start_col, 13); // content_end + 1 (1-indexed)
        assert_eq!(end_line, 1);
        assert_eq!(end_col, 16); // Total length + 1
    }

    #[test]
    fn test_excess_range() {
        let content = "This line is too long for the limit";
        let limit = 20;
        let (start_line, start_col, end_line, end_col) = calculate_excess_range(1, content, limit);
        assert_eq!(start_line, 1);
        assert_eq!(start_col, 21); // limit + 1
        assert_eq!(end_line, 1);
        assert_eq!(end_col, 36); // Total length + 1 (35 chars + 1 = 36)
    }

    #[test]
    fn test_whole_line_range() {
        let content = "Line 1\nLine 2\nLine 3";
        let line_index = LineIndex::new(content);

        // Test first line (includes newline)
        let range = line_index.whole_line_range(1);
        assert_eq!(range, 0..7); // "Line 1\n"

        // Test middle line
        let range = line_index.whole_line_range(2);
        assert_eq!(range, 7..14); // "Line 2\n"

        // Test last line (no newline)
        let range = line_index.whole_line_range(3);
        assert_eq!(range, 14..20); // "Line 3"
    }

    #[test]
    fn test_line_content_range() {
        let content = "Line 1\nLine 2\nLine 3";
        let line_index = LineIndex::new(content);

        // Test first line content (excludes newline)
        let range = line_index.line_content_range(1);
        assert_eq!(range, 0..6); // "Line 1"

        // Test middle line content
        let range = line_index.line_content_range(2);
        assert_eq!(range, 7..13); // "Line 2"

        // Test last line content
        let range = line_index.line_content_range(3);
        assert_eq!(range, 14..20); // "Line 3"
    }

    #[test]
    fn test_line_text_range() {
        let content = "Hello world\nAnother line";
        let line_index = LineIndex::new(content);

        // Partial text in first line; the end column is exclusive
        let range = line_index.line_text_range(1, 1, 5); // "Hell"
        assert_eq!(range, 0..4);

        // Partial text in second line; the end column is exclusive
        let range = line_index.line_text_range(2, 1, 7); // "Anothe"
        assert_eq!(range, 12..18);

        // Test bounds checking
        let range = line_index.line_text_range(1, 1, 100); // Should clamp to line end
        assert_eq!(range, 0..11); // "Hello world"
    }

    #[test]
    fn test_calculate_match_range_bounds_checking() {
        // Test case 1: match_start beyond line bounds
        let line_content = "] not a link [";
        let (line, start_col, end_line, end_col) = calculate_match_range(121, line_content, 57, 10);
        assert_eq!(line, 121);
        assert_eq!(start_col, 15); // line length + 1
        assert_eq!(end_line, 121);
        assert_eq!(end_col, 15); // same as start when out of bounds

        // Test case 2: match extends beyond line end
        let line_content = "short";
        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 2, 10);
        assert_eq!(line, 1);
        assert_eq!(start_col, 3); // position 2 + 1
        assert_eq!(end_line, 1);
        assert_eq!(end_col, 6); // clamped to line length + 1

        // Test case 3: normal case within bounds
        let line_content = "normal text here";
        let (line, start_col, end_line, end_col) = calculate_match_range(5, line_content, 7, 4);
        assert_eq!(line, 5);
        assert_eq!(start_col, 8); // position 7 + 1
        assert_eq!(end_line, 5);
        assert_eq!(end_col, 12); // position 7 + 4 + 1

        // Test case 4: zero length match
        let line_content = "test line";
        let (line, start_col, end_line, end_col) = calculate_match_range(10, line_content, 5, 0);
        assert_eq!(line, 10);
        assert_eq!(start_col, 6); // position 5 + 1
        assert_eq!(end_line, 10);
        assert_eq!(end_col, 6); // same as start for zero length
    }

    // ============================================================================
    // UTF-8 Multi-byte Character Tests (Issue #154)
    // ============================================================================

    #[test]
    fn test_issue_154_korean_character_boundary() {
        // Exact reproduction of issue #154: Korean character '후' (3 bytes: 18..21)
        // The error was: "byte index 19 is not a char boundary; it is inside '후'"
        let line_content = "- 2023 년 초 이후 주가 상승        +1,000% (10 배 상승)  ";

        // Test match at byte 19 (middle of '후' character)
        // This should not panic and should find the nearest character boundary
        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 19, 1);

        // Should successfully calculate without panicking
        assert!(start_col > 0);
        assert_eq!(line, 1);
        assert_eq!(end_line, 1);
        assert!(end_col >= start_col);
    }

    #[test]
    fn test_calculate_match_range_korean() {
        // Korean text: "안녕하세요" (Hello in Korean)
        // Each character is 3 bytes
        let line_content = "안녕하세요";
        // Match at byte 3 (start of second character)
        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 3, 3);
        assert_eq!(line, 1);
        assert_eq!(start_col, 2); // Second character (1-indexed)
        assert_eq!(end_line, 1);
        assert_eq!(end_col, 3); // End of second character

        // Match at byte 4 (middle of second character): the start rounds down to byte 3
        // and the end (byte 7) rounds down to byte 6, leaving exactly one character
        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 4, 3);
        assert_eq!(line, 1);
        assert_eq!(start_col, 2); // Should round to start of character
        assert_eq!(end_line, 1);
        assert_eq!(end_col, 3); // One character after rounding both ends down
    }

    #[test]
    fn test_calculate_match_range_chinese() {
        // Chinese text: "你好世界" (Hello World)
        // Each character is 3 bytes
        let line_content = "你好世界";
        // Match at byte 6 (start of third character)
        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 6, 3);
        assert_eq!(line, 1);
        assert_eq!(start_col, 3); // Third character (1-indexed)
        assert_eq!(end_line, 1);
        assert_eq!(end_col, 4); // End of third character
    }

    #[test]
    fn test_calculate_match_range_japanese() {
        // Japanese text: "こんにちは" (Hello)
        // Each character is 3 bytes
        let line_content = "こんにちは";
        // Match at byte 9 (start of fourth character)
        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 9, 3);
        assert_eq!(line, 1);
        assert_eq!(start_col, 4); // Fourth character (1-indexed)
        assert_eq!(end_line, 1);
        assert_eq!(end_col, 5); // End of fourth character
    }

    #[test]
    fn test_calculate_match_range_mixed_unicode() {
        // Mixed ASCII and CJK: "Hello 世界"
        // "Hello " = 6 bytes (H, e, l, l, o, space)
        // "世" = bytes 6-8 (3 bytes), character 7
        // "界" = bytes 9-11 (3 bytes), character 8
        let line_content = "Hello 世界";

        // Match at byte 5 (space character)
        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 5, 1);
        assert_eq!(line, 1);
        assert_eq!(start_col, 6); // Space character (1-indexed: H=1, e=2, l=3, l=4, o=5, space=6)
        assert_eq!(end_line, 1);
        assert_eq!(end_col, 7); // After space

        // Match at byte 6 (start of first Chinese character "世")
        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 6, 3);
        assert_eq!(line, 1);
        assert_eq!(start_col, 7); // First Chinese character (1-indexed)
        assert_eq!(end_line, 1);
        assert_eq!(end_col, 8); // End of first Chinese character
    }

    #[test]
    fn test_calculate_trailing_range_korean() {
        // Korean text (5 characters, 15 bytes) followed by 3 trailing spaces
        let line_content = "안녕하세요   ";
        // content_end at byte 15: directly after the last Korean character
        let (line, start_col, end_line, end_col) = calculate_trailing_range(1, line_content, 15);
        assert_eq!(line, 1);
        assert_eq!(start_col, 6); // 5 characters precede the trailing spaces
        assert_eq!(end_line, 1);
        assert_eq!(end_col, 9); // 8 characters + 1
    }

    #[test]
    fn test_calculate_emphasis_range_chinese() {
        // Chinese text with emphasis markers
        let line_content = "这是**重要**的";
        // start_pos 6 is the first '*'; end_pos 12 falls inside '要' (bytes 11..14)
        // and is rounded down to the start of that character
        let (line, start_col, end_line, end_col) = calculate_emphasis_range(1, line_content, 6, 12);
        assert_eq!(line, 1);
        assert_eq!(start_col, 3); // Two characters precede the first '*'
        assert_eq!(end_line, 1);
        assert_eq!(end_col, 6); // Column of '要', the character containing byte 12
    }

    #[test]
    fn test_line_col_to_byte_range_korean() {
        // Test that column positions (character positions) are correctly converted to byte positions
        let content = "안녕하세요\nWorld";
        let line_index = LineIndex::new(content);

        // Column 1 (first character)
        let range = line_index.line_col_to_byte_range(1, 1);
        assert_eq!(range, 0..0);

        // Column 2 (second character)
        let range = line_index.line_col_to_byte_range(1, 2);
        assert_eq!(range, 3..3); // 3 bytes for first character

        // Column 3 (third character)
        let range = line_index.line_col_to_byte_range(1, 3);
        assert_eq!(range, 6..6); // 6 bytes for first two characters
    }

    #[test]
    fn test_line_col_to_byte_range_with_length_chinese() {
        // Test byte range calculation with length for Chinese characters
        let content = "你好世界\nTest";
        let line_index = LineIndex::new(content);

        // Column 1, length 2 (first two Chinese characters)
        let range = line_index.line_col_to_byte_range_with_length(1, 1, 2);
        assert_eq!(range, 0..6); // 6 bytes for two 3-byte characters

        // Column 2, length 1 (second Chinese character)
        let range = line_index.line_col_to_byte_range_with_length(1, 2, 1);
        assert_eq!(range, 3..6); // Bytes 3-6 for second character
    }

    #[test]
    fn test_line_text_range_japanese() {
        // Test text range calculation for Japanese characters
        let content = "こんにちは\nHello";
        let line_index = LineIndex::new(content);

        // Columns 2..4 with an exclusive end: the second and third Japanese characters
        let range = line_index.line_text_range(1, 2, 4);
        assert_eq!(range, 3..9); // Bytes 3..9 cover two 3-byte characters
    }

    #[test]
    fn test_find_char_boundary_edge_cases() {
        // Test the helper function directly
        let s = "안녕";

        // Byte 0 (start) - should be valid
        assert_eq!(find_char_boundary(s, 0), 0);

        // Byte 1 (middle of first character) - should round down to 0
        assert_eq!(find_char_boundary(s, 1), 0);

        // Byte 2 (middle of first character) - should round down to 0
        assert_eq!(find_char_boundary(s, 2), 0);

        // Byte 3 (start of second character) - should be valid
        assert_eq!(find_char_boundary(s, 3), 3);

        // Byte 4 (middle of second character) - should round down to 3
        assert_eq!(find_char_boundary(s, 4), 3);

        // Byte beyond string length - should return string length
        assert_eq!(find_char_boundary(s, 100), s.len());
    }

    #[test]
    fn test_byte_to_char_count_unicode() {
        // Test byte offset -> 1-indexed character position with multi-byte characters
        let s = "안녕하세요";

        // Byte 0 (start of first character) - position 1
        assert_eq!(byte_to_char_count(s, 0), 1);

        // Byte 3 (start of second character) - position 2
        assert_eq!(byte_to_char_count(s, 3), 2);

        // Byte 6 (start of third character) - position 3
        assert_eq!(byte_to_char_count(s, 6), 3);

        // Byte 9 (start of fourth character) - position 4
        assert_eq!(byte_to_char_count(s, 9), 4);

        // Byte 12 (start of fifth character) - position 5
        assert_eq!(byte_to_char_count(s, 12), 5);

        // Byte 15 (end of string) - position 6 (5 characters + 1 for 1-indexed)
        assert_eq!(byte_to_char_count(s, 15), 6);
    }

    #[test]
    fn test_all_range_functions_with_emoji() {
        // Test with emoji (4-byte UTF-8 characters)
        let line_content = "Hello 🎉 World 🌍";

        // calculate_match_range
        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 6, 4);
        assert_eq!(line, 1);
        assert!(start_col > 0);
        assert_eq!(end_line, 1);
        assert!(end_col > start_col);

        // calculate_trailing_range
        let (line, start_col, end_line, end_col) = calculate_trailing_range(1, line_content, 12);
        assert_eq!(line, 1);
        assert!(start_col > 0);
        assert_eq!(end_line, 1);
        assert!(end_col > start_col);

        // calculate_emphasis_range over the ASCII prefix "Hello" (bytes 0..5)
        let (line, start_col, end_line, end_col) = calculate_emphasis_range(1, line_content, 0, 5);
        assert_eq!(line, 1);
        assert_eq!(start_col, 1);
        assert_eq!(end_line, 1);
        assert_eq!(end_col, 6); // Five ASCII characters precede byte 5
    }
}