rumdl_lib/utils/
range_utils.rs

1//! Utilities for position/range conversions
2
3use std::collections::HashSet;
4use std::ops::Range;
5
/// Snap `byte_idx` down to the closest UTF-8 character boundary of `s`.
///
/// Slicing a `&str` at an offset that falls inside a multi-byte character panics,
/// so byte offsets are passed through this helper before any slicing happens.
///
/// # Returns
/// * `s.len()` when `byte_idx` is at or beyond the end of the string,
/// * `byte_idx` itself when it already sits on a character boundary,
/// * otherwise the largest character boundary below `byte_idx`.
fn find_char_boundary(s: &str, byte_idx: usize) -> usize {
    if byte_idx >= s.len() {
        return s.len();
    }

    // Walk downwards from the requested offset; offset 0 is always a boundary,
    // so the search is guaranteed to succeed.
    (0..=byte_idx)
        .rev()
        .find(|&candidate| s.is_char_boundary(candidate))
        .unwrap_or(0)
}
30
31/// Convert a byte index to a character count (1-indexed).
32/// This safely handles multi-byte UTF-8 characters by finding the nearest character boundary.
33fn byte_to_char_count(s: &str, byte_idx: usize) -> usize {
34    let safe_byte_idx = find_char_boundary(s, byte_idx);
35    s[..safe_byte_idx].chars().count() + 1 // 1-indexed
36}
37
/// Line-oriented index over a borrowed document.
///
/// Construction records the byte offset at which every line starts and the set of
/// lines that belong to code blocks, so later position/range queries do not have
/// to rescan the document.
///
/// Note the two indexing conventions used by the methods:
/// * range/offset methods (`line_col_to_byte_range`, `whole_line_range`, ...) take
///   **1-based** line numbers and **1-based character** columns;
/// * the code-block predicates (`is_code_block`, `is_code_fence`,
///   `is_tilde_code_block`) take **0-based** line indices.
#[derive(Debug)]
pub struct LineIndex<'a> {
    /// Byte offset of the first byte of each line; entry 0 is always 0 and a new
    /// entry is added after every `'\n'`.
    line_starts: Vec<usize>,
    /// The indexed document.
    content: &'a str,
    /// 0-based indices of the lines classified as code-block lines.
    code_block_lines: Option<HashSet<usize>>,
}

impl<'a> LineIndex<'a> {
    /// Build the index for `content`, pre-computing line starts and code-block lines.
    pub fn new(content: &'a str) -> Self {
        // A line starts at offset 0 and immediately after every '\n'
        let line_starts: Vec<usize> = std::iter::once(0)
            .chain(content.match_indices('\n').map(|(pos, _)| pos + 1))
            .collect();

        let mut index = Self {
            line_starts,
            content,
            code_block_lines: None,
        };

        // Pre-compute code block lines for better performance
        index.compute_code_block_lines();

        index
    }

    /// Byte offset where the 0-based line `line_idx` starts, or the document
    /// length when there is no such line.
    fn line_start_offset(&self, line_idx: usize) -> usize {
        self.line_starts
            .get(line_idx)
            .copied()
            .unwrap_or(self.content.len())
    }

    /// Text of the 0-based line `line_idx` without its terminator, or `""` when
    /// there is no such line.
    ///
    /// Yields the same text as `self.content.lines().nth(line_idx)` (a trailing
    /// `"\n"` or `"\r\n"` is removed), but uses the pre-computed line starts
    /// instead of rescanning the document from the beginning.
    fn line_text(&self, line_idx: usize) -> &'a str {
        let content: &'a str = self.content;
        let Some(&start) = self.line_starts.get(line_idx) else {
            return "";
        };
        let end = self
            .line_starts
            .get(line_idx + 1)
            .copied()
            .unwrap_or(content.len());
        let raw = &content[start..end];
        match raw.strip_suffix('\n') {
            // A '\r' is only part of the terminator when it directly precedes '\n'
            Some(body) => body.strip_suffix('\r').unwrap_or(body),
            None => raw,
        }
    }

    /// Byte offset inside `text` of the character at 0-based position
    /// `char_offset`, clamped to `text.len()` when the position is past the end.
    fn char_offset_to_byte(text: &str, char_offset: usize) -> usize {
        text.char_indices()
            .nth(char_offset)
            .map_or(text.len(), |(idx, _)| idx)
    }

    /// Split a trimmed line into the length of its leading run of `fence_char`
    /// and the (trimmed) text that follows the run.
    fn split_fence(trimmed: &str, fence_char: char) -> (usize, &str) {
        let rest = trimmed.trim_start_matches(fence_char);
        let run_len = (trimmed.len() - rest.len()) / fence_char.len_utf8();
        (run_len, rest.trim())
    }

    /// Convert a 1-based line / 1-based character column into an empty byte range
    /// (`start..start`) positioned at that character.
    ///
    /// Columns past the end of the line clamp to the end of the line content;
    /// lines past the end of the document clamp to the end of the document.
    pub fn line_col_to_byte_range(&self, line: usize, column: usize) -> Range<usize> {
        let line_idx = line.saturating_sub(1);
        let text = self.line_text(line_idx);

        // Column is a 1-indexed character position, not a byte position
        let byte_offset = Self::char_offset_to_byte(text, column.saturating_sub(1));

        let start = self.line_start_offset(line_idx) + byte_offset;
        start..start
    }

    /// Calculate a proper byte range for replacing text with a specific length.
    /// This is the correct function to use for LSP fixes.
    ///
    /// `line` and `column` are 1-based and `column`/`length` are measured in
    /// characters, so multi-byte UTF-8 text is handled correctly. Both ends of the
    /// range are clamped to the end of the line content; an arbitrarily large
    /// `length` therefore selects "to end of line" instead of overflowing.
    pub fn line_col_to_byte_range_with_length(&self, line: usize, column: usize, length: usize) -> Range<usize> {
        let line_idx = line.saturating_sub(1);
        let line_start = self.line_start_offset(line_idx);
        let text = self.line_text(line_idx);

        let start_char = column.saturating_sub(1);
        // saturating_add: a huge `length` must clamp to the line end, not wrap around
        let end_char = start_char.saturating_add(length);

        let start_byte = Self::char_offset_to_byte(text, start_char);
        let end_byte = Self::char_offset_to_byte(text, end_char);

        (line_start + start_byte)..(line_start + end_byte)
    }

    /// Calculate byte range for entire line replacement (including newline).
    /// This is ideal for rules that need to replace complete lines.
    ///
    /// `line` is 1-based. The last line has no trailing newline to include, so its
    /// range ends at the end of the document.
    pub fn whole_line_range(&self, line: usize) -> Range<usize> {
        let line_idx = line.saturating_sub(1);
        let start = self.line_start_offset(line_idx);
        let end = self.line_start_offset(line_idx + 1);
        start..end
    }

    /// Calculate byte range for text within a line (excluding newline).
    /// Useful for replacing specific parts of a line.
    ///
    /// `line`, `start_col` and `end_col` are 1-based; columns are character
    /// positions and `end_col` is exclusive. Columns are clamped to the line
    /// content, and an `end_col` before `start_col` yields an empty range at
    /// `start_col`.
    pub fn line_text_range(&self, line: usize, start_col: usize, end_col: usize) -> Range<usize> {
        let line_idx = line.saturating_sub(1);
        let line_start = self.line_start_offset(line_idx);
        let text = self.line_text(line_idx);

        let start_byte = Self::char_offset_to_byte(text, start_col.saturating_sub(1));
        let end_byte = Self::char_offset_to_byte(text, end_col.saturating_sub(1));

        // Never produce an inverted range
        (line_start + start_byte)..(line_start + end_byte.max(start_byte))
    }

    /// Calculate byte range from start of line to end of line content (excluding newline).
    /// Useful for replacing line content while preserving line structure.
    ///
    /// `line` is 1-based.
    pub fn line_content_range(&self, line: usize) -> Range<usize> {
        let line_idx = line.saturating_sub(1);
        let line_start = self.line_start_offset(line_idx);
        line_start..(line_start + self.line_text(line_idx).len())
    }

    /// Get the global start byte offset for a given 1-based line number.
    ///
    /// Returns `None` for line 0 (lines are 1-based) and for lines past the end of
    /// the document.
    pub fn get_line_start_byte(&self, line_num: usize) -> Option<usize> {
        // line_num is 1-based, line_starts index is 0-based
        let line_idx = line_num.checked_sub(1)?;
        self.line_starts.get(line_idx).copied()
    }

    /// Check if the line at the given **0-based** index is within a code block.
    ///
    /// Uses the set computed at construction time; if that set is absent it falls
    /// back to checking whether the line itself is a fence marker.
    pub fn is_code_block(&self, line: usize) -> bool {
        match &self.code_block_lines {
            Some(code_block_lines) => code_block_lines.contains(&line),
            // Fallback to a simpler check if pre-computation wasn't done
            None => self.is_code_fence(line),
        }
    }

    /// Check if the line at the given **0-based** index is a code fence marker
    /// (``` or ~~~), ignoring surrounding whitespace.
    pub fn is_code_fence(&self, line: usize) -> bool {
        let trimmed = self.line_text(line).trim();
        trimmed.starts_with("```") || trimmed.starts_with("~~~")
    }

    /// Check if the line at the given **0-based** index is a tilde code fence
    /// marker (~~~), ignoring surrounding whitespace.
    pub fn is_tilde_code_block(&self, line: usize) -> bool {
        self.line_text(line).trim().starts_with("~~~")
    }

    /// Get a reference to the content
    pub fn get_content(&self) -> &str {
        self.content
    }

    /// Pre-compute which lines are within code blocks for faster lookup.
    ///
    /// A line is recorded (by 0-based index) when it either starts with four
    /// spaces or a tab (indented code), or lies between an opening and a closing
    /// fence, fences included. Inside a fence opened with the info string
    /// `markdown`, one nested backtick-fenced block is tolerated: the bare fence
    /// that closes the nested block does not close the outer block.
    fn compute_code_block_lines(&mut self) {
        let mut code_block_lines = HashSet::new();

        // Fence tracking state
        let mut in_block = false;
        let mut active_fence_type = ' '; // '`' or '~' once a block is open
        let mut block_indent = 0;
        let mut block_fence_length = 0;
        let mut in_markdown_block = false;
        let mut nested_fence_start: Option<usize> = None;
        let mut nested_fence_end: Option<usize> = None;

        for (i, line) in self.content.lines().enumerate() {
            let trimmed = line.trim();
            // Leading whitespace only: trailing whitespace after a fence must not
            // count as indentation, otherwise "```  " could never close a block.
            let indent = line.len() - line.trim_start().len();

            // 1. Detect indented code blocks (independent of fenced code blocks)
            if line.starts_with("    ") || line.starts_with('\t') {
                code_block_lines.insert(i);
                continue; // Skip further processing for indented code blocks
            }

            // 2. Handle fenced code blocks (backticks and tildes)
            if !in_block {
                let opening_char = if trimmed.starts_with("```") {
                    Some('`')
                } else if trimmed.starts_with("~~~") {
                    Some('~')
                } else {
                    None
                };

                if let Some(fence_char) = opening_char {
                    let (count, info_string) = Self::split_fence(trimmed, fence_char);

                    // Mark the start of a new code block
                    in_block = true;
                    active_fence_type = fence_char;
                    block_indent = indent;
                    block_fence_length = count;
                    in_markdown_block = info_string == "markdown";
                    nested_fence_start = None;
                    nested_fence_end = None;

                    code_block_lines.insert(i);
                }
                continue;
            }

            // We're inside a code block
            code_block_lines.insert(i);

            // Track one nested backtick fence inside a markdown block: its opening
            // line has text after the backticks, its closing line does not.
            if in_markdown_block && trimmed.starts_with("```") {
                let (_, remaining) = Self::split_fence(trimmed, '`');
                if nested_fence_start.is_none() {
                    if !remaining.is_empty() {
                        nested_fence_start = Some(i);
                    }
                } else if nested_fence_end.is_none() && remaining.is_empty() {
                    nested_fence_end = Some(i);
                }
            }

            // Check if this line matches the closing fence pattern for the outer block
            let (count, remaining) = Self::split_fence(trimmed, active_fence_type);
            if count >= 3 {
                // A line is a closing fence if:
                // 1. It uses the same fence character as the opening fence
                // 2. It has at least as many fence characters as the opening fence
                // 3. It has no content after the fence characters (except for whitespace)
                // 4. Its indentation level is less than or equal to the opening fence
                let is_valid_closing_fence =
                    count >= block_fence_length && remaining.is_empty() && indent <= block_indent;

                // The fence that closes the nested block must not close the outer one
                let is_nested_closing = nested_fence_end == Some(i);

                if is_valid_closing_fence && !is_nested_closing {
                    in_block = false;
                    in_markdown_block = false;
                }
            }
        }

        self.code_block_lines = Some(code_block_lines);
    }
}
335
/// Build a `(start_line, start_col, end_line, end_col)` tuple for a span that
/// stays on a single line.
///
/// Both line entries are `line`; the end column is `start_col + length`.
pub fn calculate_single_line_range(line: usize, start_col: usize, length: usize) -> (usize, usize, usize, usize) {
    let end_col = start_col + length;
    (line, start_col, line, end_col)
}
340
/// Calculate range for entire line.
///
/// Returns `(line, 1, line, end_col)` where `end_col` is one past the last
/// character that is not trailing whitespace. Columns are 1-indexed *character*
/// positions, so the trimmed length is counted in characters rather than bytes;
/// counting bytes would overshoot on any line containing multi-byte UTF-8 text.
pub fn calculate_line_range(line: usize, line_content: &str) -> (usize, usize, usize, usize) {
    let trimmed_char_len = line_content.trim_end().chars().count();
    (line, 1, line, trimmed_char_len + 1)
}
346
347/// Calculate range from regex match on a line
348///
349/// # Safety
350/// This function safely handles multi-byte UTF-8 characters by ensuring all
351/// string slicing operations occur at valid character boundaries.
352pub fn calculate_match_range(
353    line: usize,
354    line_content: &str,
355    match_start: usize,
356    match_len: usize,
357) -> (usize, usize, usize, usize) {
358    // Bounds check to prevent panic
359    let line_len = line_content.len();
360    if match_start > line_len {
361        // If match_start is beyond line bounds, return a safe range at end of line
362        let char_count = line_content.chars().count();
363        return (line, char_count + 1, line, char_count + 1);
364    }
365
366    // Find safe character boundaries for the match range
367    let safe_match_start = find_char_boundary(line_content, match_start);
368    let safe_match_end_byte = find_char_boundary(line_content, (match_start + match_len).min(line_len));
369
370    // Convert byte positions to character positions safely
371    let char_start = byte_to_char_count(line_content, safe_match_start);
372    let char_len = if safe_match_end_byte > safe_match_start {
373        // Count characters in the safe range
374        line_content[safe_match_start..safe_match_end_byte].chars().count()
375    } else {
376        0
377    };
378    (line, char_start, line, char_start + char_len)
379}
380
381/// Calculate range for trailing content (like trailing spaces)
382///
383/// # Safety
384/// This function safely handles multi-byte UTF-8 characters by ensuring all
385/// string slicing operations occur at valid character boundaries.
386pub fn calculate_trailing_range(line: usize, line_content: &str, content_end: usize) -> (usize, usize, usize, usize) {
387    // Find safe character boundary for content_end
388    let safe_content_end = find_char_boundary(line_content, content_end);
389    let char_content_end = byte_to_char_count(line_content, safe_content_end);
390    let line_char_len = line_content.chars().count() + 1;
391    (line, char_content_end, line, line_char_len)
392}
393
394/// Calculate range for a heading (entire line)
395pub fn calculate_heading_range(line: usize, line_content: &str) -> (usize, usize, usize, usize) {
396    calculate_line_range(line, line_content)
397}
398
399/// Calculate range for emphasis markers and content
400///
401/// # Safety
402/// This function safely handles multi-byte UTF-8 characters by ensuring all
403/// string slicing operations occur at valid character boundaries.
404pub fn calculate_emphasis_range(
405    line: usize,
406    line_content: &str,
407    start_pos: usize,
408    end_pos: usize,
409) -> (usize, usize, usize, usize) {
410    // Find safe character boundaries for start and end positions
411    let safe_start_pos = find_char_boundary(line_content, start_pos);
412    let safe_end_pos = find_char_boundary(line_content, end_pos);
413    let char_start = byte_to_char_count(line_content, safe_start_pos);
414    let char_end = byte_to_char_count(line_content, safe_end_pos);
415    (line, char_start, line, char_end)
416}
417
418/// Calculate range for HTML tags
419pub fn calculate_html_tag_range(
420    line: usize,
421    line_content: &str,
422    tag_start: usize,
423    tag_len: usize,
424) -> (usize, usize, usize, usize) {
425    calculate_match_range(line, line_content, tag_start, tag_len)
426}
427
428/// Calculate range for URLs
429pub fn calculate_url_range(
430    line: usize,
431    line_content: &str,
432    url_start: usize,
433    url_len: usize,
434) -> (usize, usize, usize, usize) {
435    calculate_match_range(line, line_content, url_start, url_len)
436}
437
438/// Calculate range for list markers
439pub fn calculate_list_marker_range(
440    line: usize,
441    line_content: &str,
442    marker_start: usize,
443    marker_len: usize,
444) -> (usize, usize, usize, usize) {
445    calculate_match_range(line, line_content, marker_start, marker_len)
446}
447
/// Calculate range that exceeds a limit (like line length).
///
/// `limit` is measured in characters. The range starts at the first character
/// after the limit (clamped to the end of the line) and ends one past the last
/// character, so a line within the limit yields an empty range at its end.
pub fn calculate_excess_range(line: usize, line_content: &str, limit: usize) -> (usize, usize, usize, usize) {
    let total_chars = line_content.chars().count();
    let start_col = limit.min(total_chars) + 1;
    let end_col = total_chars + 1;
    (line, start_col, line, end_col)
}
454
#[cfg(test)]
mod tests {
    use super::*;

    // All `calculate_*` helpers return `(start_line, start_col, end_line, end_col)`.

    #[test]
    fn test_single_line_range() {
        assert_eq!(calculate_single_line_range(5, 10, 3), (5, 10, 5, 13));
    }

    #[test]
    fn test_line_range() {
        // 19 visible characters followed by two trailing spaces
        let heading = "# This is a heading  ";
        assert_eq!(calculate_line_range(1, heading), (1, 1, 1, 20));
    }

    #[test]
    fn test_match_range() {
        // "<div>" starts at byte 5 and is 5 bytes long
        let text = "Text <div>content</div> more";
        assert_eq!(calculate_match_range(1, text, 5, 5), (1, 6, 1, 11));
    }

    #[test]
    fn test_trailing_range() {
        // "Text content" ends at byte 12 and is followed by 3 trailing spaces
        let text = "Text content   ";
        assert_eq!(calculate_trailing_range(1, text, 12), (1, 13, 1, 16));
    }

    #[test]
    fn test_excess_range() {
        // 35 characters, limit 20: columns 21 through 36 (exclusive) are in excess
        let text = "This line is too long for the limit";
        assert_eq!(calculate_excess_range(1, text, 20), (1, 21, 1, 36));
    }

    #[test]
    fn test_whole_line_range() {
        let line_index = LineIndex::new("Line 1\nLine 2\nLine 3");

        // The first two lines include their newline; the last line has none
        let expected = [(1, 0..7), (2, 7..14), (3, 14..20)];
        for (line, range) in expected {
            assert_eq!(line_index.whole_line_range(line), range, "line {line}");
        }
    }

    #[test]
    fn test_line_content_range() {
        let line_index = LineIndex::new("Line 1\nLine 2\nLine 3");

        // Newlines are never part of the content range
        let expected = [(1, 0..6), (2, 7..13), (3, 14..20)];
        for (line, range) in expected {
            assert_eq!(line_index.line_content_range(line), range, "line {line}");
        }
    }

    #[test]
    fn test_line_text_range() {
        let line_index = LineIndex::new("Hello world\nAnother line");

        // "Hell" on the first line
        assert_eq!(line_index.line_text_range(1, 1, 5), 0..4);
        // "Anothe" on the second line
        assert_eq!(line_index.line_text_range(2, 1, 7), 12..18);
        // An end column past the line clamps to the end of "Hello world"
        assert_eq!(line_index.line_text_range(1, 1, 100), 0..11);
    }

    #[test]
    fn test_calculate_match_range_bounds_checking() {
        // match_start beyond the line: empty range just past the last character
        assert_eq!(calculate_match_range(121, "] not a link [", 57, 10), (121, 15, 121, 15));

        // Match running past the line end: end column clamps to length + 1
        assert_eq!(calculate_match_range(1, "short", 2, 10), (1, 3, 1, 6));

        // Ordinary in-bounds match
        assert_eq!(calculate_match_range(5, "normal text here", 7, 4), (5, 8, 5, 12));

        // Zero-length match: start and end columns coincide
        assert_eq!(calculate_match_range(10, "test line", 5, 0), (10, 6, 10, 6));
    }

    // ============================================================================
    // UTF-8 Multi-byte Character Tests (Issue #154)
    // ============================================================================

    #[test]
    fn test_issue_154_korean_character_boundary() {
        // Reproduction of issue #154: '후' occupies bytes 18..21, and the original
        // failure was "byte index 19 is not a char boundary; it is inside '후'".
        let line_content = "- 2023 년 초 이후 주가 상승        +1,000% (10 배 상승)  ";

        // Byte 19 is inside '후'; this must not panic
        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 19, 1);

        assert!(start_col > 0);
        assert_eq!(line, 1);
        assert_eq!(end_line, 1);
        assert!(end_col >= start_col);
    }

    #[test]
    fn test_calculate_match_range_korean() {
        // Every character of "안녕하세요" is 3 bytes long
        let line_content = "안녕하세요";

        // Byte 3 is the start of the second character
        assert_eq!(calculate_match_range(1, line_content, 3, 3), (1, 2, 1, 3));

        // Byte 4 is inside the second character: the start rounds down to it
        let (line, start_col, end_line, _end_col) = calculate_match_range(1, line_content, 4, 3);
        assert_eq!(line, 1);
        assert_eq!(start_col, 2);
        assert_eq!(end_line, 1);
    }

    #[test]
    fn test_calculate_match_range_chinese() {
        // Every character of "你好世界" is 3 bytes long; byte 6 starts the third one
        assert_eq!(calculate_match_range(1, "你好世界", 6, 3), (1, 3, 1, 4));
    }

    #[test]
    fn test_calculate_match_range_japanese() {
        // Every character of "こんにちは" is 3 bytes long; byte 9 starts the fourth one
        assert_eq!(calculate_match_range(1, "こんにちは", 9, 3), (1, 4, 1, 5));
    }

    #[test]
    fn test_calculate_match_range_mixed_unicode() {
        // "Hello " is 6 single-byte characters; "世" is bytes 6..9 (character 7)
        // and "界" is bytes 9..12 (character 8).
        let line_content = "Hello 世界";

        // The space at byte 5 is character 6
        assert_eq!(calculate_match_range(1, line_content, 5, 1), (1, 6, 1, 7));

        // "世" at byte 6 is character 7
        assert_eq!(calculate_match_range(1, line_content, 6, 3), (1, 7, 1, 8));
    }

    #[test]
    fn test_calculate_trailing_range_korean() {
        // Five 3-byte characters, so byte 15 is where the trailing spaces begin
        let line_content = "안녕하세요   ";
        let (line, start_col, end_line, end_col) = calculate_trailing_range(1, line_content, 15);
        assert_eq!(line, 1);
        assert!(start_col > 0);
        assert_eq!(end_line, 1);
        assert!(end_col > start_col);
    }

    #[test]
    fn test_calculate_emphasis_range_chinese() {
        // Byte 6 is the first '*'; byte 12 falls inside '要' (bytes 11..14)
        let line_content = "这是**重要**的";
        let (line, start_col, end_line, end_col) = calculate_emphasis_range(1, line_content, 6, 12);
        assert_eq!(line, 1);
        assert!(start_col > 0);
        assert_eq!(end_line, 1);
        assert!(end_col > start_col);
    }

    #[test]
    fn test_line_col_to_byte_range_korean() {
        // Columns are character positions; each Korean character is 3 bytes
        let line_index = LineIndex::new("안녕하세요\nWorld");

        let expected = [(1, 0..0), (2, 3..3), (3, 6..6)];
        for (column, range) in expected {
            assert_eq!(line_index.line_col_to_byte_range(1, column), range, "column {column}");
        }
    }

    #[test]
    fn test_line_col_to_byte_range_with_length_chinese() {
        let line_index = LineIndex::new("你好世界\nTest");

        // Column 1, length 2: the first two 3-byte characters
        assert_eq!(line_index.line_col_to_byte_range_with_length(1, 1, 2), 0..6);

        // Column 2, length 1: only the second character
        assert_eq!(line_index.line_col_to_byte_range_with_length(1, 2, 1), 3..6);
    }

    #[test]
    fn test_line_text_range_japanese() {
        let line_index = LineIndex::new("こんにちは\nHello");

        // Columns 2..4 (end exclusive) cover the second and third characters,
        // i.e. two 3-byte characters at bytes 3..9
        assert_eq!(line_index.line_text_range(1, 2, 4), 3..9);
    }

    #[test]
    fn test_find_char_boundary_edge_cases() {
        // Two 3-byte characters: boundaries are at bytes 0, 3 and 6
        let s = "안녕";

        let expected = [
            (0, 0),         // start of the string
            (1, 0),         // inside the first character
            (2, 0),         // inside the first character
            (3, 3),         // start of the second character
            (4, 3),         // inside the second character
            (100, s.len()), // past the end clamps to the length
        ];
        for (byte_idx, boundary) in expected {
            assert_eq!(find_char_boundary(s, byte_idx), boundary, "byte {byte_idx}");
        }
    }

    #[test]
    fn test_byte_to_char_count_unicode() {
        // Five 3-byte characters; results are 1-indexed character positions
        let s = "안녕하세요";

        let expected = [(0, 1), (3, 2), (6, 3), (9, 4), (12, 5), (15, 6)];
        for (byte_idx, position) in expected {
            assert_eq!(byte_to_char_count(s, byte_idx), position, "byte {byte_idx}");
        }
    }

    #[test]
    fn test_all_range_functions_with_emoji() {
        // Emoji are 4-byte UTF-8 characters
        let line_content = "Hello 🎉 World 🌍";

        // calculate_match_range
        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 6, 4);
        assert_eq!(line, 1);
        assert!(start_col > 0);
        assert_eq!(end_line, 1);
        assert!(end_col > start_col);

        // calculate_trailing_range
        let (line, start_col, end_line, end_col) = calculate_trailing_range(1, line_content, 12);
        assert_eq!(line, 1);
        assert!(start_col > 0);
        assert_eq!(end_line, 1);
        assert!(end_col > start_col);

        // calculate_emphasis_range
        let (line, start_col, end_line, end_col) = calculate_emphasis_range(1, line_content, 0, 5);
        assert_eq!(line, 1);
        assert_eq!(start_col, 1);
        assert_eq!(end_line, 1);
        assert!(end_col > start_col);
    }
}
rumdl_lib/utils/range_utils.rs

rumdl_lib/utils/
range_utils.rs