rumdl_lib/utils/
range_utils.rs

1//! Utilities for position/range conversions
2
3use std::collections::HashSet;
4use std::ops::Range;
5
/// Round `byte_idx` down to the closest UTF-8 character boundary of `s`.
///
/// Slicing a `&str` at an offset that falls inside a multi-byte character
/// panics, so callers use this to obtain an offset that is always safe to
/// slice at.
///
/// Returns `s.len()` when `byte_idx` is at or past the end of the string,
/// `byte_idx` itself when it already sits on a boundary, and otherwise the
/// start offset of the character that contains `byte_idx`.
fn find_char_boundary(s: &str, byte_idx: usize) -> usize {
    if byte_idx >= s.len() {
        return s.len();
    }

    // Walk backwards from `byte_idx`; offset 0 is always a boundary, so the
    // search cannot fail and the fallback is never actually used.
    (0..=byte_idx)
        .rev()
        .find(|&candidate| s.is_char_boundary(candidate))
        .unwrap_or(0)
}
30
31/// Convert a byte index to a character count (1-indexed).
32/// This safely handles multi-byte UTF-8 characters by finding the nearest character boundary.
33fn byte_to_char_count(s: &str, byte_idx: usize) -> usize {
34    let safe_byte_idx = find_char_boundary(s, byte_idx);
35    s[..safe_byte_idx].chars().count() + 1 // 1-indexed
36}
37
/// Line-oriented index over a borrowed document.
///
/// Stores the byte offset at which every line starts so that 1-indexed
/// line/column positions (columns are counted in characters, not bytes) can
/// be converted into byte ranges, and records which lines belong to code
/// blocks.
#[derive(Debug)]
pub struct LineIndex<'a> {
    /// Byte offset of the first byte of each line; the first entry is always `0`
    /// and one entry is added after every `'\n'`.
    line_starts: Vec<usize>,
    /// The document being indexed.
    content: &'a str,
    /// 0-indexed numbers of the lines detected as code-block lines.
    code_block_lines: Option<HashSet<usize>>,
}

impl<'a> LineIndex<'a> {
    /// Build an index for `content`, recording line starts and code-block lines.
    pub fn new(content: &'a str) -> Self {
        // A new line starts right after every '\n' (which is a single byte).
        let line_starts = std::iter::once(0)
            .chain(content.match_indices('\n').map(|(newline_at, _)| newline_at + 1))
            .collect();

        let mut index = Self {
            line_starts,
            content,
            code_block_lines: None,
        };

        // Pre-compute code block lines so `is_code_block` is a set lookup.
        index.compute_code_block_lines();

        index
    }

    /// Byte offset where the 0-indexed line `line_idx` starts, or the content
    /// length when the line does not exist.
    fn line_start(&self, line_idx: usize) -> usize {
        self.line_starts
            .get(line_idx)
            .copied()
            .unwrap_or(self.content.len())
    }

    /// Text of the 0-indexed line `line_idx` without its line terminator, or
    /// `""` when the line does not exist.
    ///
    /// Uses `line_starts` for an O(1) lookup instead of re-scanning the
    /// document. The terminator handling mirrors `str::lines`: a trailing
    /// `"\n"` or `"\r\n"` is removed, a bare `'\r'` is kept.
    fn line_text(&self, line_idx: usize) -> &'a str {
        let content: &'a str = self.content;
        let Some(&start) = self.line_starts.get(line_idx) else {
            return "";
        };
        let end = self
            .line_starts
            .get(line_idx + 1)
            .copied()
            .unwrap_or(content.len());
        let raw = &content[start..end];
        match raw.strip_suffix('\n') {
            Some(body) => body.strip_suffix('\r').unwrap_or(body),
            None => raw,
        }
    }

    /// Byte offset inside `line` of the character at 0-indexed position
    /// `char_col`; positions at or past the end map to `line.len()`.
    fn char_col_to_byte(line: &str, char_col: usize) -> usize {
        line.char_indices()
            .nth(char_col)
            .map_or(line.len(), |(byte_idx, _)| byte_idx)
    }

    /// Convert a 1-indexed line and 1-indexed character column into an empty
    /// byte range located at that position.
    ///
    /// Columns past the end of the line are clamped to the end of the line
    /// text; a line past the end of the document yields an empty range at the
    /// end of the content.
    pub fn line_col_to_byte_range(&self, line: usize, column: usize) -> Range<usize> {
        let line_idx = line.saturating_sub(1);
        let offset = Self::char_col_to_byte(self.line_text(line_idx), column.saturating_sub(1));
        let start = self.line_start(line_idx) + offset;
        start..start
    }

    /// Calculate a proper byte range for replacing text with a specific length.
    /// This is the correct function to use for LSP fixes.
    ///
    /// `line` and `column` are 1-indexed and `column`/`length` are measured in
    /// characters, so multi-byte UTF-8 text is handled correctly. Both ends
    /// are clamped to the end of the line text (the line terminator is never
    /// included), and an arbitrarily large `length` is safe.
    pub fn line_col_to_byte_range_with_length(&self, line: usize, column: usize, length: usize) -> Range<usize> {
        let line_idx = line.saturating_sub(1);
        let text = self.line_text(line_idx);
        let first_char = column.saturating_sub(1);

        let start_byte = Self::char_col_to_byte(text, first_char);
        // Saturate so that a huge `length` clamps to the line end instead of
        // overflowing.
        let end_byte = Self::char_col_to_byte(text, first_char.saturating_add(length));

        let line_start = self.line_start(line_idx);
        (line_start + start_byte)..(line_start + end_byte)
    }

    /// Calculate byte range for entire line replacement (including newline).
    /// This is ideal for rules that need to replace complete lines.
    ///
    /// `line` is 1-indexed; the last line ends at the end of the content.
    pub fn whole_line_range(&self, line: usize) -> Range<usize> {
        let line_idx = line.saturating_sub(1);
        self.line_start(line_idx)..self.line_start(line_idx + 1)
    }

    /// Calculate byte range spanning multiple lines (from `start_line` to
    /// `end_line` inclusive, terminators included). Both lines are 1-indexed.
    /// This is useful for replacing entire blocks like tables.
    ///
    /// If `end_line` precedes `start_line` the result is an empty range at the
    /// start of `start_line` rather than a reversed range.
    pub fn multi_line_range(&self, start_line: usize, end_line: usize) -> Range<usize> {
        let start = self.line_start(start_line.saturating_sub(1));
        // `saturating_sub(1) + 1` cannot overflow: the subtraction leaves room.
        let end = self.line_start(end_line.saturating_sub(1) + 1).max(start);
        start..end
    }

    /// Calculate byte range for text within a line (excluding newline).
    /// Useful for replacing specific parts of a line.
    ///
    /// `line`, `start_col` and `end_col` are 1-indexed; the columns are
    /// character positions and `end_col` is exclusive. Columns are clamped to
    /// the line text and the end is never placed before the start.
    pub fn line_text_range(&self, line: usize, start_col: usize, end_col: usize) -> Range<usize> {
        let line_idx = line.saturating_sub(1);
        let text = self.line_text(line_idx);

        let start_byte = Self::char_col_to_byte(text, start_col.saturating_sub(1));
        let end_byte = Self::char_col_to_byte(text, end_col.saturating_sub(1)).max(start_byte);

        let line_start = self.line_start(line_idx);
        (line_start + start_byte)..(line_start + end_byte)
    }

    /// Calculate byte range from start of line to end of line content
    /// (excluding newline). Useful for replacing line content while
    /// preserving line structure. `line` is 1-indexed.
    pub fn line_content_range(&self, line: usize) -> Range<usize> {
        let line_idx = line.saturating_sub(1);
        let line_start = self.line_start(line_idx);
        line_start..line_start + self.line_text(line_idx).len()
    }

    /// Get the global start byte offset for a given 1-based line number.
    ///
    /// Returns `None` for line `0` and for lines past the end of the document.
    pub fn get_line_start_byte(&self, line_num: usize) -> Option<usize> {
        // Lines are 1-based while `line_starts` is 0-based.
        let line_idx = line_num.checked_sub(1)?;
        self.line_starts.get(line_idx).copied()
    }

    /// Check if the line at the given 0-based index is within a code block.
    pub fn is_code_block(&self, line: usize) -> bool {
        match &self.code_block_lines {
            Some(code_block_lines) => code_block_lines.contains(&line),
            // Fallback to a simpler check if pre-computation wasn't done.
            None => self.is_code_fence(line),
        }
    }

    /// Check if the line at the given 0-based index is a code fence marker
    /// (``` or ~~~).
    pub fn is_code_fence(&self, line: usize) -> bool {
        let trimmed = self.line_text(line).trim();
        trimmed.starts_with("```") || trimmed.starts_with("~~~")
    }

    /// Check if the line at the given 0-based index is a tilde code fence
    /// marker (~~~).
    pub fn is_tilde_code_block(&self, line: usize) -> bool {
        self.line_text(line).trim().starts_with("~~~")
    }

    /// Get a reference to the content.
    pub fn get_content(&self) -> &str {
        self.content
    }

    /// Pre-compute which lines (0-indexed) are within code blocks for faster
    /// lookup.
    ///
    /// Lines starting with four spaces or a tab are always recorded as
    /// indented code. Fenced blocks are tracked with a small state machine;
    /// inside a block whose info string is exactly `markdown`, the first
    /// nested backtick fence pair is not allowed to close the outer block.
    fn compute_code_block_lines(&mut self) {
        let content = self.content;
        let mut code_block_lines = HashSet::new();

        // State of the fenced block currently open, if any.
        let mut in_block = false;
        let mut active_fence_type = ' '; // '`' or '~'
        let mut block_indent = 0;
        let mut block_fence_length = 0;
        let mut in_markdown_block = false;
        let mut nested_fence_start: Option<usize> = None;
        let mut nested_fence_end: Option<usize> = None;

        for (i, line) in content.lines().enumerate() {
            // 1. Indented code blocks (independent of fenced code blocks).
            if line.starts_with("    ") || line.starts_with('\t') {
                code_block_lines.insert(i);
                continue;
            }

            let trimmed = line.trim();
            // Only *leading* whitespace is indentation; trailing whitespace
            // after a fence must not influence the indentation comparison.
            let indent = line.len() - line.trim_start().len();

            // 2. Fenced code blocks (backticks and tildes).
            if !in_block {
                let fence_char = if trimmed.starts_with("```") {
                    Some('`')
                } else if trimmed.starts_with("~~~") {
                    Some('~')
                } else {
                    None
                };

                if let Some(fence_char) = fence_char {
                    // Fence characters are ASCII, so the char count is also a byte offset.
                    let count = trimmed.chars().take_while(|&c| c == fence_char).count();
                    let info_string = trimmed[count..].trim();

                    in_block = true;
                    active_fence_type = fence_char;
                    block_indent = indent;
                    block_fence_length = count;
                    in_markdown_block = info_string == "markdown";
                    nested_fence_start = None;
                    nested_fence_end = None;

                    code_block_lines.insert(i);
                }
                continue;
            }

            // We're inside a fenced block: the line belongs to it, including
            // the closing fence itself.
            code_block_lines.insert(i);

            // Track the first nested backtick fence pair inside a markdown block.
            if in_markdown_block && trimmed.starts_with("```") {
                let after_ticks = trimmed.trim_start_matches('`').trim();
                if nested_fence_start.is_none() {
                    // A fence followed by an info string opens a nested block.
                    if !after_ticks.is_empty() {
                        nested_fence_start = Some(i);
                    }
                } else if nested_fence_end.is_none() && after_ticks.is_empty() {
                    // The first bare fence after that closes the nested block.
                    nested_fence_end = Some(i);
                }
            }

            // A line closes the outer block if:
            // 1. It uses the same fence character as the opening fence
            // 2. It has at least as many fence characters as the opening fence
            //    (which is always three or more)
            // 3. It has no content after the fence characters (except for whitespace)
            // 4. Its indentation level is less than or equal to the opening fence
            // ...and it is not the fence that just closed a nested block.
            let fence_run = trimmed.chars().take_while(|&c| c == active_fence_type).count();
            let is_valid_closing_fence = fence_run >= 3
                && fence_run >= block_fence_length
                && trimmed[fence_run..].trim().is_empty()
                && indent <= block_indent;
            let is_nested_closing = nested_fence_end == Some(i);

            if is_valid_closing_fence && !is_nested_closing {
                in_block = false;
                in_markdown_block = false;
            }
        }

        self.code_block_lines = Some(code_block_lines);
    }
}
346
/// Build a `(start_line, start_col, end_line, end_col)` tuple for a range
/// that stays on one line and spans `length` columns from `start_col`.
pub fn calculate_single_line_range(line: usize, start_col: usize, length: usize) -> (usize, usize, usize, usize) {
    let end_col = start_col + length;
    (line, start_col, line, end_col)
}
351
/// Calculate range for entire line, excluding trailing whitespace.
///
/// Returns `(line, 1, line, end_col)` where `end_col` is the 1-indexed
/// character column just past the last non-whitespace character. The column
/// is counted in characters (not bytes) so that it agrees with the other
/// range helpers in this module for multi-byte UTF-8 text.
pub fn calculate_line_range(line: usize, line_content: &str) -> (usize, usize, usize, usize) {
    let visible_chars = line_content.trim_end().chars().count();
    (line, 1, line, visible_chars + 1)
}
357
358/// Calculate range from regex match on a line
359///
360/// # Safety
361/// This function safely handles multi-byte UTF-8 characters by ensuring all
362/// string slicing operations occur at valid character boundaries.
363pub fn calculate_match_range(
364    line: usize,
365    line_content: &str,
366    match_start: usize,
367    match_len: usize,
368) -> (usize, usize, usize, usize) {
369    // Bounds check to prevent panic
370    let line_len = line_content.len();
371    if match_start > line_len {
372        // If match_start is beyond line bounds, return a safe range at end of line
373        let char_count = line_content.chars().count();
374        return (line, char_count + 1, line, char_count + 1);
375    }
376
377    // Find safe character boundaries for the match range
378    let safe_match_start = find_char_boundary(line_content, match_start);
379    let safe_match_end_byte = find_char_boundary(line_content, (match_start + match_len).min(line_len));
380
381    // Convert byte positions to character positions safely
382    let char_start = byte_to_char_count(line_content, safe_match_start);
383    let char_len = if safe_match_end_byte > safe_match_start {
384        // Count characters in the safe range
385        line_content[safe_match_start..safe_match_end_byte].chars().count()
386    } else {
387        0
388    };
389    (line, char_start, line, char_start + char_len)
390}
391
392/// Calculate range for trailing content (like trailing spaces)
393///
394/// # Safety
395/// This function safely handles multi-byte UTF-8 characters by ensuring all
396/// string slicing operations occur at valid character boundaries.
397pub fn calculate_trailing_range(line: usize, line_content: &str, content_end: usize) -> (usize, usize, usize, usize) {
398    // Find safe character boundary for content_end
399    let safe_content_end = find_char_boundary(line_content, content_end);
400    let char_content_end = byte_to_char_count(line_content, safe_content_end);
401    let line_char_len = line_content.chars().count() + 1;
402    (line, char_content_end, line, line_char_len)
403}
404
405/// Calculate range for a heading (entire line)
406pub fn calculate_heading_range(line: usize, line_content: &str) -> (usize, usize, usize, usize) {
407    calculate_line_range(line, line_content)
408}
409
410/// Calculate range for emphasis markers and content
411///
412/// # Safety
413/// This function safely handles multi-byte UTF-8 characters by ensuring all
414/// string slicing operations occur at valid character boundaries.
415pub fn calculate_emphasis_range(
416    line: usize,
417    line_content: &str,
418    start_pos: usize,
419    end_pos: usize,
420) -> (usize, usize, usize, usize) {
421    // Find safe character boundaries for start and end positions
422    let safe_start_pos = find_char_boundary(line_content, start_pos);
423    let safe_end_pos = find_char_boundary(line_content, end_pos);
424    let char_start = byte_to_char_count(line_content, safe_start_pos);
425    let char_end = byte_to_char_count(line_content, safe_end_pos);
426    (line, char_start, line, char_end)
427}
428
429/// Calculate range for HTML tags
430pub fn calculate_html_tag_range(
431    line: usize,
432    line_content: &str,
433    tag_start: usize,
434    tag_len: usize,
435) -> (usize, usize, usize, usize) {
436    calculate_match_range(line, line_content, tag_start, tag_len)
437}
438
439/// Calculate range for URLs
440pub fn calculate_url_range(
441    line: usize,
442    line_content: &str,
443    url_start: usize,
444    url_len: usize,
445) -> (usize, usize, usize, usize) {
446    calculate_match_range(line, line_content, url_start, url_len)
447}
448
449/// Calculate range for list markers
450pub fn calculate_list_marker_range(
451    line: usize,
452    line_content: &str,
453    marker_start: usize,
454    marker_len: usize,
455) -> (usize, usize, usize, usize) {
456    calculate_match_range(line, line_content, marker_start, marker_len)
457}
458
/// Calculate range that exceeds a limit (like line length).
///
/// The range starts at the first character column after `limit` characters
/// (clamped to the line's character count) and ends one past the last
/// character of the line. Columns are 1-indexed character positions.
pub fn calculate_excess_range(line: usize, line_content: &str, limit: usize) -> (usize, usize, usize, usize) {
    let total_chars = line_content.chars().count();
    let start_col = limit.min(total_chars) + 1;
    (line, start_col, line, total_chars + 1)
}
465
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_single_line_range() {
        assert_eq!(calculate_single_line_range(5, 10, 3), (5, 10, 5, 13));
    }

    #[test]
    fn test_line_range() {
        // Two trailing spaces are excluded: 19 visible characters + 1.
        assert_eq!(calculate_line_range(1, "# This is a heading  "), (1, 1, 1, 20));
    }

    #[test]
    fn test_match_range() {
        let content = "Text <div>content</div> more";
        // "<div>" starts at byte 5 and is 5 bytes long.
        assert_eq!(calculate_match_range(1, content, 5, 5), (1, 6, 1, 11));
    }

    #[test]
    fn test_trailing_range() {
        // "Text content" ends at byte 12 and is followed by 3 trailing spaces.
        assert_eq!(calculate_trailing_range(1, "Text content   ", 12), (1, 13, 1, 16));
    }

    #[test]
    fn test_excess_range() {
        // 35 characters with a limit of 20: columns 21..36.
        let content = "This line is too long for the limit";
        assert_eq!(calculate_excess_range(1, content, 20), (1, 21, 1, 36));
    }

    #[test]
    fn test_whole_line_range() {
        let line_index = LineIndex::new("Line 1\nLine 2\nLine 3");

        // First and middle lines include their newline; the last line has none.
        assert_eq!(line_index.whole_line_range(1), 0..7); // "Line 1\n"
        assert_eq!(line_index.whole_line_range(2), 7..14); // "Line 2\n"
        assert_eq!(line_index.whole_line_range(3), 14..20); // "Line 3"
    }

    #[test]
    fn test_line_content_range() {
        let line_index = LineIndex::new("Line 1\nLine 2\nLine 3");

        // Newlines are never part of the content range.
        assert_eq!(line_index.line_content_range(1), 0..6); // "Line 1"
        assert_eq!(line_index.line_content_range(2), 7..13); // "Line 2"
        assert_eq!(line_index.line_content_range(3), 14..20); // "Line 3"
    }

    #[test]
    fn test_line_text_range() {
        let line_index = LineIndex::new("Hello world\nAnother line");

        // Partial text in the first line (end column is exclusive).
        assert_eq!(line_index.line_text_range(1, 1, 5), 0..4); // "Hell"

        // Partial text in the second line.
        assert_eq!(line_index.line_text_range(2, 1, 7), 12..18); // "Anothe"

        // An end column past the line is clamped to the line end.
        assert_eq!(line_index.line_text_range(1, 1, 100), 0..11); // "Hello world"
    }

    #[test]
    fn test_calculate_match_range_bounds_checking() {
        // Case 1: match_start beyond the line -> empty range at line length + 1.
        assert_eq!(calculate_match_range(121, "] not a link [", 57, 10), (121, 15, 121, 15));

        // Case 2: match extends beyond the line end -> end clamped to line length + 1.
        assert_eq!(calculate_match_range(1, "short", 2, 10), (1, 3, 1, 6));

        // Case 3: normal case within bounds.
        assert_eq!(calculate_match_range(5, "normal text here", 7, 4), (5, 8, 5, 12));

        // Case 4: zero-length match -> start and end coincide.
        assert_eq!(calculate_match_range(10, "test line", 5, 0), (10, 6, 10, 6));
    }

    // ============================================================================
    // UTF-8 Multi-byte Character Tests (Issue #154)
    // ============================================================================

    #[test]
    fn test_issue_154_korean_character_boundary() {
        // Exact reproduction of issue #154: Korean character '후' (3 bytes: 18..21).
        // The error was: "byte index 19 is not a char boundary; it is inside '후'".
        let line_content = "- 2023 년 초 이후 주가 상승        +1,000% (10 배 상승)  ";

        // Byte 19 is in the middle of '후'; this must not panic.
        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 19, 1);

        assert_eq!((line, end_line), (1, 1));
        assert!(start_col > 0);
        assert!(end_col >= start_col);
    }

    #[test]
    fn test_calculate_match_range_korean() {
        // "안녕하세요": every character is 3 bytes.
        let line_content = "안녕하세요";

        // Byte 3 is the start of the second character.
        assert_eq!(calculate_match_range(1, line_content, 3, 3), (1, 2, 1, 3));

        // Byte 4 is inside the second character and rounds down to its start.
        let (line, start_col, end_line, _end_col) = calculate_match_range(1, line_content, 4, 3);
        assert_eq!((line, start_col, end_line), (1, 2, 1));
    }

    #[test]
    fn test_calculate_match_range_chinese() {
        // "你好世界": every character is 3 bytes; byte 6 starts the third one.
        assert_eq!(calculate_match_range(1, "你好世界", 6, 3), (1, 3, 1, 4));
    }

    #[test]
    fn test_calculate_match_range_japanese() {
        // "こんにちは": every character is 3 bytes; byte 9 starts the fourth one.
        assert_eq!(calculate_match_range(1, "こんにちは", 9, 3), (1, 4, 1, 5));
    }

    #[test]
    fn test_calculate_match_range_mixed_unicode() {
        // "Hello " is 6 single-byte characters; "世" is bytes 6..9 (character 7)
        // and "界" is bytes 9..12 (character 8).
        let line_content = "Hello 世界";

        // Byte 5 is the space (column 6).
        assert_eq!(calculate_match_range(1, line_content, 5, 1), (1, 6, 1, 7));

        // Byte 6 is the start of "世" (column 7).
        assert_eq!(calculate_match_range(1, line_content, 6, 3), (1, 7, 1, 8));
    }

    #[test]
    fn test_calculate_trailing_range_korean() {
        // Five 3-byte characters end at byte 15, followed by three spaces.
        let (line, start_col, end_line, end_col) = calculate_trailing_range(1, "안녕하세요   ", 15);
        assert_eq!((line, end_line), (1, 1));
        assert!(start_col > 0);
        assert!(end_col > start_col);
    }

    #[test]
    fn test_calculate_emphasis_range_chinese() {
        // Byte 6 is the first '*'; byte 12 falls inside '要' and is rounded down.
        let (line, start_col, end_line, end_col) = calculate_emphasis_range(1, "这是**重要**的", 6, 12);
        assert_eq!((line, end_line), (1, 1));
        assert!(start_col > 0);
        assert!(end_col > start_col);
    }

    #[test]
    fn test_line_col_to_byte_range_korean() {
        // Columns are character positions; each Korean character is 3 bytes.
        let line_index = LineIndex::new("안녕하세요\nWorld");

        assert_eq!(line_index.line_col_to_byte_range(1, 1), 0..0);
        assert_eq!(line_index.line_col_to_byte_range(1, 2), 3..3);
        assert_eq!(line_index.line_col_to_byte_range(1, 3), 6..6);
    }

    #[test]
    fn test_line_col_to_byte_range_with_length_chinese() {
        let line_index = LineIndex::new("你好世界\nTest");

        // Column 1, length 2: the first two 3-byte characters.
        assert_eq!(line_index.line_col_to_byte_range_with_length(1, 1, 2), 0..6);

        // Column 2, length 1: the second character only.
        assert_eq!(line_index.line_col_to_byte_range_with_length(1, 2, 1), 3..6);
    }

    #[test]
    fn test_line_text_range_japanese() {
        let line_index = LineIndex::new("こんにちは\nHello");

        // Columns 2..4 (end exclusive): the second and third characters, bytes 3..9.
        assert_eq!(line_index.line_text_range(1, 2, 4), 3..9);
    }

    #[test]
    fn test_find_char_boundary_edge_cases() {
        let s = "안녕";

        // (requested byte index, expected boundary)
        let cases = [
            (0, 0),         // start of the string
            (1, 0),         // inside the first character
            (2, 0),         // inside the first character
            (3, 3),         // start of the second character
            (4, 3),         // inside the second character
            (100, s.len()), // beyond the string
        ];
        for (byte_idx, expected) in cases {
            assert_eq!(find_char_boundary(s, byte_idx), expected);
        }
    }

    #[test]
    fn test_byte_to_char_count_unicode() {
        let s = "안녕하세요";

        // (byte index, expected 1-indexed character position); byte 15 is the
        // end of the string, i.e. one past the fifth character.
        let cases = [(0, 1), (3, 2), (6, 3), (9, 4), (12, 5), (15, 6)];
        for (byte_idx, expected) in cases {
            assert_eq!(byte_to_char_count(s, byte_idx), expected);
        }
    }

    #[test]
    fn test_all_range_functions_with_emoji() {
        // Emoji are 4-byte UTF-8 characters.
        let line_content = "Hello 🎉 World 🌍";

        // calculate_match_range
        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 6, 4);
        assert_eq!((line, end_line), (1, 1));
        assert!(start_col > 0);
        assert!(end_col > start_col);

        // calculate_trailing_range
        let (line, start_col, end_line, end_col) = calculate_trailing_range(1, line_content, 12);
        assert_eq!((line, end_line), (1, 1));
        assert!(start_col > 0);
        assert!(end_col > start_col);

        // calculate_emphasis_range
        let (line, start_col, end_line, end_col) = calculate_emphasis_range(1, line_content, 0, 5);
        assert_eq!((line, start_col, end_line), (1, 1, 1));
        assert!(end_col > start_col);
    }
}
rumdl_lib/utils/range_utils.rs

rumdl_lib/utils/
range_utils.rs