rumdl_lib/utils/
range_utils.rs

1//! Utilities for position/range conversions
2
3use crate::utils::calculate_indentation_width_default;
4use std::collections::HashSet;
5use std::ops::Range;
6
/// Round `byte_idx` down to the closest UTF-8 character boundary of `s`.
///
/// Slicing a `&str` at a byte offset that falls inside a multi-byte character
/// panics, so callers use this to sanitise offsets before slicing.
///
/// Returns `s.len()` when `byte_idx` is at or past the end of the string;
/// otherwise the largest boundary that is less than or equal to `byte_idx`.
fn find_char_boundary(s: &str, byte_idx: usize) -> usize {
    let len = s.len();
    if byte_idx >= len {
        return len;
    }

    // Walk backwards from the requested offset; offset 0 is always a boundary,
    // so the search cannot come up empty.
    (0..=byte_idx)
        .rev()
        .find(|&candidate| s.is_char_boundary(candidate))
        .unwrap_or(0)
}
31
32/// Convert a byte index to a character count (1-indexed).
33/// This safely handles multi-byte UTF-8 characters by finding the nearest character boundary.
34fn byte_to_char_count(s: &str, byte_idx: usize) -> usize {
35    let safe_byte_idx = find_char_boundary(s, byte_idx);
36    s[..safe_byte_idx].chars().count() + 1 // 1-indexed
37}
38
39#[derive(Debug)]
40pub struct LineIndex<'a> {
41    line_starts: Vec<usize>,
42    content: &'a str,
43    code_block_lines: Option<HashSet<usize>>,
44}
45
46impl<'a> LineIndex<'a> {
47    pub fn new(content: &'a str) -> Self {
48        let mut line_starts = vec![0];
49        let mut pos = 0;
50
51        for c in content.chars() {
52            pos += c.len_utf8();
53            if c == '\n' {
54                line_starts.push(pos);
55            }
56        }
57
58        let mut index = Self {
59            line_starts,
60            content,
61            code_block_lines: None,
62        };
63
64        // Pre-compute code block lines for better performance
65        index.compute_code_block_lines();
66
67        index
68    }
69
70    /// Create a `LineIndex` from pre-computed line start byte offsets.
71    /// Each entry is the byte offset of the first character on that line.
72    /// The first entry must be 0 (start of content).
73    pub fn with_line_starts(content: &'a str, line_starts: Vec<usize>) -> Self {
74        let mut index = Self {
75            line_starts,
76            content,
77            code_block_lines: None,
78        };
79
80        // Pre-compute code block lines for better performance
81        index.compute_code_block_lines();
82
83        index
84    }
85
86    /// Create a `LineIndex` from pre-computed line starts and code block byte ranges.
87    ///
88    /// Instead of re-scanning content to find code blocks, this converts
89    /// the already-detected byte ranges into line-level information.
90    pub fn with_line_starts_and_code_blocks(
91        content: &'a str,
92        line_starts: Vec<usize>,
93        code_block_byte_ranges: &[(usize, usize)],
94    ) -> Self {
95        let mut code_block_lines = HashSet::new();
96
97        for &(block_start, block_end) in code_block_byte_ranges {
98            let start_line = match line_starts.binary_search(&block_start) {
99                Ok(idx) => idx,
100                Err(idx) => idx.saturating_sub(1),
101            };
102            let end_line = if block_end == 0 {
103                0
104            } else {
105                match line_starts.binary_search(&block_end) {
106                    // block_end exactly at a line start means the block ended on the previous line
107                    Ok(idx) => idx.saturating_sub(1),
108                    Err(idx) => idx.saturating_sub(1),
109                }
110            };
111            for line_idx in start_line..=end_line {
112                code_block_lines.insert(line_idx);
113            }
114        }
115
116        Self {
117            line_starts,
118            content,
119            code_block_lines: Some(code_block_lines),
120        }
121    }
122
123    /// Get the content of a line by 0-based index using pre-computed byte offsets.
124    /// Returns the line content without the trailing newline character.
125    fn get_line(&self, line_idx: usize) -> Option<&'a str> {
126        let start = *self.line_starts.get(line_idx)?;
127        let end = self
128            .line_starts
129            .get(line_idx + 1)
130            .copied()
131            .unwrap_or(self.content.len());
132        let line = &self.content[start..end];
133        // Strip trailing newline (and optional \r before it)
134        let line = line.strip_suffix('\n').unwrap_or(line);
135        let line = line.strip_suffix('\r').unwrap_or(line);
136        Some(line)
137    }
138
139    pub fn line_col_to_byte_range(&self, line: usize, column: usize) -> Range<usize> {
140        let line = line.saturating_sub(1);
141        let line_start = *self.line_starts.get(line).unwrap_or(&self.content.len());
142
143        let current_line = self.get_line(line).unwrap_or("");
144        // Column is 1-indexed character position, not byte position
145        let char_col = column.saturating_sub(1);
146        let char_count = current_line.chars().count();
147        let safe_char_col = char_col.min(char_count);
148
149        // Convert character position to byte position
150        let byte_offset = current_line
151            .char_indices()
152            .nth(safe_char_col)
153            .map_or(current_line.len(), |(idx, _)| idx);
154
155        let start = line_start + byte_offset;
156        start..start
157    }
158
159    /// Calculate a proper byte range for replacing text with a specific length
160    /// This is the correct function to use for LSP fixes
161    ///
162    /// # Safety
163    /// This function correctly handles multi-byte UTF-8 characters by converting
164    /// character positions (columns) to byte positions.
165    pub fn line_col_to_byte_range_with_length(&self, line: usize, column: usize, length: usize) -> Range<usize> {
166        let line = line.saturating_sub(1);
167        let line_start = *self.line_starts.get(line).unwrap_or(&self.content.len());
168        let line_end = self.line_starts.get(line + 1).copied().unwrap_or(self.content.len());
169        let mut current_line = &self.content[line_start..line_end];
170        if let Some(stripped) = current_line.strip_suffix('\n') {
171            current_line = stripped.strip_suffix('\r').unwrap_or(stripped);
172        }
173        if current_line.is_ascii() {
174            let line_len = current_line.len();
175            let start_byte = column.saturating_sub(1).min(line_len);
176            let end_byte = start_byte.saturating_add(length).min(line_len);
177            let start = line_start + start_byte;
178            let end = line_start + end_byte;
179            return start..end;
180        }
181        // Column is 1-indexed character position, not byte position
182        let char_col = column.saturating_sub(1);
183        let char_count = current_line.chars().count();
184        let safe_char_col = char_col.min(char_count);
185
186        // Convert character positions to byte positions
187        let mut char_indices = current_line.char_indices();
188        let start_byte = char_indices
189            .nth(safe_char_col)
190            .map_or(current_line.len(), |(idx, _)| idx);
191
192        // Calculate end position (start + length in characters)
193        let end_char_col = (safe_char_col + length).min(char_count);
194        let end_byte = current_line
195            .char_indices()
196            .nth(end_char_col)
197            .map_or(current_line.len(), |(idx, _)| idx);
198
199        let start = line_start + start_byte;
200        let end = line_start + end_byte;
201        start..end
202    }
203
204    /// Calculate byte range for entire line replacement (including newline)
205    /// This is ideal for rules that need to replace complete lines
206    pub fn whole_line_range(&self, line: usize) -> Range<usize> {
207        let line_idx = line.saturating_sub(1);
208        let start = *self.line_starts.get(line_idx).unwrap_or(&self.content.len());
209        let end = self
210            .line_starts
211            .get(line_idx + 1)
212            .copied()
213            .unwrap_or(self.content.len());
214        start..end
215    }
216
217    /// Calculate byte range spanning multiple lines (from start_line to end_line inclusive)
218    /// Both lines are 1-indexed. This is useful for replacing entire blocks like tables.
219    pub fn multi_line_range(&self, start_line: usize, end_line: usize) -> Range<usize> {
220        let start_idx = start_line.saturating_sub(1);
221        let end_idx = end_line.saturating_sub(1);
222
223        let start = *self.line_starts.get(start_idx).unwrap_or(&self.content.len());
224        let end = self.line_starts.get(end_idx + 1).copied().unwrap_or(self.content.len());
225        start..end
226    }
227
228    /// Calculate byte range for text within a line (excluding newline)
229    /// Useful for replacing specific parts of a line
230    ///
231    /// # Safety
232    /// This function correctly handles multi-byte UTF-8 characters by converting
233    /// character positions (columns) to byte positions.
234    pub fn line_text_range(&self, line: usize, start_col: usize, end_col: usize) -> Range<usize> {
235        let line_idx = line.saturating_sub(1);
236        let line_start = *self.line_starts.get(line_idx).unwrap_or(&self.content.len());
237
238        // Get the actual line content to ensure we don't exceed bounds
239        let current_line = self.get_line(line_idx).unwrap_or("");
240        let char_count = current_line.chars().count();
241
242        // Convert character positions to byte positions
243        let start_char_col = start_col.saturating_sub(1).min(char_count);
244        let end_char_col = end_col.saturating_sub(1).min(char_count);
245
246        let mut char_indices = current_line.char_indices();
247        let start_byte = char_indices
248            .nth(start_char_col)
249            .map_or(current_line.len(), |(idx, _)| idx);
250
251        let end_byte = current_line
252            .char_indices()
253            .nth(end_char_col)
254            .map_or(current_line.len(), |(idx, _)| idx);
255
256        let start = line_start + start_byte;
257        let end = line_start + end_byte.max(start_byte);
258        start..end
259    }
260
261    /// Calculate byte range from start of line to end of line content (excluding newline)
262    /// Useful for replacing line content while preserving line structure
263    pub fn line_content_range(&self, line: usize) -> Range<usize> {
264        let line_idx = line.saturating_sub(1);
265        let line_start = *self.line_starts.get(line_idx).unwrap_or(&self.content.len());
266
267        let current_line = self.get_line(line_idx).unwrap_or("");
268        let line_end = line_start + current_line.len();
269        line_start..line_end
270    }
271
272    /// Get the global start byte offset for a given 1-based line number.
273    pub fn get_line_start_byte(&self, line_num: usize) -> Option<usize> {
274        if line_num == 0 {
275            return None; // Lines are 1-based
276        }
277        // line_num is 1-based, line_starts index is 0-based
278        self.line_starts.get(line_num - 1).copied()
279    }
280
281    /// Check if the line at the given index is within a code block
282    pub fn is_code_block(&self, line: usize) -> bool {
283        if let Some(ref code_block_lines) = self.code_block_lines {
284            code_block_lines.contains(&line)
285        } else {
286            // Fallback to a simpler check if pre-computation wasn't done
287            self.is_code_fence(line)
288        }
289    }
290
291    /// Check if the line is a code fence marker (``` or ~~~)
292    pub fn is_code_fence(&self, line: usize) -> bool {
293        self.get_line(line).is_some_and(|l| {
294            let trimmed = l.trim();
295            trimmed.starts_with("```") || trimmed.starts_with("~~~")
296        })
297    }
298
299    /// Check if the line is a tilde code fence marker (~~~)
300    pub fn is_tilde_code_block(&self, line: usize) -> bool {
301        self.get_line(line).is_some_and(|l| l.trim().starts_with("~~~"))
302    }
303
304    /// Get a reference to the content
305    pub fn get_content(&self) -> &str {
306        self.content
307    }
308
309    /// Pre-compute which lines are within code blocks for faster lookup
310    fn compute_code_block_lines(&mut self) {
311        let mut code_block_lines = HashSet::new();
312        let lines: Vec<&str> = self.content.lines().collect();
313
314        // Initialize block tracking
315        let mut in_block = false;
316        let mut active_fence_type = ' '; // '`' or '~'
317        let mut block_indent = 0;
318        let mut block_fence_length = 0;
319        let mut in_markdown_block = false;
320        let mut nested_fence_start = None;
321        let mut nested_fence_end = None;
322
323        // Process each line
324        for (i, line) in lines.iter().enumerate() {
325            let trimmed = line.trim();
326            let indent = line.len() - trimmed.len();
327
328            // 1. Detect indented code blocks (4+ columns accounting for tab expansion)
329            if calculate_indentation_width_default(line) >= 4 {
330                code_block_lines.insert(i);
331                continue; // Skip further processing for indented code blocks
332            }
333
334            // 2. Handle fenced code blocks (backticks and tildes)
335            if !in_block {
336                // Check for opening fences
337                if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
338                    let char_type = if trimmed.starts_with("```") { '`' } else { '~' };
339                    let count = trimmed.chars().take_while(|&c| c == char_type).count();
340                    let info_string = if trimmed.len() > count {
341                        trimmed[count..].trim()
342                    } else {
343                        ""
344                    };
345
346                    // Mark the start of a new code block
347                    in_block = true;
348                    active_fence_type = char_type;
349                    block_indent = indent;
350                    block_fence_length = count;
351                    in_markdown_block = info_string == "markdown";
352                    nested_fence_start = None;
353                    nested_fence_end = None;
354
355                    code_block_lines.insert(i);
356                }
357            } else {
358                // We're inside a code block
359                code_block_lines.insert(i);
360
361                // Detection of nested fences in markdown blocks
362                if in_markdown_block && nested_fence_start.is_none() && trimmed.starts_with("```") {
363                    // Check if this looks like a nested fence opening (has content after the backticks)
364                    let count = trimmed.chars().take_while(|&c| c == '`').count();
365                    let remaining = if trimmed.len() > count {
366                        trimmed[count..].trim()
367                    } else {
368                        ""
369                    };
370
371                    if !remaining.is_empty() {
372                        nested_fence_start = Some(i);
373                    }
374                }
375
376                // Check if we've found a nested fence end (only if we have a start)
377                if in_markdown_block
378                    && nested_fence_start.is_some()
379                    && nested_fence_end.is_none()
380                    && trimmed.starts_with("```")
381                    && trimmed.trim_start_matches('`').trim().is_empty()
382                {
383                    nested_fence_end = Some(i);
384                }
385
386                // Check if this line matches the closing fence pattern for the outer block
387                if trimmed.starts_with(&active_fence_type.to_string().repeat(3)) {
388                    let count = trimmed.chars().take_while(|&c| c == active_fence_type).count();
389                    let remaining = if trimmed.len() > count {
390                        trimmed[count..].trim()
391                    } else {
392                        ""
393                    };
394
395                    // A line is a closing fence if:
396                    // 1. It uses the same fence character as the opening fence
397                    // 2. It has at least as many fence characters as the opening fence
398                    // 3. It has no content after the fence characters (except for whitespace)
399                    // 4. Its indentation level is less than or equal to the opening fence
400                    let is_valid_closing_fence =
401                        count >= block_fence_length && remaining.is_empty() && indent <= block_indent;
402
403                    // For nested code blocks in markdown, the first backtick fence after the nested content
404                    // should be recognized as the closing fence for the outer block
405                    let is_nested_closing = nested_fence_end.is_some() && i == nested_fence_end.unwrap();
406
407                    // Skip nested closing fences
408                    if is_valid_closing_fence && !is_nested_closing {
409                        in_block = false;
410                        in_markdown_block = false;
411                    }
412                }
413            }
414        }
415
416        self.code_block_lines = Some(code_block_lines);
417    }
418}
419
/// Build a `(start_line, start_col, end_line, end_col)` tuple for a span that
/// stays on one line: it begins at `start_col` and ends `length` columns later.
pub fn calculate_single_line_range(line: usize, start_col: usize, length: usize) -> (usize, usize, usize, usize) {
    let end_col = start_col + length;
    (line, start_col, line, end_col)
}
424
/// Calculate range for entire line.
///
/// Returns `(line, 1, line, end_col)` where `end_col` is the 1-based column
/// just past the last non-whitespace character of `line_content`. Columns are
/// counted in characters, not bytes, so lines containing multi-byte UTF-8
/// text get the same kind of column as the other range helpers in this file.
pub fn calculate_line_range(line: usize, line_content: &str) -> (usize, usize, usize, usize) {
    // Count characters rather than bytes: a byte length would overshoot the
    // end column for any non-ASCII line.
    let trimmed_chars = line_content.trim_end().chars().count();
    (line, 1, line, trimmed_chars + 1)
}
430
431/// Calculate range from regex match on a line
432///
433/// # Safety
434/// This function safely handles multi-byte UTF-8 characters by ensuring all
435/// string slicing operations occur at valid character boundaries.
436pub fn calculate_match_range(
437    line: usize,
438    line_content: &str,
439    match_start: usize,
440    match_len: usize,
441) -> (usize, usize, usize, usize) {
442    // Bounds check to prevent panic
443    let line_len = line_content.len();
444    if match_start > line_len {
445        // If match_start is beyond line bounds, return a safe range at end of line
446        let char_count = line_content.chars().count();
447        return (line, char_count + 1, line, char_count + 1);
448    }
449
450    // Find safe character boundaries for the match range
451    let safe_match_start = find_char_boundary(line_content, match_start);
452    let safe_match_end_byte = find_char_boundary(line_content, (match_start + match_len).min(line_len));
453
454    // Convert byte positions to character positions safely
455    let char_start = byte_to_char_count(line_content, safe_match_start);
456    let char_len = if safe_match_end_byte > safe_match_start {
457        // Count characters in the safe range
458        line_content[safe_match_start..safe_match_end_byte].chars().count()
459    } else {
460        0
461    };
462    (line, char_start, line, char_start + char_len)
463}
464
465/// Calculate range for trailing content (like trailing spaces)
466///
467/// # Safety
468/// This function safely handles multi-byte UTF-8 characters by ensuring all
469/// string slicing operations occur at valid character boundaries.
470pub fn calculate_trailing_range(line: usize, line_content: &str, content_end: usize) -> (usize, usize, usize, usize) {
471    // Find safe character boundary for content_end
472    let safe_content_end = find_char_boundary(line_content, content_end);
473    let char_content_end = byte_to_char_count(line_content, safe_content_end);
474    let line_char_len = line_content.chars().count() + 1;
475    (line, char_content_end, line, line_char_len)
476}
477
/// Calculate range for a heading (entire line).
///
/// Thin wrapper: forwards `line` and `line_content` unchanged to
/// `calculate_line_range` and returns its
/// `(start_line, start_col, end_line, end_col)` tuple, which starts at
/// column 1 and ends just past the line's last non-whitespace character.
pub fn calculate_heading_range(line: usize, line_content: &str) -> (usize, usize, usize, usize) {
    calculate_line_range(line, line_content)
}
482
483/// Calculate range for emphasis markers and content
484///
485/// # Safety
486/// This function safely handles multi-byte UTF-8 characters by ensuring all
487/// string slicing operations occur at valid character boundaries.
488pub fn calculate_emphasis_range(
489    line: usize,
490    line_content: &str,
491    start_pos: usize,
492    end_pos: usize,
493) -> (usize, usize, usize, usize) {
494    // Find safe character boundaries for start and end positions
495    let safe_start_pos = find_char_boundary(line_content, start_pos);
496    let safe_end_pos = find_char_boundary(line_content, end_pos);
497    let char_start = byte_to_char_count(line_content, safe_start_pos);
498    let char_end = byte_to_char_count(line_content, safe_end_pos);
499    (line, char_start, line, char_end)
500}
501
/// Calculate range for HTML tags.
///
/// Thin wrapper: forwards its arguments unchanged to `calculate_match_range`,
/// passing `tag_start` as the match start and `tag_len` as the match length,
/// and returns that function's `(start_line, start_col, end_line, end_col)`
/// tuple.
pub fn calculate_html_tag_range(
    line: usize,
    line_content: &str,
    tag_start: usize,
    tag_len: usize,
) -> (usize, usize, usize, usize) {
    calculate_match_range(line, line_content, tag_start, tag_len)
}
511
/// Calculate range for URLs.
///
/// Thin wrapper: forwards its arguments unchanged to `calculate_match_range`,
/// passing `url_start` as the match start and `url_len` as the match length,
/// and returns that function's `(start_line, start_col, end_line, end_col)`
/// tuple.
pub fn calculate_url_range(
    line: usize,
    line_content: &str,
    url_start: usize,
    url_len: usize,
) -> (usize, usize, usize, usize) {
    calculate_match_range(line, line_content, url_start, url_len)
}
521
/// Calculate range for list markers.
///
/// Thin wrapper: forwards its arguments unchanged to `calculate_match_range`,
/// passing `marker_start` as the match start and `marker_len` as the match
/// length, and returns that function's
/// `(start_line, start_col, end_line, end_col)` tuple.
pub fn calculate_list_marker_range(
    line: usize,
    line_content: &str,
    marker_start: usize,
    marker_len: usize,
) -> (usize, usize, usize, usize) {
    calculate_match_range(line, line_content, marker_start, marker_len)
}
531
/// Calculate range that exceeds a limit (like line length).
///
/// `limit` is measured in characters. The returned
/// `(line, start_col, line, end_col)` tuple covers everything after the first
/// `limit` characters up to just past the last character; when the line is no
/// longer than `limit` the range is empty and sits at the end of the line.
pub fn calculate_excess_range(line: usize, line_content: &str, limit: usize) -> (usize, usize, usize, usize) {
    let total_chars = line_content.chars().count();
    let kept_chars = limit.min(total_chars);
    (line, kept_chars + 1, line, total_chars + 1)
}
538
539#[cfg(test)]
540mod tests {
541    use super::*;
542
543    #[test]
544    fn test_single_line_range() {
545        let (start_line, start_col, end_line, end_col) = calculate_single_line_range(5, 10, 3);
546        assert_eq!(start_line, 5);
547        assert_eq!(start_col, 10);
548        assert_eq!(end_line, 5);
549        assert_eq!(end_col, 13);
550    }
551
552    #[test]
553    fn test_line_range() {
554        let content = "# This is a heading  ";
555        let (start_line, start_col, end_line, end_col) = calculate_line_range(1, content);
556        assert_eq!(start_line, 1);
557        assert_eq!(start_col, 1);
558        assert_eq!(end_line, 1);
559        assert_eq!(end_col, 20); // Trimmed length + 1
560    }
561
562    #[test]
563    fn test_match_range() {
564        let content = "Text <div>content</div> more";
565        let tag_start = 5; // Position of '<'
566        let tag_len = 5; // Length of "<div>"
567        let (start_line, start_col, end_line, end_col) = calculate_match_range(1, content, tag_start, tag_len);
568        assert_eq!(start_line, 1);
569        assert_eq!(start_col, 6); // 1-indexed
570        assert_eq!(end_line, 1);
571        assert_eq!(end_col, 11); // 6 + 5
572    }
573
574    #[test]
575    fn test_trailing_range() {
576        let content = "Text content   "; // 3 trailing spaces
577        let content_end = 12; // End of "Text content"
578        let (start_line, start_col, end_line, end_col) = calculate_trailing_range(1, content, content_end);
579        assert_eq!(start_line, 1);
580        assert_eq!(start_col, 13); // content_end + 1 (1-indexed)
581        assert_eq!(end_line, 1);
582        assert_eq!(end_col, 16); // Total length + 1
583    }
584
585    #[test]
586    fn test_excess_range() {
587        let content = "This line is too long for the limit";
588        let limit = 20;
589        let (start_line, start_col, end_line, end_col) = calculate_excess_range(1, content, limit);
590        assert_eq!(start_line, 1);
591        assert_eq!(start_col, 21); // limit + 1
592        assert_eq!(end_line, 1);
593        assert_eq!(end_col, 36); // Total length + 1 (35 chars + 1 = 36)
594    }
595
596    #[test]
597    fn test_whole_line_range() {
598        let content = "Line 1\nLine 2\nLine 3";
599        let line_index = LineIndex::new(content);
600
601        // Test first line (includes newline)
602        let range = line_index.whole_line_range(1);
603        assert_eq!(range, 0..7); // "Line 1\n"
604
605        // Test middle line
606        let range = line_index.whole_line_range(2);
607        assert_eq!(range, 7..14); // "Line 2\n"
608
609        // Test last line (no newline)
610        let range = line_index.whole_line_range(3);
611        assert_eq!(range, 14..20); // "Line 3"
612    }
613
614    #[test]
615    fn test_line_content_range() {
616        let content = "Line 1\nLine 2\nLine 3";
617        let line_index = LineIndex::new(content);
618
619        // Test first line content (excludes newline)
620        let range = line_index.line_content_range(1);
621        assert_eq!(range, 0..6); // "Line 1"
622
623        // Test middle line content
624        let range = line_index.line_content_range(2);
625        assert_eq!(range, 7..13); // "Line 2"
626
627        // Test last line content
628        let range = line_index.line_content_range(3);
629        assert_eq!(range, 14..20); // "Line 3"
630    }
631
632    #[test]
633    fn test_line_text_range() {
634        let content = "Hello world\nAnother line";
635        let line_index = LineIndex::new(content);
636
637        // Test partial text in first line
638        let range = line_index.line_text_range(1, 1, 5); // "Hell"
639        assert_eq!(range, 0..4);
640
641        // Test partial text in second line
642        let range = line_index.line_text_range(2, 1, 7); // "Another"
643        assert_eq!(range, 12..18);
644
645        // Test bounds checking
646        let range = line_index.line_text_range(1, 1, 100); // Should clamp to line end
647        assert_eq!(range, 0..11); // "Hello world"
648    }
649
650    #[test]
651    fn test_calculate_match_range_bounds_checking() {
652        // Test case 1: match_start beyond line bounds
653        let line_content = "] not a link [";
654        let (line, start_col, end_line, end_col) = calculate_match_range(121, line_content, 57, 10);
655        assert_eq!(line, 121);
656        assert_eq!(start_col, 15); // line length + 1
657        assert_eq!(end_line, 121);
658        assert_eq!(end_col, 15); // same as start when out of bounds
659
660        // Test case 2: match extends beyond line end
661        let line_content = "short";
662        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 2, 10);
663        assert_eq!(line, 1);
664        assert_eq!(start_col, 3); // position 2 + 1
665        assert_eq!(end_line, 1);
666        assert_eq!(end_col, 6); // clamped to line length + 1
667
668        // Test case 3: normal case within bounds
669        let line_content = "normal text here";
670        let (line, start_col, end_line, end_col) = calculate_match_range(5, line_content, 7, 4);
671        assert_eq!(line, 5);
672        assert_eq!(start_col, 8); // position 7 + 1
673        assert_eq!(end_line, 5);
674        assert_eq!(end_col, 12); // position 7 + 4 + 1
675
676        // Test case 4: zero length match
677        let line_content = "test line";
678        let (line, start_col, end_line, end_col) = calculate_match_range(10, line_content, 5, 0);
679        assert_eq!(line, 10);
680        assert_eq!(start_col, 6); // position 5 + 1
681        assert_eq!(end_line, 10);
682        assert_eq!(end_col, 6); // same as start for zero length
683    }
684
685    // ============================================================================
686    // UTF-8 Multi-byte Character Tests (Issue #154)
687    // ============================================================================
688
689    #[test]
690    fn test_issue_154_korean_character_boundary() {
691        // Exact reproduction of issue #154: Korean character '후' (3 bytes: 18..21)
692        // The error was: "byte index 19 is not a char boundary; it is inside '후'"
693        let line_content = "- 2023 년 초 이후 주가 상승        +1,000% (10 배 상승)  ";
694
695        // Test match at byte 19 (middle of '후' character)
696        // This should not panic and should find the nearest character boundary
697        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 19, 1);
698
699        // Should successfully calculate without panicking
700        assert!(start_col > 0);
701        assert_eq!(line, 1);
702        assert_eq!(end_line, 1);
703        assert!(end_col >= start_col);
704    }
705
706    #[test]
707    fn test_calculate_match_range_korean() {
708        // Korean text: "안녕하세요" (Hello in Korean)
709        // Each character is 3 bytes
710        let line_content = "안녕하세요";
711        // Match at byte 3 (start of second character)
712        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 3, 3);
713        assert_eq!(line, 1);
714        assert_eq!(start_col, 2); // Second character (1-indexed)
715        assert_eq!(end_line, 1);
716        assert_eq!(end_col, 3); // End of second character
717
718        // Match at byte 4 (middle of second character - should round down)
719        let (line, start_col, end_line, _end_col) = calculate_match_range(1, line_content, 4, 3);
720        assert_eq!(line, 1);
721        assert_eq!(start_col, 2); // Should round to start of character
722        assert_eq!(end_line, 1);
723    }
724
725    #[test]
726    fn test_calculate_match_range_chinese() {
727        // Chinese text: "你好世界" (Hello World)
728        // Each character is 3 bytes
729        let line_content = "你好世界";
730        // Match at byte 6 (start of third character)
731        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 6, 3);
732        assert_eq!(line, 1);
733        assert_eq!(start_col, 3); // Third character (1-indexed)
734        assert_eq!(end_line, 1);
735        assert_eq!(end_col, 4); // End of third character
736    }
737
738    #[test]
739    fn test_calculate_match_range_japanese() {
740        // Japanese text: "こんにちは" (Hello)
741        // Each character is 3 bytes
742        let line_content = "こんにちは";
743        // Match at byte 9 (start of fourth character)
744        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 9, 3);
745        assert_eq!(line, 1);
746        assert_eq!(start_col, 4); // Fourth character (1-indexed)
747        assert_eq!(end_line, 1);
748        assert_eq!(end_col, 5); // End of fourth character
749    }
750
751    #[test]
752    fn test_calculate_match_range_mixed_unicode() {
753        // Mixed ASCII and CJK: "Hello 世界"
754        // "Hello " = 6 bytes (H, e, l, l, o, space)
755        // "世" = bytes 6-8 (3 bytes), character 7
756        // "界" = bytes 9-11 (3 bytes), character 8
757        let line_content = "Hello 世界";
758
759        // Match at byte 5 (space character)
760        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 5, 1);
761        assert_eq!(line, 1);
762        assert_eq!(start_col, 6); // Space character (1-indexed: H=1, e=2, l=3, l=4, o=5, space=6)
763        assert_eq!(end_line, 1);
764        assert_eq!(end_col, 7); // After space
765
766        // Match at byte 6 (start of first Chinese character "世")
767        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 6, 3);
768        assert_eq!(line, 1);
769        assert_eq!(start_col, 7); // First Chinese character (1-indexed)
770        assert_eq!(end_line, 1);
771        assert_eq!(end_col, 8); // End of first Chinese character
772    }
773
774    #[test]
775    fn test_calculate_trailing_range_korean() {
776        // Korean text with trailing spaces
777        let line_content = "안녕하세요   ";
778        // content_end at byte 15 (middle of last character + spaces)
779        let (line, start_col, end_line, end_col) = calculate_trailing_range(1, line_content, 15);
780        assert_eq!(line, 1);
781        assert!(start_col > 0);
782        assert_eq!(end_line, 1);
783        assert!(end_col > start_col);
784    }
785
786    #[test]
787    fn test_calculate_emphasis_range_chinese() {
788        // Chinese text with emphasis markers
789        let line_content = "这是**重要**的";
790        // start_pos and end_pos at byte boundaries within Chinese characters
791        let (line, start_col, end_line, end_col) = calculate_emphasis_range(1, line_content, 6, 12);
792        assert_eq!(line, 1);
793        assert!(start_col > 0);
794        assert_eq!(end_line, 1);
795        assert!(end_col > start_col);
796    }
797
798    #[test]
799    fn test_line_col_to_byte_range_korean() {
800        // Test that column positions (character positions) are correctly converted to byte positions
801        let content = "안녕하세요\nWorld";
802        let line_index = LineIndex::new(content);
803
804        // Column 1 (first character)
805        let range = line_index.line_col_to_byte_range(1, 1);
806        assert_eq!(range, 0..0);
807
808        // Column 2 (second character)
809        let range = line_index.line_col_to_byte_range(1, 2);
810        assert_eq!(range, 3..3); // 3 bytes for first character
811
812        // Column 3 (third character)
813        let range = line_index.line_col_to_byte_range(1, 3);
814        assert_eq!(range, 6..6); // 6 bytes for first two characters
815    }
816
817    #[test]
818    fn test_line_col_to_byte_range_with_length_chinese() {
819        // Test byte range calculation with length for Chinese characters
820        let content = "你好世界\nTest";
821        let line_index = LineIndex::new(content);
822
823        // Column 1, length 2 (first two Chinese characters)
824        let range = line_index.line_col_to_byte_range_with_length(1, 1, 2);
825        assert_eq!(range, 0..6); // 6 bytes for two 3-byte characters
826
827        // Column 2, length 1 (second Chinese character)
828        let range = line_index.line_col_to_byte_range_with_length(1, 2, 1);
829        assert_eq!(range, 3..6); // Bytes 3-6 for second character
830    }
831
832    #[test]
833    fn test_line_text_range_japanese() {
834        // Test text range calculation for Japanese characters
835        let content = "こんにちは\nHello";
836        let line_index = LineIndex::new(content);
837
838        // Columns 2-4 (second to fourth Japanese characters)
839        let range = line_index.line_text_range(1, 2, 4);
840        assert_eq!(range, 3..9); // Bytes 3-9 for three 3-byte characters
841    }
842
843    #[test]
844    fn test_find_char_boundary_edge_cases() {
845        // Test the helper function directly
846        let s = "안녕";
847
848        // Byte 0 (start) - should be valid
849        assert_eq!(find_char_boundary(s, 0), 0);
850
851        // Byte 1 (middle of first character) - should round down to 0
852        assert_eq!(find_char_boundary(s, 1), 0);
853
854        // Byte 2 (middle of first character) - should round down to 0
855        assert_eq!(find_char_boundary(s, 2), 0);
856
857        // Byte 3 (start of second character) - should be valid
858        assert_eq!(find_char_boundary(s, 3), 3);
859
860        // Byte 4 (middle of second character) - should round down to 3
861        assert_eq!(find_char_boundary(s, 4), 3);
862
863        // Byte beyond string length - should return string length
864        assert_eq!(find_char_boundary(s, 100), s.len());
865    }
866
867    #[test]
868    fn test_byte_to_char_count_unicode() {
869        // Test character counting with multi-byte characters
870        let s = "안녕하세요";
871
872        // Byte 0 (start) - 1 character
873        assert_eq!(byte_to_char_count(s, 0), 1);
874
875        // Byte 3 (start of second character) - 2 characters
876        assert_eq!(byte_to_char_count(s, 3), 2);
877
878        // Byte 6 (start of third character) - 3 characters
879        assert_eq!(byte_to_char_count(s, 6), 3);
880
881        // Byte 9 (start of fourth character) - 4 characters
882        assert_eq!(byte_to_char_count(s, 9), 4);
883
884        // Byte 12 (start of fifth character) - 5 characters
885        assert_eq!(byte_to_char_count(s, 12), 5);
886
887        // Byte 15 (end) - 6 characters (5 + 1 for 1-indexed)
888        assert_eq!(byte_to_char_count(s, 15), 6);
889    }
890
891    #[test]
892    fn test_all_range_functions_with_emoji() {
893        // Test with emoji (4-byte UTF-8 characters)
894        let line_content = "Hello 🎉 World 🌍";
895
896        // calculate_match_range
897        let (line, start_col, end_line, end_col) = calculate_match_range(1, line_content, 6, 4);
898        assert_eq!(line, 1);
899        assert!(start_col > 0);
900        assert_eq!(end_line, 1);
901        assert!(end_col > start_col);
902
903        // calculate_trailing_range
904        let (line, start_col, end_line, end_col) = calculate_trailing_range(1, line_content, 12);
905        assert_eq!(line, 1);
906        assert!(start_col > 0);
907        assert_eq!(end_line, 1);
908        assert!(end_col > start_col);
909
910        // calculate_emphasis_range
911        let (line, start_col, end_line, end_col) = calculate_emphasis_range(1, line_content, 0, 5);
912        assert_eq!(line, 1);
913        assert_eq!(start_col, 1);
914        assert_eq!(end_line, 1);
915        assert!(end_col > start_col);
916    }
917}
rumdl_lib/utils/range_utils.rs

rumdl_lib/utils/
range_utils.rs