Skip to main content

perl_position_tracking/
line_index.rs

//! Line index for efficient UTF-16 position calculations.
2use ropey::Rope;
3
/// Reports whether `b` is a UTF-8 continuation byte.
///
/// Continuation bytes have the bit pattern `10xxxxxx`, i.e. they occupy the
/// range `0x80..=0xBF`; every other byte value starts a code point (or is
/// invalid UTF-8).
#[inline]
fn is_utf8_continuation(b: u8) -> bool {
    matches!(b, 0x80..=0xBF)
}
9
10/// Caches byte offsets for line starts to speed up coordinate conversion.
11#[derive(Debug, Clone)]
12pub struct LineStartsCache {
13    line_starts: Vec<usize>,
14}
15impl LineStartsCache {
16    /// Clamp `offset` into `text` and ensure it is on a UTF-8 char boundary.
17    fn normalize_text_offset(text: &str, offset: usize) -> usize {
18        let mut normalized = offset.min(text.len());
19        while normalized > 0 && !text.is_char_boundary(normalized) {
20            normalized -= 1;
21        }
22        normalized
23    }
24
25    /// Builds a cache from UTF-8 source text.
26    pub fn new(text: &str) -> Self {
27        let mut ls = vec![0];
28        let mut i = 0;
29        let b = text.as_bytes();
30        while i < b.len() {
31            if b[i] == b'\n' {
32                ls.push(i + 1);
33            } else if b[i] == b'\r' {
34                if i + 1 < b.len() && b[i + 1] == b'\n' {
35                    ls.push(i + 2);
36                    i += 1;
37                } else {
38                    ls.push(i + 1);
39                }
40            }
41            i += 1;
42        }
43        Self { line_starts: ls }
44    }
45
46    /// Builds a cache from a [`Rope`] buffer.
47    pub fn new_rope(rope: &Rope) -> Self {
48        let mut ls = vec![0];
49        for li in 0..rope.len_lines() {
50            if li > 0 {
51                ls.push(rope.line_to_byte(li));
52            }
53        }
54        Self { line_starts: ls }
55    }
56
57    /// Converts a byte offset in `text` to `(line, column_utf16)`.
58    pub fn offset_to_position(&self, text: &str, offset: usize) -> (u32, u32) {
59        let offset = Self::normalize_text_offset(text, offset);
60        let line = self.line_starts.binary_search(&offset).unwrap_or_else(|i| i.saturating_sub(1));
61        let ls = self.line_starts[line];
62        (line as u32, text[ls..offset].chars().map(|c| c.len_utf16()).sum::<usize>() as u32)
63    }
64
65    /// Converts `(line, column_utf16)` into a byte offset in `text`.
66    pub fn position_to_offset(&self, text: &str, line: u32, character: u32) -> usize {
67        let line = line as usize;
68        if line >= self.line_starts.len() {
69            return text.len();
70        }
71        let ls = self.line_starts[line];
72        let le = if line + 1 < self.line_starts.len() {
73            let ns = self.line_starts[line + 1];
74            let mut end = ns.saturating_sub(1);
75            let b = text.as_bytes();
76            while end > ls && (b[end] == b'\n' || b[end] == b'\r') {
77                end = end.saturating_sub(1);
78            }
79            end + 1
80        } else {
81            text.len()
82        };
83        let lt = &text[ls..le];
84        let mut uc = 0;
85        let mut bo = 0;
86        for ch in lt.chars() {
87            if uc >= character as usize {
88                break;
89            }
90            uc += ch.len_utf16();
91            bo += ch.len_utf8();
92        }
93        ls + bo.min(lt.len())
94    }
95
96    /// Converts a byte offset in `rope` to `(line, column_utf16)`.
97    pub fn offset_to_position_rope(&self, rope: &Rope, offset: usize) -> (u32, u32) {
98        let offset = Self::normalize_rope_offset(rope, offset);
99        let line = self.line_starts.binary_search(&offset).unwrap_or_else(|i| i.saturating_sub(1));
100        let ls = self.line_starts[line];
101        (
102            line as u32,
103            rope.byte_slice(ls..offset).chars().map(|c| c.len_utf16()).sum::<usize>() as u32,
104        )
105    }
106
107    /// Clamp `offset` into `rope` and snap it back to a UTF-8 char boundary.
108    ///
109    /// `Rope::byte_slice` panics if the offset splits a multi-byte codepoint, so
110    /// the clamp here mirrors [`Self::normalize_text_offset`]. Ropey 1.x does
111    /// not expose `is_char_boundary` directly, so we inspect the byte at the
112    /// candidate offset: UTF-8 continuation bytes always satisfy `b & 0xC0 ==
113    /// 0x80` (top two bits are `10`), and every other byte is a codepoint
114    /// start.
115    fn normalize_rope_offset(rope: &Rope, offset: usize) -> usize {
116        let len = rope.len_bytes();
117        let mut normalized = offset.min(len);
118        while normalized > 0 && normalized < len && is_utf8_continuation(rope.byte(normalized)) {
119            normalized -= 1;
120        }
121        normalized
122    }
123
124    /// Converts `(line, column_utf16)` into a byte offset in `rope`.
125    pub fn position_to_offset_rope(&self, rope: &Rope, line: u32, character: u32) -> usize {
126        let line = line as usize;
127        if line >= self.line_starts.len() {
128            return rope.len_bytes();
129        }
130        let ls = self.line_starts[line];
131        let le = if line + 1 < self.line_starts.len() {
132            self.line_starts[line + 1]
133        } else {
134            rope.len_bytes()
135        };
136        let sl = rope.byte_slice(ls..le);
137        let mut uc = 0;
138        let mut bo = 0;
139        for ch in sl.chars() {
140            if uc >= character as usize {
141                break;
142            }
143            uc += ch.len_utf16();
144            bo += ch.len_utf8();
145        }
146        ls + bo
147    }
148}
149
/// Owns a source text together with a table of its line starts, so byte
/// offsets and `(line, UTF-16 column)` positions can be converted both ways.
#[derive(Debug, Clone)]
pub struct LineIndex {
    /// Byte offset at which each line begins; the first entry is always `0`.
    line_starts: Vec<usize>,
    /// The indexed source text.
    text: String,
}

impl LineIndex {
    /// Builds the index for `text`.
    ///
    /// `\n`, `\r\n` and a lone `\r` each end a line.
    pub fn new(text: String) -> Self {
        let mut line_starts = vec![0];
        {
            let bytes = text.as_bytes();
            for (pos, &byte) in bytes.iter().enumerate() {
                match byte {
                    b'\n' => line_starts.push(pos + 1),
                    // A CR directly followed by LF is left to the LF arm, which
                    // records the start after the complete CRLF pair.
                    b'\r' if bytes.get(pos + 1) != Some(&b'\n') => line_starts.push(pos + 1),
                    _ => {}
                }
            }
        }

        Self { line_starts, text }
    }

    /// Maps a byte offset to a 0-based `(line, UTF-16 column)` pair.
    ///
    /// The offset is first clamped into the text and moved back to a code
    /// point boundary.
    pub fn offset_to_position(&self, offset: usize) -> (u32, u32) {
        let offset = self.normalize_offset(offset);
        let line = match self.line_starts.binary_search(&offset) {
            Ok(exact) => exact,
            // Not a line start: the containing line is the one just before the
            // insertion point.
            Err(insert_at) => insert_at.saturating_sub(1),
        };
        let column = self.utf16_column(line, offset - self.line_starts[line]);

        (line as u32, column as u32)
    }

    /// Maps a `(line, UTF-16 column)` pair to a byte offset.
    ///
    /// Returns `None` when the line does not exist, when `character` lies
    /// beyond the line (the line terminator counts as part of the line), or
    /// when it points into the middle of a surrogate pair.
    pub fn position_to_offset(&self, line: u32, character: u32) -> Option<usize> {
        let line = line as usize;
        let line_start = *self.line_starts.get(line)?;
        // The line extends up to the next line start, terminator included; the
        // last line runs to the end of the text.
        let line_end = self.line_starts.get(line + 1).copied().unwrap_or(self.text.len());

        let within_line =
            self.utf16_to_byte_offset(&self.text[line_start..line_end], character as usize)?;

        Some(line_start + within_line)
    }

    /// Counts the UTF-16 code units between the start of `line` and the byte
    /// `byte_offset` bytes into it; yields `0` if that byte is past the text.
    fn utf16_column(&self, line: usize, byte_offset: usize) -> usize {
        let line_start = self.line_starts[line];
        let target_byte = line_start + byte_offset;
        if target_byte > self.text.len() {
            return 0;
        }

        let mut units = 0;
        for ch in self.text[line_start..target_byte].chars() {
            units += ch.len_utf16();
        }
        units
    }

    /// Finds the byte offset inside `line_text` that corresponds to
    /// `utf16_offset` code units, or `None` if no code point boundary sits at
    /// exactly that position.
    fn utf16_to_byte_offset(&self, line_text: &str, utf16_offset: usize) -> Option<usize> {
        let mut consumed = 0;
        let mut remaining = line_text.char_indices();

        // Consume whole characters until the requested column is reached.
        while consumed < utf16_offset {
            match remaining.next() {
                Some((_, ch)) => consumed += ch.len_utf16(),
                // Ran out of text before reaching the column.
                None => return None,
            }
        }

        if consumed > utf16_offset {
            // The column splits a character that needs two UTF-16 units.
            return None;
        }

        // The next character (if any) starts at the answer; otherwise the
        // column is exactly the end of the line.
        Some(remaining.next().map_or(line_text.len(), |(byte_offset, _)| byte_offset))
    }

    /// Clamps `offset` into the text and moves it back to the nearest UTF-8
    /// code point boundary at or before it.
    fn normalize_offset(&self, offset: usize) -> usize {
        let clamped = offset.min(self.text.len());
        (0..=clamped).rev().find(|&candidate| self.text.is_char_boundary(candidate)).unwrap_or(0)
    }

    /// Converts a pair of byte offsets into a `(start, end)` pair of positions.
    pub fn range(&self, start: usize, end: usize) -> ((u32, u32), (u32, u32)) {
        (self.offset_to_position(start), self.offset_to_position(end))
    }
}