Skip to main content

perl_line_index/
lib.rs

1//! Byte-oriented line/column indexing helpers.
2//!
3//! This crate has one responsibility: map byte offsets to `(line, column)`
4//! and back using cached line starts.
5
6#![deny(unsafe_code)]
7#![warn(rust_2018_idioms)]
8#![warn(missing_docs)]
9
10/// Line index for byte <-> (line, col) mapping.
11#[derive(Clone, Debug)]
12pub struct LineIndex {
13    /// Byte offset of each line start.
14    line_starts: Vec<usize>,
15    /// Total UTF-8 byte length of the indexed text.
16    text_len: usize,
17}
18
19impl LineIndex {
20    /// Build a line index from UTF-8 text.
21    #[must_use]
22    pub fn new(text: &str) -> Self {
23        let mut line_starts = vec![0];
24        for (idx, ch) in text.char_indices() {
25            if ch == '\n' {
26                line_starts.push(idx + 1);
27            }
28        }
29        Self { line_starts, text_len: text.len() }
30    }
31
32    /// Convert a byte offset to `(line, column)` using byte columns.
33    #[must_use]
34    pub fn byte_to_position(&self, byte: usize) -> (usize, usize) {
35        let line = self.line_starts.binary_search(&byte).unwrap_or_else(|i| i.saturating_sub(1));
36        let column = byte - self.line_starts[line];
37        (line, column)
38    }
39
40    /// Convert `(line, column)` back to byte offset.
41    ///
42    /// Returns `None` when the line is out of range or when the column extends
43    /// past the end of the line (including the newline character, but not the
44    /// start of the next line).
45    #[must_use]
46    pub fn position_to_byte(&self, line: usize, column: usize) -> Option<usize> {
47        let start = *self.line_starts.get(line)?;
48        // line_end is the last addressable byte on this line (the newline char for
49        // non-final lines, or text_len for the final line).  next_line_start itself
50        // belongs to the *next* line, so we subtract one.
51        let line_end = self
52            .line_starts
53            .get(line + 1)
54            .map_or(self.text_len, |next_start| next_start.saturating_sub(1));
55        let max_column = line_end.saturating_sub(start);
56
57        if column > max_column {
58            return None;
59        }
60
61        Some(start + column)
62    }
63
64    /// Convert `(line, column)` back to byte offset where `column` is a
65    /// **UTF-16 code unit** offset (the unit used by the LSP `Position.character`
66    /// field).
67    ///
68    /// This is the LSP-safe counterpart to [`position_to_byte`], which uses
69    /// raw byte columns.  On lines that contain only ASCII, the two functions
70    /// return identical results.  On lines with multibyte UTF-8 characters the
71    /// byte offset can be larger than the UTF-16 column number.
72    ///
73    /// # Parameters
74    ///
75    /// - `text` — the original UTF-8 source text that was used to build this
76    ///   index.  The caller must pass the same text that was given to [`new`].
77    /// - `line` — 0-based line number.
78    /// - `column` — 0-based UTF-16 code unit offset from the start of the line.
79    ///
80    /// # Return value
81    ///
82    /// Returns `None` when:
83    /// - `line` is out of range (same as [`position_to_byte`]).
84    /// - `column` is past the end of the line (UTF-16 length of the line text).
85    /// - `column` points into the middle of a UTF-16 surrogate pair (the
86    ///   interior of a supplementary character, U+10000..=U+10FFFF).
87    ///
88    /// The returned byte offset is always on a UTF-8 character boundary.
89    ///
90    /// [`new`]: Self::new
91    /// [`position_to_byte`]: Self::position_to_byte
92    #[must_use]
93    pub fn position_to_byte_utf16(&self, text: &str, line: usize, column: usize) -> Option<usize> {
94        let line_start = *self.line_starts.get(line)?;
95        // Determine where this line's content ends (exclusive).  For non-final
96        // lines that includes the '\n' byte itself so positions pointing at the
97        // newline are accepted.  For the final line we use text_len.
98        let line_end = self.line_starts.get(line + 1).copied().unwrap_or(self.text_len);
99        let line_text = text.get(line_start..line_end)?;
100
101        // Walk the line, accumulating UTF-16 units until we reach `column`.
102        let mut utf16_units: usize = 0;
103        for (byte_offset, ch) in line_text.char_indices() {
104            if utf16_units == column {
105                return Some(line_start + byte_offset);
106            }
107            let ch_utf16 = ch.len_utf16();
108            // Guard against column landing in the middle of a surrogate pair.
109            if utf16_units + ch_utf16 > column {
110                return None;
111            }
112            utf16_units += ch_utf16;
113        }
114
115        // `column` == total UTF-16 length of the line is the one-past-end
116        // position (valid for range ends).
117        if utf16_units == column { Some(line_start + line_text.len()) } else { None }
118    }
119
120    /// Convert `(line, column)` back to byte offset, returning `None` when
121    /// the column crosses the line boundary.
122    ///
123    /// The newline character at the end of a line is the last addressable
124    /// column on that line.  The byte at `next_line_start` belongs to the
125    /// *next* line and is therefore out of range.
126    #[must_use]
127    pub fn position_to_byte_checked(&self, line: usize, column: usize) -> Option<usize> {
128        let start = *self.line_starts.get(line)?;
129        // Subtract one from next_line_start so the newline byte is reachable
130        // but the first byte of the next line is not.
131        let line_end = self
132            .line_starts
133            .get(line + 1)
134            .map_or(self.text_len, |next_start| next_start.saturating_sub(1));
135        let max_column = line_end.saturating_sub(start);
136
137        if column > max_column {
138            return None;
139        }
140
141        Some(start + column)
142    }
143}
144
145#[cfg(test)]
146mod tests {
147    use super::*;
148
149    #[test]
150    fn empty_string_has_one_line() {
151        let idx = LineIndex::new("");
152        assert_eq!(idx.byte_to_position(0), (0, 0));
153        assert_eq!(idx.position_to_byte(0, 0), Some(0));
154        assert_eq!(idx.position_to_byte(1, 0), None);
155    }
156
157    #[test]
158    fn single_line_no_newline() {
159        let idx = LineIndex::new("hello");
160        assert_eq!(idx.byte_to_position(0), (0, 0));
161        assert_eq!(idx.byte_to_position(4), (0, 4));
162        assert_eq!(idx.position_to_byte(0, 0), Some(0));
163        assert_eq!(idx.position_to_byte(0, 4), Some(4));
164        assert_eq!(idx.position_to_byte(0, 5), Some(5));
165        assert_eq!(idx.position_to_byte(0, 6), None);
166    }
167
168    #[test]
169    fn two_lines_byte_to_position() {
170        // "ab\ncd"  bytes: a=0, b=1, \n=2, c=3, d=4
171        let idx = LineIndex::new("ab\ncd");
172        assert_eq!(idx.byte_to_position(0), (0, 0));
173        assert_eq!(idx.byte_to_position(1), (0, 1));
174        assert_eq!(idx.byte_to_position(2), (0, 2)); // the newline is on line 0
175        assert_eq!(idx.byte_to_position(3), (1, 0));
176        assert_eq!(idx.byte_to_position(4), (1, 1));
177    }
178
179    #[test]
180    fn two_lines_position_to_byte() {
181        let idx = LineIndex::new("ab\ncd");
182        assert_eq!(idx.position_to_byte(0, 0), Some(0));
183        assert_eq!(idx.position_to_byte(0, 2), Some(2)); // newline byte
184        assert_eq!(idx.position_to_byte(1, 0), Some(3));
185        assert_eq!(idx.position_to_byte(1, 1), Some(4));
186        assert_eq!(idx.position_to_byte(1, 2), Some(5)); // last line, end of text
187        assert_eq!(idx.position_to_byte(1, 3), None); // beyond text
188        assert_eq!(idx.position_to_byte(2, 0), None); // no third line
189    }
190
191    #[test]
192    fn position_to_byte_checked_excludes_newline_as_next_line_start() {
193        // "ab\ncd"
194        let idx = LineIndex::new("ab\ncd");
195        // Line 0 ends at the newline (byte 2); col 2 = newline byte is still on line 0
196        assert_eq!(idx.position_to_byte_checked(0, 2), Some(2));
197        // col 3 is the first byte of line 1 — out of range for line 0
198        assert_eq!(idx.position_to_byte_checked(0, 3), None);
199        assert_eq!(idx.position_to_byte_checked(1, 0), Some(3));
200        assert_eq!(idx.position_to_byte_checked(2, 0), None);
201    }
202
203    #[test]
204    fn trailing_newline_creates_empty_last_line() {
205        // "foo\n" — line 1 starts at byte 4 and is empty
206        let idx = LineIndex::new("foo\n");
207        assert_eq!(idx.byte_to_position(3), (0, 3)); // newline
208        assert_eq!(idx.byte_to_position(4), (1, 0)); // empty last line start
209        assert_eq!(idx.position_to_byte(1, 0), Some(4));
210    }
211
212    #[test]
213    fn multiple_lines_roundtrip() {
214        let text = "line0\nline1\nline2";
215        let idx = LineIndex::new(text);
216        for (byte, _) in text.char_indices() {
217            let (line, col) = idx.byte_to_position(byte);
218            assert_eq!(idx.position_to_byte(line, col), Some(byte));
219        }
220    }
221
222    #[test]
223    fn columns_are_byte_offsets_for_multibyte_chars() {
224        // "αβ\nγ" — α=0xCEB1 (2 bytes at 0,1), β=0xCEB2 (2 bytes at 2,3),
225        // \n at byte 4, γ=0xCEB3 (2 bytes at 5,6).
226        let text = "αβ\nγ";
227        let idx = LineIndex::new(text);
228        assert_eq!(idx.byte_to_position(0), (0, 0)); // start of α
229        assert_eq!(idx.byte_to_position(2), (0, 2)); // start of β (byte column, not char)
230        assert_eq!(idx.byte_to_position(4), (0, 4)); // newline
231        assert_eq!(idx.byte_to_position(5), (1, 0)); // start of γ
232        assert_eq!(idx.byte_to_position(6), (1, 1)); // continuation byte of γ
233        // Column accounting is in bytes — column 2 on line 0 is the *byte* offset of β.
234        assert_eq!(idx.position_to_byte(0, 2), Some(2));
235        assert_eq!(idx.position_to_byte(1, 0), Some(5));
236    }
237
238    #[test]
239    fn four_byte_emoji_at_line_boundary() {
240        // 😀 = U+1F600, encoded as F0 9F 98 80 (4 bytes).
241        // "a\n😀b" — a=0, \n=1, emoji=2..6, b=6.
242        let text = "a\n😀b";
243        let idx = LineIndex::new(text);
244        assert_eq!(idx.byte_to_position(2), (1, 0)); // first byte of emoji starts line 1
245        assert_eq!(idx.byte_to_position(5), (1, 3)); // last continuation byte of emoji
246        assert_eq!(idx.byte_to_position(6), (1, 4)); // 'b' immediately after emoji
247        assert_eq!(idx.position_to_byte(1, 0), Some(2));
248        assert_eq!(idx.position_to_byte(1, 4), Some(6));
249    }
250
251    #[test]
252    fn crlf_keeps_carriage_return_on_preceding_line() {
253        // Only \n terminates a line; \r is content. "a\r\nb": a=0, \r=1, \n=2, b=3.
254        let text = "a\r\nb";
255        let idx = LineIndex::new(text);
256        assert_eq!(idx.byte_to_position(0), (0, 0));
257        assert_eq!(idx.byte_to_position(1), (0, 1)); // \r still on line 0
258        assert_eq!(idx.byte_to_position(2), (0, 2)); // \n still on line 0
259        assert_eq!(idx.byte_to_position(3), (1, 0)); // b starts line 1
260        // position_to_byte_checked: line 0 ends at the \n (col 2), col 3 is the next line.
261        assert_eq!(idx.position_to_byte_checked(0, 2), Some(2));
262        assert_eq!(idx.position_to_byte_checked(0, 3), None);
263    }
264
265    #[test]
266    fn consecutive_newlines_create_empty_lines() {
267        // "\n\n\n" — four lines (0..3), each starting at byte 0,1,2,3 respectively;
268        // line 3 is empty trailing line.
269        let text = "\n\n\n";
270        let idx = LineIndex::new(text);
271        assert_eq!(idx.byte_to_position(0), (0, 0)); // newline of line 0
272        assert_eq!(idx.byte_to_position(1), (1, 0)); // newline of line 1
273        assert_eq!(idx.byte_to_position(2), (2, 0)); // newline of line 2
274        assert_eq!(idx.byte_to_position(3), (3, 0)); // empty trailing line
275        // Each non-final empty line addresses exactly one byte (its newline).
276        assert_eq!(idx.position_to_byte(1, 0), Some(1));
277        assert_eq!(idx.position_to_byte(1, 1), None); // past line 1's only byte
278        assert_eq!(idx.position_to_byte(3, 0), Some(3));
279        assert_eq!(idx.position_to_byte(3, 1), None);
280    }
281
282    #[test]
283    fn single_newline_is_two_lines() {
284        // "\n" — line 0 has just the newline (1 addressable byte); line 1 is empty.
285        let idx = LineIndex::new("\n");
286        assert_eq!(idx.byte_to_position(0), (0, 0));
287        assert_eq!(idx.byte_to_position(1), (1, 0));
288        assert_eq!(idx.position_to_byte(0, 0), Some(0));
289        assert_eq!(idx.position_to_byte(0, 1), None); // col 1 on line 0 is past the newline
290        assert_eq!(idx.position_to_byte(1, 0), Some(1));
291        assert_eq!(idx.position_to_byte_checked(0, 0), Some(0));
292        assert_eq!(idx.position_to_byte_checked(1, 0), Some(1));
293    }
294
295    #[test]
296    fn position_to_byte_checked_at_trailing_empty_line() {
297        // "foo\n" — line 1 is empty; col 0 is the only valid position.
298        let idx = LineIndex::new("foo\n");
299        assert_eq!(idx.position_to_byte_checked(1, 0), Some(4));
300        assert_eq!(idx.position_to_byte_checked(1, 1), None);
301        assert_eq!(idx.position_to_byte_checked(2, 0), None);
302    }
303
304    #[test]
305    fn unicode_roundtrip_at_every_char_boundary() {
306        // Roundtrip across multibyte boundaries — every char_indices() position
307        // must map back to itself.
308        let text = "héllo\n世界\n🎉end";
309        let idx = LineIndex::new(text);
310        for (byte, _) in text.char_indices() {
311            let (line, col) = idx.byte_to_position(byte);
312            assert_eq!(
313                idx.position_to_byte(line, col),
314                Some(byte),
315                "roundtrip failed at byte {byte}"
316            );
317        }
318    }
319
320    #[test]
321    fn checked_empty_string_origin_is_addressable() -> Result<(), Box<dyn std::error::Error>> {
322        let idx = LineIndex::new("");
323        assert_eq!(idx.position_to_byte_checked(0, 0), Some(0));
324        Ok(())
325    }
326
327    #[test]
328    fn checked_empty_string_col_one_out_of_range() -> Result<(), Box<dyn std::error::Error>> {
329        let idx = LineIndex::new("");
330        assert_eq!(idx.position_to_byte_checked(0, 1), None);
331        Ok(())
332    }
333
334    #[test]
335    fn checked_empty_string_line_one_out_of_range() -> Result<(), Box<dyn std::error::Error>> {
336        let idx = LineIndex::new("");
337        assert_eq!(idx.position_to_byte_checked(1, 0), None);
338        Ok(())
339    }
340
341    #[test]
342    fn checked_single_line_all_columns_in_range() -> Result<(), Box<dyn std::error::Error>> {
343        let idx = LineIndex::new("hello");
344        assert_eq!(idx.position_to_byte_checked(0, 0), Some(0));
345        assert_eq!(idx.position_to_byte_checked(0, 4), Some(4));
346        assert_eq!(idx.position_to_byte_checked(0, 5), Some(5));
347        Ok(())
348    }
349
350    #[test]
351    fn checked_single_line_col_beyond_text_len_is_none() -> Result<(), Box<dyn std::error::Error>> {
352        let idx = LineIndex::new("hello");
353        assert_eq!(idx.position_to_byte_checked(0, 6), None);
354        Ok(())
355    }
356
357    #[test]
358    fn checked_newline_byte_is_last_addressable_on_its_line()
359    -> Result<(), Box<dyn std::error::Error>> {
360        let idx = LineIndex::new("abc\ndef");
361        assert_eq!(idx.position_to_byte_checked(0, 3), Some(3));
362        Ok(())
363    }
364
365    #[test]
366    fn checked_next_line_start_is_not_addressable_on_current_line()
367    -> Result<(), Box<dyn std::error::Error>> {
368        let idx = LineIndex::new("abc\ndef");
369        assert_eq!(idx.position_to_byte_checked(0, 4), None);
370        assert_eq!(idx.position_to_byte_checked(0, 100), None);
371        Ok(())
372    }
373
374    #[test]
375    fn checked_trailing_newline_empty_final_line_origin() -> Result<(), Box<dyn std::error::Error>>
376    {
377        let idx = LineIndex::new("foo\n");
378        assert_eq!(idx.position_to_byte_checked(1, 0), Some(4));
379        Ok(())
380    }
381
382    #[test]
383    fn checked_trailing_newline_empty_final_line_col_one_is_none()
384    -> Result<(), Box<dyn std::error::Error>> {
385        let idx = LineIndex::new("foo\n");
386        assert_eq!(idx.position_to_byte_checked(1, 1), None);
387        Ok(())
388    }
389
390    #[test]
391    fn checked_crlf_cr_byte_addressable_on_its_line() -> Result<(), Box<dyn std::error::Error>> {
392        let idx = LineIndex::new("ab\r\ncd");
393        assert_eq!(idx.position_to_byte_checked(0, 2), Some(2));
394        assert_eq!(idx.position_to_byte_checked(0, 3), Some(3));
395        assert_eq!(idx.position_to_byte_checked(0, 4), None);
396        Ok(())
397    }
398
399    #[test]
400    fn checked_crlf_second_line() -> Result<(), Box<dyn std::error::Error>> {
401        let idx = LineIndex::new("ab\r\ncd");
402        assert_eq!(idx.position_to_byte_checked(1, 0), Some(4));
403        assert_eq!(idx.position_to_byte_checked(1, 1), Some(5));
404        assert_eq!(idx.position_to_byte_checked(1, 2), Some(6));
405        assert_eq!(idx.position_to_byte_checked(1, 3), None);
406        Ok(())
407    }
408
409    #[test]
410    fn checked_unicode_two_byte_char_boundary() -> Result<(), Box<dyn std::error::Error>> {
411        let idx = LineIndex::new("caf\u{00e9}");
412        assert_eq!(idx.position_to_byte_checked(0, 3), Some(3));
413        assert_eq!(idx.position_to_byte_checked(0, 4), Some(4));
414        assert_eq!(idx.position_to_byte_checked(0, 5), Some(5));
415        assert_eq!(idx.position_to_byte_checked(0, 6), None);
416        Ok(())
417    }
418
419    #[test]
420    fn checked_unicode_multiline_second_line_boundary() -> Result<(), Box<dyn std::error::Error>> {
421        let idx = LineIndex::new("a\u{00e9}\nb");
422        assert_eq!(idx.position_to_byte_checked(0, 3), Some(3));
423        assert_eq!(idx.position_to_byte_checked(0, 4), None);
424        assert_eq!(idx.position_to_byte_checked(1, 0), Some(4));
425        assert_eq!(idx.position_to_byte_checked(1, 1), Some(5));
426        assert_eq!(idx.position_to_byte_checked(1, 2), None);
427        Ok(())
428    }
429
430    #[test]
431    fn checked_and_unchecked_agree_on_final_line() -> Result<(), Box<dyn std::error::Error>> {
432        let idx = LineIndex::new("abc\nxyz");
433        for col in 0..=5 {
434            assert_eq!(
435                idx.position_to_byte(1, col),
436                idx.position_to_byte_checked(1, col),
437                "methods diverged at col {col} on final line"
438            );
439        }
440        Ok(())
441    }
442
443    #[test]
444    fn byte_to_position_at_text_len_single_line() -> Result<(), Box<dyn std::error::Error>> {
445        let idx = LineIndex::new("hi");
446        assert_eq!(idx.byte_to_position(2), (0, 2));
447        Ok(())
448    }
449
450    #[test]
451    fn byte_to_position_at_text_len_multiline() -> Result<(), Box<dyn std::error::Error>> {
452        let idx = LineIndex::new("a\nb");
453        assert_eq!(idx.byte_to_position(3), (1, 1));
454        Ok(())
455    }
456
457    #[test]
458    fn clone_preserves_index_state() -> Result<(), Box<dyn std::error::Error>> {
459        let idx = LineIndex::new("foo\nbar");
460        let cloned = idx.clone();
461        assert_eq!(idx.byte_to_position(4), cloned.byte_to_position(4));
462        assert_eq!(idx.position_to_byte(1, 0), cloned.position_to_byte(1, 0));
463        Ok(())
464    }
465
466    #[test]
467    fn debug_format_is_non_empty() -> Result<(), Box<dyn std::error::Error>> {
468        let idx = LineIndex::new("test\ndata");
469        let debug_str = format!("{idx:?}");
470        assert!(!debug_str.is_empty());
471        Ok(())
472    }
473
474    #[test]
475    fn consecutive_newlines_produce_empty_lines() -> Result<(), Box<dyn std::error::Error>> {
476        let idx = LineIndex::new("\n\n");
477        assert_eq!(idx.byte_to_position(0), (0, 0));
478        assert_eq!(idx.byte_to_position(1), (1, 0));
479        assert_eq!(idx.byte_to_position(2), (2, 0));
480        assert_eq!(idx.position_to_byte(0, 0), Some(0));
481        assert_eq!(idx.position_to_byte(1, 0), Some(1));
482        assert_eq!(idx.position_to_byte(2, 0), Some(2));
483        assert_eq!(idx.position_to_byte(3, 0), None);
484        Ok(())
485    }
486}