Skip to main content

perl_line_index/
lib.rs

1//! Byte-oriented line/column indexing helpers.
2//!
3//! This crate has one responsibility: map byte offsets to `(line, column)`
4//! and back using cached line starts.
5
6#![deny(unsafe_code)]
7#![warn(rust_2018_idioms)]
8#![warn(missing_docs)]
9
10/// Line index for byte <-> (line, col) mapping.
11#[derive(Clone, Debug)]
12pub struct LineIndex {
13    /// Byte offset of each line start.
14    line_starts: Vec<usize>,
15    /// Total UTF-8 byte length of the indexed text.
16    text_len: usize,
17}
18
19impl LineIndex {
20    /// Build a line index from UTF-8 text.
21    #[must_use]
22    pub fn new(text: &str) -> Self {
23        let mut line_starts = vec![0];
24        for (idx, ch) in text.char_indices() {
25            if ch == '\n' {
26                line_starts.push(idx + 1);
27            }
28        }
29        Self { line_starts, text_len: text.len() }
30    }
31
32    /// Convert a byte offset to `(line, column)` using byte columns.
33    #[must_use]
34    pub fn byte_to_position(&self, byte: usize) -> (usize, usize) {
35        let line = self.line_starts.binary_search(&byte).unwrap_or_else(|i| i.saturating_sub(1));
36        let column = byte - self.line_starts[line];
37        (line, column)
38    }
39
40    /// Convert `(line, column)` back to byte offset.
41    ///
42    /// Returns `None` when the line is out of range or when the column extends
43    /// past the end of the line (including the newline character, but not the
44    /// start of the next line).
45    #[must_use]
46    pub fn position_to_byte(&self, line: usize, column: usize) -> Option<usize> {
47        let start = *self.line_starts.get(line)?;
48        // line_end is the last addressable byte on this line (the newline char for
49        // non-final lines, or text_len for the final line).  next_line_start itself
50        // belongs to the *next* line, so we subtract one.
51        let line_end = self
52            .line_starts
53            .get(line + 1)
54            .map_or(self.text_len, |next_start| next_start.saturating_sub(1));
55        let max_column = line_end.saturating_sub(start);
56
57        if column > max_column {
58            return None;
59        }
60
61        Some(start + column)
62    }
63
64    /// Convert `(line, column)` back to byte offset, returning `None` when
65    /// the column crosses the line boundary.
66    ///
67    /// The newline character at the end of a line is the last addressable
68    /// column on that line.  The byte at `next_line_start` belongs to the
69    /// *next* line and is therefore out of range.
70    #[must_use]
71    pub fn position_to_byte_checked(&self, line: usize, column: usize) -> Option<usize> {
72        let start = *self.line_starts.get(line)?;
73        // Subtract one from next_line_start so the newline byte is reachable
74        // but the first byte of the next line is not.
75        let line_end = self
76            .line_starts
77            .get(line + 1)
78            .map_or(self.text_len, |next_start| next_start.saturating_sub(1));
79        let max_column = line_end.saturating_sub(start);
80
81        if column > max_column {
82            return None;
83        }
84
85        Some(start + column)
86    }
87}
88
89#[cfg(test)]
90mod tests {
91    use super::*;
92
93    #[test]
94    fn empty_string_has_one_line() {
95        let idx = LineIndex::new("");
96        assert_eq!(idx.byte_to_position(0), (0, 0));
97        assert_eq!(idx.position_to_byte(0, 0), Some(0));
98        assert_eq!(idx.position_to_byte(1, 0), None);
99    }
100
101    #[test]
102    fn single_line_no_newline() {
103        let idx = LineIndex::new("hello");
104        assert_eq!(idx.byte_to_position(0), (0, 0));
105        assert_eq!(idx.byte_to_position(4), (0, 4));
106        assert_eq!(idx.position_to_byte(0, 0), Some(0));
107        assert_eq!(idx.position_to_byte(0, 4), Some(4));
108        assert_eq!(idx.position_to_byte(0, 5), Some(5));
109        assert_eq!(idx.position_to_byte(0, 6), None);
110    }
111
112    #[test]
113    fn two_lines_byte_to_position() {
114        // "ab\ncd"  bytes: a=0, b=1, \n=2, c=3, d=4
115        let idx = LineIndex::new("ab\ncd");
116        assert_eq!(idx.byte_to_position(0), (0, 0));
117        assert_eq!(idx.byte_to_position(1), (0, 1));
118        assert_eq!(idx.byte_to_position(2), (0, 2)); // the newline is on line 0
119        assert_eq!(idx.byte_to_position(3), (1, 0));
120        assert_eq!(idx.byte_to_position(4), (1, 1));
121    }
122
123    #[test]
124    fn two_lines_position_to_byte() {
125        let idx = LineIndex::new("ab\ncd");
126        assert_eq!(idx.position_to_byte(0, 0), Some(0));
127        assert_eq!(idx.position_to_byte(0, 2), Some(2)); // newline byte
128        assert_eq!(idx.position_to_byte(1, 0), Some(3));
129        assert_eq!(idx.position_to_byte(1, 1), Some(4));
130        assert_eq!(idx.position_to_byte(1, 2), Some(5)); // last line, end of text
131        assert_eq!(idx.position_to_byte(1, 3), None); // beyond text
132        assert_eq!(idx.position_to_byte(2, 0), None); // no third line
133    }
134
135    #[test]
136    fn position_to_byte_checked_excludes_newline_as_next_line_start() {
137        // "ab\ncd"
138        let idx = LineIndex::new("ab\ncd");
139        // Line 0 ends at the newline (byte 2); col 2 = newline byte is still on line 0
140        assert_eq!(idx.position_to_byte_checked(0, 2), Some(2));
141        // col 3 is the first byte of line 1 — out of range for line 0
142        assert_eq!(idx.position_to_byte_checked(0, 3), None);
143        assert_eq!(idx.position_to_byte_checked(1, 0), Some(3));
144        assert_eq!(idx.position_to_byte_checked(2, 0), None);
145    }
146
147    #[test]
148    fn trailing_newline_creates_empty_last_line() {
149        // "foo\n" — line 1 starts at byte 4 and is empty
150        let idx = LineIndex::new("foo\n");
151        assert_eq!(idx.byte_to_position(3), (0, 3)); // newline
152        assert_eq!(idx.byte_to_position(4), (1, 0)); // empty last line start
153        assert_eq!(idx.position_to_byte(1, 0), Some(4));
154    }
155
156    #[test]
157    fn multiple_lines_roundtrip() {
158        let text = "line0\nline1\nline2";
159        let idx = LineIndex::new(text);
160        for (byte, _) in text.char_indices() {
161            let (line, col) = idx.byte_to_position(byte);
162            assert_eq!(idx.position_to_byte(line, col), Some(byte));
163        }
164    }
165
166    #[test]
167    fn columns_are_byte_offsets_for_multibyte_chars() {
168        // "αβ\nγ" — α=0xCEB1 (2 bytes at 0,1), β=0xCEB2 (2 bytes at 2,3),
169        // \n at byte 4, γ=0xCEB3 (2 bytes at 5,6).
170        let text = "αβ\nγ";
171        let idx = LineIndex::new(text);
172        assert_eq!(idx.byte_to_position(0), (0, 0)); // start of α
173        assert_eq!(idx.byte_to_position(2), (0, 2)); // start of β (byte column, not char)
174        assert_eq!(idx.byte_to_position(4), (0, 4)); // newline
175        assert_eq!(idx.byte_to_position(5), (1, 0)); // start of γ
176        assert_eq!(idx.byte_to_position(6), (1, 1)); // continuation byte of γ
177        // Column accounting is in bytes — column 2 on line 0 is the *byte* offset of β.
178        assert_eq!(idx.position_to_byte(0, 2), Some(2));
179        assert_eq!(idx.position_to_byte(1, 0), Some(5));
180    }
181
182    #[test]
183    fn four_byte_emoji_at_line_boundary() {
184        // 😀 = U+1F600, encoded as F0 9F 98 80 (4 bytes).
185        // "a\n😀b" — a=0, \n=1, emoji=2..6, b=6.
186        let text = "a\n😀b";
187        let idx = LineIndex::new(text);
188        assert_eq!(idx.byte_to_position(2), (1, 0)); // first byte of emoji starts line 1
189        assert_eq!(idx.byte_to_position(5), (1, 3)); // last continuation byte of emoji
190        assert_eq!(idx.byte_to_position(6), (1, 4)); // 'b' immediately after emoji
191        assert_eq!(idx.position_to_byte(1, 0), Some(2));
192        assert_eq!(idx.position_to_byte(1, 4), Some(6));
193    }
194
195    #[test]
196    fn crlf_keeps_carriage_return_on_preceding_line() {
197        // Only \n terminates a line; \r is content. "a\r\nb": a=0, \r=1, \n=2, b=3.
198        let text = "a\r\nb";
199        let idx = LineIndex::new(text);
200        assert_eq!(idx.byte_to_position(0), (0, 0));
201        assert_eq!(idx.byte_to_position(1), (0, 1)); // \r still on line 0
202        assert_eq!(idx.byte_to_position(2), (0, 2)); // \n still on line 0
203        assert_eq!(idx.byte_to_position(3), (1, 0)); // b starts line 1
204        // position_to_byte_checked: line 0 ends at the \n (col 2), col 3 is the next line.
205        assert_eq!(idx.position_to_byte_checked(0, 2), Some(2));
206        assert_eq!(idx.position_to_byte_checked(0, 3), None);
207    }
208
209    #[test]
210    fn consecutive_newlines_create_empty_lines() {
211        // "\n\n\n" — four lines (0..3), each starting at byte 0,1,2,3 respectively;
212        // line 3 is empty trailing line.
213        let text = "\n\n\n";
214        let idx = LineIndex::new(text);
215        assert_eq!(idx.byte_to_position(0), (0, 0)); // newline of line 0
216        assert_eq!(idx.byte_to_position(1), (1, 0)); // newline of line 1
217        assert_eq!(idx.byte_to_position(2), (2, 0)); // newline of line 2
218        assert_eq!(idx.byte_to_position(3), (3, 0)); // empty trailing line
219        // Each non-final empty line addresses exactly one byte (its newline).
220        assert_eq!(idx.position_to_byte(1, 0), Some(1));
221        assert_eq!(idx.position_to_byte(1, 1), None); // past line 1's only byte
222        assert_eq!(idx.position_to_byte(3, 0), Some(3));
223        assert_eq!(idx.position_to_byte(3, 1), None);
224    }
225
226    #[test]
227    fn single_newline_is_two_lines() {
228        // "\n" — line 0 has just the newline (1 addressable byte); line 1 is empty.
229        let idx = LineIndex::new("\n");
230        assert_eq!(idx.byte_to_position(0), (0, 0));
231        assert_eq!(idx.byte_to_position(1), (1, 0));
232        assert_eq!(idx.position_to_byte(0, 0), Some(0));
233        assert_eq!(idx.position_to_byte(0, 1), None); // col 1 on line 0 is past the newline
234        assert_eq!(idx.position_to_byte(1, 0), Some(1));
235        assert_eq!(idx.position_to_byte_checked(0, 0), Some(0));
236        assert_eq!(idx.position_to_byte_checked(1, 0), Some(1));
237    }
238
239    #[test]
240    fn position_to_byte_checked_at_trailing_empty_line() {
241        // "foo\n" — line 1 is empty; col 0 is the only valid position.
242        let idx = LineIndex::new("foo\n");
243        assert_eq!(idx.position_to_byte_checked(1, 0), Some(4));
244        assert_eq!(idx.position_to_byte_checked(1, 1), None);
245        assert_eq!(idx.position_to_byte_checked(2, 0), None);
246    }
247
248    #[test]
249    fn unicode_roundtrip_at_every_char_boundary() {
250        // Roundtrip across multibyte boundaries — every char_indices() position
251        // must map back to itself.
252        let text = "héllo\n世界\n🎉end";
253        let idx = LineIndex::new(text);
254        for (byte, _) in text.char_indices() {
255            let (line, col) = idx.byte_to_position(byte);
256            assert_eq!(
257                idx.position_to_byte(line, col),
258                Some(byte),
259                "roundtrip failed at byte {byte}"
260            );
261        }
262    }
263
264    #[test]
265    fn checked_empty_string_origin_is_addressable() -> Result<(), Box<dyn std::error::Error>> {
266        let idx = LineIndex::new("");
267        assert_eq!(idx.position_to_byte_checked(0, 0), Some(0));
268        Ok(())
269    }
270
271    #[test]
272    fn checked_empty_string_col_one_out_of_range() -> Result<(), Box<dyn std::error::Error>> {
273        let idx = LineIndex::new("");
274        assert_eq!(idx.position_to_byte_checked(0, 1), None);
275        Ok(())
276    }
277
278    #[test]
279    fn checked_empty_string_line_one_out_of_range() -> Result<(), Box<dyn std::error::Error>> {
280        let idx = LineIndex::new("");
281        assert_eq!(idx.position_to_byte_checked(1, 0), None);
282        Ok(())
283    }
284
285    #[test]
286    fn checked_single_line_all_columns_in_range() -> Result<(), Box<dyn std::error::Error>> {
287        let idx = LineIndex::new("hello");
288        assert_eq!(idx.position_to_byte_checked(0, 0), Some(0));
289        assert_eq!(idx.position_to_byte_checked(0, 4), Some(4));
290        assert_eq!(idx.position_to_byte_checked(0, 5), Some(5));
291        Ok(())
292    }
293
294    #[test]
295    fn checked_single_line_col_beyond_text_len_is_none() -> Result<(), Box<dyn std::error::Error>> {
296        let idx = LineIndex::new("hello");
297        assert_eq!(idx.position_to_byte_checked(0, 6), None);
298        Ok(())
299    }
300
301    #[test]
302    fn checked_newline_byte_is_last_addressable_on_its_line()
303    -> Result<(), Box<dyn std::error::Error>> {
304        let idx = LineIndex::new("abc\ndef");
305        assert_eq!(idx.position_to_byte_checked(0, 3), Some(3));
306        Ok(())
307    }
308
309    #[test]
310    fn checked_next_line_start_is_not_addressable_on_current_line()
311    -> Result<(), Box<dyn std::error::Error>> {
312        let idx = LineIndex::new("abc\ndef");
313        assert_eq!(idx.position_to_byte_checked(0, 4), None);
314        assert_eq!(idx.position_to_byte_checked(0, 100), None);
315        Ok(())
316    }
317
318    #[test]
319    fn checked_trailing_newline_empty_final_line_origin() -> Result<(), Box<dyn std::error::Error>>
320    {
321        let idx = LineIndex::new("foo\n");
322        assert_eq!(idx.position_to_byte_checked(1, 0), Some(4));
323        Ok(())
324    }
325
326    #[test]
327    fn checked_trailing_newline_empty_final_line_col_one_is_none()
328    -> Result<(), Box<dyn std::error::Error>> {
329        let idx = LineIndex::new("foo\n");
330        assert_eq!(idx.position_to_byte_checked(1, 1), None);
331        Ok(())
332    }
333
334    #[test]
335    fn checked_crlf_cr_byte_addressable_on_its_line() -> Result<(), Box<dyn std::error::Error>> {
336        let idx = LineIndex::new("ab\r\ncd");
337        assert_eq!(idx.position_to_byte_checked(0, 2), Some(2));
338        assert_eq!(idx.position_to_byte_checked(0, 3), Some(3));
339        assert_eq!(idx.position_to_byte_checked(0, 4), None);
340        Ok(())
341    }
342
343    #[test]
344    fn checked_crlf_second_line() -> Result<(), Box<dyn std::error::Error>> {
345        let idx = LineIndex::new("ab\r\ncd");
346        assert_eq!(idx.position_to_byte_checked(1, 0), Some(4));
347        assert_eq!(idx.position_to_byte_checked(1, 1), Some(5));
348        assert_eq!(idx.position_to_byte_checked(1, 2), Some(6));
349        assert_eq!(idx.position_to_byte_checked(1, 3), None);
350        Ok(())
351    }
352
353    #[test]
354    fn checked_unicode_two_byte_char_boundary() -> Result<(), Box<dyn std::error::Error>> {
355        let idx = LineIndex::new("caf\u{00e9}");
356        assert_eq!(idx.position_to_byte_checked(0, 3), Some(3));
357        assert_eq!(idx.position_to_byte_checked(0, 4), Some(4));
358        assert_eq!(idx.position_to_byte_checked(0, 5), Some(5));
359        assert_eq!(idx.position_to_byte_checked(0, 6), None);
360        Ok(())
361    }
362
363    #[test]
364    fn checked_unicode_multiline_second_line_boundary() -> Result<(), Box<dyn std::error::Error>> {
365        let idx = LineIndex::new("a\u{00e9}\nb");
366        assert_eq!(idx.position_to_byte_checked(0, 3), Some(3));
367        assert_eq!(idx.position_to_byte_checked(0, 4), None);
368        assert_eq!(idx.position_to_byte_checked(1, 0), Some(4));
369        assert_eq!(idx.position_to_byte_checked(1, 1), Some(5));
370        assert_eq!(idx.position_to_byte_checked(1, 2), None);
371        Ok(())
372    }
373
374    #[test]
375    fn checked_and_unchecked_agree_on_final_line() -> Result<(), Box<dyn std::error::Error>> {
376        let idx = LineIndex::new("abc\nxyz");
377        for col in 0..=5 {
378            assert_eq!(
379                idx.position_to_byte(1, col),
380                idx.position_to_byte_checked(1, col),
381                "methods diverged at col {col} on final line"
382            );
383        }
384        Ok(())
385    }
386
387    #[test]
388    fn byte_to_position_at_text_len_single_line() -> Result<(), Box<dyn std::error::Error>> {
389        let idx = LineIndex::new("hi");
390        assert_eq!(idx.byte_to_position(2), (0, 2));
391        Ok(())
392    }
393
394    #[test]
395    fn byte_to_position_at_text_len_multiline() -> Result<(), Box<dyn std::error::Error>> {
396        let idx = LineIndex::new("a\nb");
397        assert_eq!(idx.byte_to_position(3), (1, 1));
398        Ok(())
399    }
400
401    #[test]
402    fn clone_preserves_index_state() -> Result<(), Box<dyn std::error::Error>> {
403        let idx = LineIndex::new("foo\nbar");
404        let cloned = idx.clone();
405        assert_eq!(idx.byte_to_position(4), cloned.byte_to_position(4));
406        assert_eq!(idx.position_to_byte(1, 0), cloned.position_to_byte(1, 0));
407        Ok(())
408    }
409
410    #[test]
411    fn debug_format_is_non_empty() -> Result<(), Box<dyn std::error::Error>> {
412        let idx = LineIndex::new("test\ndata");
413        let debug_str = format!("{idx:?}");
414        assert!(!debug_str.is_empty());
415        Ok(())
416    }
417
418    #[test]
419    fn consecutive_newlines_produce_empty_lines() -> Result<(), Box<dyn std::error::Error>> {
420        let idx = LineIndex::new("\n\n");
421        assert_eq!(idx.byte_to_position(0), (0, 0));
422        assert_eq!(idx.byte_to_position(1), (1, 0));
423        assert_eq!(idx.byte_to_position(2), (2, 0));
424        assert_eq!(idx.position_to_byte(0, 0), Some(0));
425        assert_eq!(idx.position_to_byte(1, 0), Some(1));
426        assert_eq!(idx.position_to_byte(2, 0), Some(2));
427        assert_eq!(idx.position_to_byte(3, 0), None);
428        Ok(())
429    }
430}