Skip to main content

sevenmark_utils/
line_index.rs

1//! Line index for byte offset → LSP Position conversion
2//!
3//! Provides O(log n) line lookup via binary search on precomputed line starts,
4//! plus O(k) UTF-16 character offset calculation within the line.
5
6use line_span::LineSpans;
7use sevenmark_ast::Span;
8
9/// Precomputed line start offsets for fast byte-offset-to-position conversion.
10///
11/// Construction is O(n) where n is the text length.
12/// Each position lookup is O(log L + k) where L is the number of lines
13/// and k is the byte length from the line start to the target offset.
14pub struct LineIndex {
15    /// Byte offset of each line's first character.
16    /// Always non-empty: at minimum contains `[0]` for single-line text.
17    line_starts: Vec<usize>,
18}
19
20impl LineIndex {
21    /// Builds a line index from the given text in O(n).
22    pub fn new(text: &str) -> Self {
23        let line_starts: Vec<usize> = text.line_spans().map(|span| span.range().start).collect();
24
25        // Empty text still has one logical line starting at offset 0
26        if line_starts.is_empty() {
27            return Self {
28                line_starts: vec![0],
29            };
30        }
31
32        Self { line_starts }
33    }
34
35    /// Converts a byte offset to an LSP-compatible (line, character) pair.
36    ///
37    /// - `line`: 0-based line number
38    /// - `character`: 0-based UTF-16 code unit offset from line start
39    ///
40    /// If `offset` exceeds the text length, it is clamped to `text.len()`.
41    pub fn byte_offset_to_position(&self, text: &str, offset: usize) -> (u32, u32) {
42        let offset = offset.min(text.len());
43
44        // Binary search: find the last line whose start <= offset
45        // partition_point returns the first index where line_starts[i] > offset
46        let line = self.line_starts.partition_point(|&start| start <= offset);
47        // line is now 1 past the target, so subtract 1 (partition_point returns > 0
48        // because line_starts[0] == 0 <= any offset >= 0)
49        let line = line.saturating_sub(1);
50
51        let line_start = self.line_starts[line];
52        let character = utf16_len(&text[line_start..offset]);
53
54        (line as u32, character)
55    }
56
57    /// Converts a parser `Span` (byte offsets) to an LSP Range pair.
58    ///
59    /// Returns `((start_line, start_char), (end_line, end_char))`.
60    pub fn span_to_range(&self, text: &str, span: &Span) -> ((u32, u32), (u32, u32)) {
61        let start = self.byte_offset_to_position(text, span.start);
62        let end = self.byte_offset_to_position(text, span.end);
63        (start, end)
64    }
65
66    /// Converts an LSP Position (line, character in UTF-16 code units) to a byte offset.
67    ///
68    /// Returns `text.len()` if the position is beyond the end of the text.
69    pub fn position_to_byte_offset(&self, text: &str, line: u32, character: u32) -> usize {
70        let line = line as usize;
71        if line >= self.line_starts.len() {
72            return text.len();
73        }
74
75        let line_start = self.line_starts[line];
76        // Line ends at either the next line's start or text end
77        let line_end = self
78            .line_starts
79            .get(line + 1)
80            .copied()
81            .unwrap_or(text.len());
82        let line_text = &text[line_start..line_end];
83
84        // Walk the line counting UTF-16 code units until we reach `character`
85        let mut utf16_count = 0u32;
86        let mut byte_offset = 0;
87        for ch in line_text.chars() {
88            if utf16_count >= character {
89                break;
90            }
91            utf16_count += ch.len_utf16() as u32;
92            byte_offset += ch.len_utf8();
93        }
94
95        line_start + byte_offset
96    }
97}
98
99/// Counts the number of UTF-16 code units in a UTF-8 string slice.
100///
101/// Each BMP character (U+0000..U+FFFF) contributes 1 unit,
102/// each supplementary character (U+10000..) contributes 2 units (surrogate pair).
103#[inline]
104fn utf16_len(s: &str) -> u32 {
105    s.chars().map(|c| c.len_utf16() as u32).sum()
106}
107
108#[cfg(test)]
109mod tests {
110    use super::*;
111
112    #[test]
113    fn test_single_line_ascii() {
114        let text = "hello world";
115        let idx = LineIndex::new(text);
116        assert_eq!(idx.byte_offset_to_position(text, 0), (0, 0));
117        assert_eq!(idx.byte_offset_to_position(text, 5), (0, 5));
118        assert_eq!(idx.byte_offset_to_position(text, 11), (0, 11));
119    }
120
121    #[test]
122    fn test_multi_line_ascii() {
123        let text = "aaa\nbbb\nccc";
124        let idx = LineIndex::new(text);
125        // Line 0: "aaa" (bytes 0..3)
126        assert_eq!(idx.byte_offset_to_position(text, 0), (0, 0));
127        assert_eq!(idx.byte_offset_to_position(text, 2), (0, 2));
128        // Line 1: "bbb" (bytes 4..7)
129        assert_eq!(idx.byte_offset_to_position(text, 4), (1, 0));
130        assert_eq!(idx.byte_offset_to_position(text, 6), (1, 2));
131        // Line 2: "ccc" (bytes 8..11)
132        assert_eq!(idx.byte_offset_to_position(text, 8), (2, 0));
133        assert_eq!(idx.byte_offset_to_position(text, 11), (2, 3));
134    }
135
136    #[test]
137    fn test_newline_boundary() {
138        // The '\n' at byte 3 belongs to line 0
139        let text = "ab\ncd\nef";
140        let idx = LineIndex::new(text);
141        assert_eq!(idx.byte_offset_to_position(text, 2), (0, 2)); // '\n'
142        assert_eq!(idx.byte_offset_to_position(text, 3), (1, 0)); // 'c'
143        assert_eq!(idx.byte_offset_to_position(text, 5), (1, 2)); // '\n'
144        assert_eq!(idx.byte_offset_to_position(text, 6), (2, 0)); // 'e'
145    }
146
147    #[test]
148    fn test_korean_utf8() {
149        // "한글" = 6 bytes UTF-8, 2 UTF-16 code units (BMP chars)
150        let text = "한글";
151        let idx = LineIndex::new(text);
152        assert_eq!(idx.byte_offset_to_position(text, 0), (0, 0)); // '한' start
153        assert_eq!(idx.byte_offset_to_position(text, 3), (0, 1)); // '글' start
154        assert_eq!(idx.byte_offset_to_position(text, 6), (0, 2)); // end
155    }
156
157    #[test]
158    fn test_emoji_surrogate_pair() {
159        // "a🚀b" = 1 + 4 + 1 = 6 bytes
160        // UTF-16: 'a'=1, '🚀'=2 (surrogate pair), 'b'=1 → total 4
161        let text = "a🚀b";
162        let idx = LineIndex::new(text);
163        assert_eq!(idx.byte_offset_to_position(text, 0), (0, 0)); // 'a'
164        assert_eq!(idx.byte_offset_to_position(text, 1), (0, 1)); // '🚀' start
165        assert_eq!(idx.byte_offset_to_position(text, 5), (0, 3)); // 'b'
166        assert_eq!(idx.byte_offset_to_position(text, 6), (0, 4)); // end
167    }
168
169    #[test]
170    fn test_mixed_multiline_korean() {
171        // "한\n글" = 3 + 1 + 3 = 7 bytes
172        let text = "한\n글";
173        let idx = LineIndex::new(text);
174        assert_eq!(idx.byte_offset_to_position(text, 0), (0, 0)); // '한'
175        assert_eq!(idx.byte_offset_to_position(text, 3), (0, 1)); // '\n' (still line 0)
176        assert_eq!(idx.byte_offset_to_position(text, 4), (1, 0)); // '글'
177        assert_eq!(idx.byte_offset_to_position(text, 7), (1, 1)); // end
178    }
179
180    #[test]
181    fn test_empty_text() {
182        let text = "";
183        let idx = LineIndex::new(text);
184        assert_eq!(idx.byte_offset_to_position(text, 0), (0, 0));
185    }
186
187    #[test]
188    fn test_crlf() {
189        let text = "aa\r\nbb";
190        let idx = LineIndex::new(text);
191        assert_eq!(idx.byte_offset_to_position(text, 0), (0, 0)); // 'a'
192        assert_eq!(idx.byte_offset_to_position(text, 2), (0, 2)); // '\r'
193        assert_eq!(idx.byte_offset_to_position(text, 4), (1, 0)); // 'b'
194    }
195
196    #[test]
197    fn test_offset_clamped() {
198        let text = "abc";
199        let idx = LineIndex::new(text);
200        // Offset beyond text length gets clamped
201        assert_eq!(idx.byte_offset_to_position(text, 100), (0, 3));
202    }
203
204    #[test]
205    fn test_span_to_range() {
206        let text = "aaa\nbbb\nccc";
207        let idx = LineIndex::new(text);
208        let span = Span::new(4, 7); // "bbb" on line 1
209        let ((sl, sc), (el, ec)) = idx.span_to_range(text, &span);
210        assert_eq!((sl, sc), (1, 0));
211        assert_eq!((el, ec), (1, 3));
212    }
213
214    #[test]
215    fn test_span_across_lines() {
216        let text = "aaa\nbbb\nccc";
217        let idx = LineIndex::new(text);
218        let span = Span::new(2, 9); // from "a" on line 0 to "c" on line 2
219        let ((sl, sc), (el, ec)) = idx.span_to_range(text, &span);
220        assert_eq!((sl, sc), (0, 2));
221        assert_eq!((el, ec), (2, 1));
222    }
223
224    #[test]
225    fn test_trailing_newline() {
226        let text = "abc\n";
227        let idx = LineIndex::new(text);
228        assert_eq!(idx.byte_offset_to_position(text, 3), (0, 3)); // '\n'
229        assert_eq!(idx.byte_offset_to_position(text, 4), (0, 4)); // end (past newline)
230    }
231
232    #[test]
233    fn test_multiple_empty_lines() {
234        let text = "a\n\n\nb";
235        let idx = LineIndex::new(text);
236        assert_eq!(idx.byte_offset_to_position(text, 0), (0, 0)); // 'a'
237        assert_eq!(idx.byte_offset_to_position(text, 2), (1, 0)); // empty line 1
238        assert_eq!(idx.byte_offset_to_position(text, 3), (2, 0)); // empty line 2
239        assert_eq!(idx.byte_offset_to_position(text, 4), (3, 0)); // 'b'
240    }
241
242    // === position_to_byte_offset tests ===
243
244    #[test]
245    fn test_pos_to_byte_ascii() {
246        let text = "aaa\nbbb\nccc";
247        let idx = LineIndex::new(text);
248        assert_eq!(idx.position_to_byte_offset(text, 0, 0), 0);
249        assert_eq!(idx.position_to_byte_offset(text, 0, 2), 2);
250        assert_eq!(idx.position_to_byte_offset(text, 1, 0), 4);
251        assert_eq!(idx.position_to_byte_offset(text, 1, 2), 6);
252        assert_eq!(idx.position_to_byte_offset(text, 2, 0), 8);
253        assert_eq!(idx.position_to_byte_offset(text, 2, 3), 11);
254    }
255
256    #[test]
257    fn test_pos_to_byte_korean() {
258        // "한글" = 6 bytes, 2 UTF-16 code units
259        let text = "한글";
260        let idx = LineIndex::new(text);
261        assert_eq!(idx.position_to_byte_offset(text, 0, 0), 0);
262        assert_eq!(idx.position_to_byte_offset(text, 0, 1), 3); // after '한'
263        assert_eq!(idx.position_to_byte_offset(text, 0, 2), 6); // end
264    }
265
266    #[test]
267    fn test_pos_to_byte_emoji() {
268        // "a🚀b": UTF-16 positions: a=0, 🚀=1..2 (surrogate), b=3
269        let text = "a🚀b";
270        let idx = LineIndex::new(text);
271        assert_eq!(idx.position_to_byte_offset(text, 0, 0), 0); // 'a'
272        assert_eq!(idx.position_to_byte_offset(text, 0, 1), 1); // '🚀' start
273        assert_eq!(idx.position_to_byte_offset(text, 0, 3), 5); // 'b'
274        assert_eq!(idx.position_to_byte_offset(text, 0, 4), 6); // end
275    }
276
277    #[test]
278    fn test_pos_to_byte_roundtrip() {
279        let text = "Hello, 세계!\n🚀 rocket\nend";
280        let idx = LineIndex::new(text);
281
282        // Test roundtrip for several byte offsets
283        for byte_offset in [0, 1, 7, 10, 14, 15, 19, 23, 26] {
284            if byte_offset <= text.len() {
285                let (line, character) = idx.byte_offset_to_position(text, byte_offset);
286                let recovered = idx.position_to_byte_offset(text, line, character);
287                assert_eq!(
288                    recovered, byte_offset,
289                    "roundtrip failed for byte_offset={byte_offset}"
290                );
291            }
292        }
293    }
294
295    #[test]
296    fn test_pos_to_byte_beyond_end() {
297        let text = "abc";
298        let idx = LineIndex::new(text);
299        // Line beyond end → text.len()
300        assert_eq!(idx.position_to_byte_offset(text, 5, 0), text.len());
301        // Character beyond line end → clamps to line end
302        assert_eq!(idx.position_to_byte_offset(text, 0, 100), 3);
303    }
304}