Skip to main content

cyrs_syntax/
line_index.rs

1//! `LineIndex` — byte-offset → line/column conversion for diagnostic rendering.
2//!
3//! Spec 0001 §4.5. Offsets into source code are UTF-8 byte [`TextSize`]
4//! values (via `text-size`). [`LineIndex`] pre-computes line start positions
5//! so that every later lookup is `O(log lines)`. A separate table of wide
6//! characters per line lets us cheaply convert UTF-8 column offsets to
7//! UTF-16 code units — the conversion used at the LSP boundary.
8//!
9//! The implementation mirrors rust-analyzer's `ide_db::line_index` (§23
10//! defers to rust-analyzer house style) but is owned in-tree, since
11//! rust-analyzer's crate is not a dependency.
12
13use text_size::{TextRange, TextSize};
14
15/// A position in source code: 0-based line and 0-based UTF-8 column (byte
16/// offset within the line). Multi-byte characters occupy multiple columns.
17#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
18pub struct LineCol {
19    /// 0-based line number.
20    pub line: u32,
21    /// UTF-8 byte offset within the line (not UTF-16 code units).
22    pub col: u32,
23}
24
25/// A position in LSP-flavoured UTF-16 code units. Used ONLY at the LSP
26/// boundary (spec §4.5). Do not propagate inside the analyser — the rest of
27/// the compiler uses [`LineCol`] + UTF-8 offsets.
28#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
29pub struct WideLineCol {
30    /// 0-based line number.
31    pub line: u32,
32    /// UTF-16 code-unit offset within the line.
33    pub col: u32,
34}
35
36/// Line-start table over the original source.
37#[derive(Debug, Clone)]
38pub struct LineIndex {
39    /// Byte offset of the start of each line. `newlines[0] == 0`.
40    newlines: Vec<TextSize>,
41    /// For each line, a sorted list of entries for every non-ASCII char on
42    /// that line. Empty for pure-ASCII lines.
43    wide_chars: Vec<Vec<WideChar>>,
44}
45
46#[derive(Debug, Clone, Copy)]
47struct WideChar {
48    /// UTF-8 byte offset within the line.
49    start: TextSize,
50    /// UTF-8 byte length of the character.
51    len: TextSize,
52}
53
54impl LineIndex {
55    /// Build a `LineIndex` from full source text. `O(n)` in input length.
56    #[must_use]
57    pub fn new(text: &str) -> Self {
58        let mut newlines = vec![TextSize::from(0)];
59        let mut wide_chars: Vec<Vec<WideChar>> = vec![Vec::new()];
60        let mut line_start = TextSize::from(0);
61
62        for (offset, ch) in text.char_indices() {
63            let offset = TextSize::try_from(offset).expect("source fits in u32 bytes");
64            if ch == '\n' {
65                let next_line_start = offset + TextSize::of('\n');
66                newlines.push(next_line_start);
67                wide_chars.push(Vec::new());
68                line_start = next_line_start;
69                continue;
70            }
71            if !ch.is_ascii() {
72                let start_in_line = offset - line_start;
73                wide_chars
74                    .last_mut()
75                    .expect("at least one line always exists")
76                    .push(WideChar {
77                        start: start_in_line,
78                        len: TextSize::of(ch),
79                    });
80            }
81        }
82
83        Self {
84            newlines,
85            wide_chars,
86        }
87    }
88
89    /// Convert a byte offset to 0-based line/column (UTF-8).
90    ///
91    /// If `offset` falls inside a multi-byte character the result reports
92    /// the mechanical byte-offset within the line — callers that need
93    /// char-aligned columns are responsible for snapping.
94    ///
95    /// # Panics
96    /// Panics if the computed line index does not fit in `u32`, which
97    /// requires a source file with more than `u32::MAX` lines — impossible
98    /// for any input that also fits in a `TextSize`.
99    #[must_use]
100    pub fn line_col(&self, offset: TextSize) -> LineCol {
101        let line = self
102            .newlines
103            .partition_point(|&start| start <= offset)
104            .saturating_sub(1);
105        let line_start = self.newlines[line];
106        let col = offset - line_start;
107        LineCol {
108            line: u32::try_from(line).expect("line count fits in u32"),
109            col: u32::from(col),
110        }
111    }
112
113    /// Convert a [`LineCol`] (UTF-8) to the LSP UTF-16 flavour.
114    #[must_use]
115    pub fn to_utf16(&self, pos: LineCol) -> WideLineCol {
116        let line = pos.line as usize;
117        if line >= self.wide_chars.len() || self.wide_chars[line].is_empty() {
118            return WideLineCol {
119                line: pos.line,
120                col: pos.col,
121            };
122        }
123        let mut col = pos.col;
124        for wc in &self.wide_chars[line] {
125            if u32::from(wc.start) >= pos.col {
126                break;
127            }
128            // A non-ASCII char in UTF-8 occupies `wc.len` bytes; in UTF-16
129            // it occupies 1 code unit for BMP (len <= 3) and 2 for astral
130            // (len == 4).
131            let utf8_len = u32::from(wc.len);
132            let utf16_len: u32 = if utf8_len == 4 { 2 } else { 1 };
133            col = col - utf8_len + utf16_len;
134        }
135        WideLineCol {
136            line: pos.line,
137            col,
138        }
139    }
140
141    /// Convert an LSP UTF-16 position back to a UTF-8 byte [`LineCol`].
142    #[must_use]
143    pub fn from_utf16(&self, pos: WideLineCol) -> LineCol {
144        let line = pos.line as usize;
145        if line >= self.wide_chars.len() || self.wide_chars[line].is_empty() {
146            return LineCol {
147                line: pos.line,
148                col: pos.col,
149            };
150        }
151        let mut utf16_seen: u32 = 0;
152        let mut col = pos.col;
153        for wc in &self.wide_chars[line] {
154            let wc_col_utf8 = u32::from(wc.start);
155            // Number of ASCII cols between the previous wide char and this
156            // one is the same in both encodings — skip past them.
157            if wc_col_utf8 + utf16_seen >= col {
158                break;
159            }
160            let utf8_len = u32::from(wc.len);
161            let utf16_len: u32 = if utf8_len == 4 { 2 } else { 1 };
162            // The caller gave us a UTF-16 col; we advance by
163            // `(utf8_len - utf16_len)` extra bytes for this char.
164            col = col + utf8_len - utf16_len;
165            utf16_seen += utf16_len;
166        }
167        LineCol {
168            line: pos.line,
169            col,
170        }
171    }
172
173    /// Range of the given 0-based line in full-source byte offsets.
174    /// Includes the trailing `\n` if the line has one. Returns `None` for
175    /// lines past the end.
176    ///
177    /// For the final line of the input (which has no trailing newline in
178    /// the index) the range extends to `TextSize::from(u32::MAX)` as a
179    /// sentinel end — callers that care about the "real" tail should clamp
180    /// to the input length.
181    #[must_use]
182    pub fn line_range(&self, line: u32) -> Option<TextRange> {
183        let idx = line as usize;
184        let start = *self.newlines.get(idx)?;
185        let end = self
186            .newlines
187            .get(idx + 1)
188            .copied()
189            .unwrap_or(TextSize::from(u32::MAX));
190        Some(TextRange::new(start, end))
191    }
192
193    /// Number of lines in the input. Always `>= 1`; empty input has a
194    /// single empty line.
195    #[must_use]
196    pub fn line_count(&self) -> u32 {
197        u32::try_from(self.newlines.len()).expect("line count fits in u32")
198    }
199}
200
201#[cfg(test)]
202mod tests {
203    use super::{LineCol, LineIndex, WideLineCol};
204    use pretty_assertions::assert_eq;
205    use text_size::{TextRange, TextSize};
206
207    #[test]
208    fn empty_input_is_one_line() {
209        let idx = LineIndex::new("");
210        assert_eq!(idx.line_count(), 1);
211        assert_eq!(idx.line_col(TextSize::from(0)), LineCol { line: 0, col: 0 });
212    }
213
214    #[test]
215    fn ascii_single_line_line_col() {
216        let idx = LineIndex::new("abc");
217        assert_eq!(idx.line_count(), 1);
218        assert_eq!(idx.line_col(TextSize::from(2)), LineCol { line: 0, col: 2 });
219    }
220
221    #[test]
222    fn ascii_multi_line_line_col() {
223        // "ab\ncde\nf"
224        //  0 1  2 3 4 5  6 7
225        // line 0: "ab\n" (offsets 0..3)
226        // line 1: "cde\n" (offsets 3..7)
227        // line 2: "f" (offset 7)
228        let idx = LineIndex::new("ab\ncde\nf");
229        assert_eq!(idx.line_count(), 3);
230        assert_eq!(idx.line_col(TextSize::from(0)), LineCol { line: 0, col: 0 });
231        assert_eq!(idx.line_col(TextSize::from(2)), LineCol { line: 0, col: 2 });
232        assert_eq!(idx.line_col(TextSize::from(3)), LineCol { line: 1, col: 0 });
233        assert_eq!(idx.line_col(TextSize::from(6)), LineCol { line: 1, col: 3 });
234        assert_eq!(idx.line_col(TextSize::from(7)), LineCol { line: 2, col: 0 });
235    }
236
237    #[test]
238    fn utf8_offset_is_bytes_not_chars() {
239        // "éllo" — 'é' is 2 UTF-8 bytes (0xC3 0xA9).
240        // offset 2 is the byte after 'é' → col 2.
241        let idx = LineIndex::new("éllo");
242        assert_eq!(idx.line_col(TextSize::from(2)), LineCol { line: 0, col: 2 });
243        // Offset 1 lands in the middle of 'é'; we report the mechanical
244        // result (col 1) — callers that want char alignment must snap.
245        assert_eq!(idx.line_col(TextSize::from(1)), LineCol { line: 0, col: 1 });
246    }
247
248    #[test]
249    fn utf16_round_trip_bmp() {
250        // "abé": 'a'=1B, 'b'=1B, 'é'=2B UTF-8; in UTF-16 all three chars
251        // are 1 code unit. After 'é' UTF-8 col is 4, UTF-16 col is 3.
252        let idx = LineIndex::new("abé");
253        let utf8 = idx.line_col(TextSize::from(4));
254        assert_eq!(utf8, LineCol { line: 0, col: 4 });
255        let utf16 = idx.to_utf16(utf8);
256        assert_eq!(utf16, WideLineCol { line: 0, col: 3 });
257        let back = idx.from_utf16(utf16);
258        assert_eq!(back, utf8);
259    }
260
261    #[test]
262    fn utf16_round_trip_astral() {
263        // "a\u{1F600}b": 'a'=1B, grin=4B UTF-8 / 2 UTF-16 code units, 'b'=1B.
264        // UTF-8 offset after grin is 5; UTF-16 col is 3 (1 + 2).
265        let idx = LineIndex::new("a\u{1F600}b");
266        let utf8 = idx.line_col(TextSize::from(5));
267        assert_eq!(utf8, LineCol { line: 0, col: 5 });
268        let utf16 = idx.to_utf16(utf8);
269        assert_eq!(utf16, WideLineCol { line: 0, col: 3 });
270        let back = idx.from_utf16(utf16);
271        assert_eq!(back, utf8);
272    }
273
274    #[test]
275    fn line_range_last_line_open_ended() {
276        // "ab\ncd": line 0 = 0..3, line 1 = 3..u32::MAX sentinel.
277        let idx = LineIndex::new("ab\ncd");
278        assert_eq!(
279            idx.line_range(0),
280            Some(TextRange::new(TextSize::from(0), TextSize::from(3)))
281        );
282        assert_eq!(
283            idx.line_range(1),
284            Some(TextRange::new(TextSize::from(3), TextSize::from(u32::MAX)))
285        );
286        assert_eq!(idx.line_range(2), None);
287    }
288}