Skip to main content

qala_compiler/
span.rs

1//! source locations: a [`Span`] is a byte offset plus a length into the source
2//! string, and [`LineIndex`] turns a byte offset into a 1-based line and column.
3//!
4//! the scanner only ever advances byte offsets, so spans stay two `u32`s with no
5//! line or column fields. line and column are computed on demand from a line-start
6//! table; the cost is paid only when a diagnostic is actually rendered.
7
8/// a region of the source: `start` is a byte offset, `len` is a byte length.
9///
10/// `u32` fields cap a source file at 4 GiB, which is fine for a single-file
11/// teaching language and halves the struct size against `usize`. `Copy` matters
12/// because spans are attached to every token and every AST node and get cloned
13/// constantly.
14#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
15pub struct Span {
16    /// byte offset of the first byte of this region, from the start of the source.
17    pub start: u32,
18    /// length of this region in bytes.
19    pub len: u32,
20}
21
22impl Span {
23    /// construct a span from a byte offset and a byte length.
24    pub fn new(start: usize, len: usize) -> Self {
25        Span {
26            start: start as u32,
27            len: len as u32,
28        }
29    }
30
31    /// the byte offset one past the last byte of this region (`start + len`).
32    pub fn end(self) -> usize {
33        (self.start + self.len) as usize
34    }
35
36    /// the smallest span covering both inputs.
37    ///
38    /// uses `min` on the start and `max` on the end, so the result is correct
39    /// regardless of argument order and whether the two spans overlap, touch, or
40    /// have a gap between them. the parser uses this to span a whole expression
41    /// from its first token to its last.
42    pub fn merge(self, other: Span) -> Span {
43        let start = self.start.min(other.start);
44        let end = (self.end().max(other.end())) as u32;
45        Span {
46            start,
47            len: end - start,
48        }
49    }
50
51    /// alias for [`Span::merge`]; reads better at parser call sites
52    /// (`first.to(last)`).
53    pub fn to(self, other: Span) -> Span {
54        self.merge(other)
55    }
56
57    /// the source text this span covers.
58    ///
59    /// the caller must pass the same source the span was produced from; the span
60    /// indexes it with its own `start` and `end`, both of which the scanner
61    /// produced on UTF-8 boundaries.
62    pub fn slice(self, src: &str) -> &str {
63        &src[self.start as usize..self.end()]
64    }
65}
66
67/// a table of line-start byte offsets, built once from the source, used to map a
68/// byte offset to a 1-based line and column.
69///
70/// `line_starts[0]` is always `0` (the first line starts at the first byte); an
71/// entry `offset + 1` is pushed after each `\n` byte. `\r\n` needs no special
72/// case: the `\n` ends the line and a lone `\r` is just whitespace inside the
73/// preceding line.
74pub struct LineIndex {
75    /// byte offset where each line begins; one entry per line, strictly increasing.
76    line_starts: Vec<u32>,
77}
78
79impl LineIndex {
80    /// scan the source once and record where each line starts.
81    pub fn new(src: &str) -> Self {
82        let mut line_starts = vec![0u32];
83        for (i, b) in src.bytes().enumerate() {
84            if b == b'\n' {
85                line_starts.push(i as u32 + 1);
86            }
87        }
88        LineIndex { line_starts }
89    }
90
91    /// the 1-based line and 1-based column of `byte_offset` in `src`.
92    ///
93    /// the column is a count of characters from the start of the line, so a tab
94    /// counts as one column, not eight. this is deliberate: the logical column is
95    /// character-based, and any tab-expanded display column is the renderer's job
96    /// later. an offset at end of file returns a valid pair and never panics.
97    /// an offset beyond `src.len()` is clamped to `src.len()` rather than
98    /// panicking; this keeps WASM alive when a downstream caller (e.g. the
99    /// diagnostics layer) passes a span-derived offset that is slightly wrong.
100    pub fn location(&self, src: &str, byte_offset: usize) -> (usize, usize) {
101        let byte_offset = byte_offset.min(src.len());
102        let off = byte_offset as u32;
103        // index of the last line start that is <= off, i.e. the line containing
104        // off. line_starts[0] == 0 and off >= 0, so partition_point returns at
105        // least 1 here and the subtraction never underflows.
106        let line = self.line_starts.partition_point(|&s| s <= off) - 1;
107        let line_start = self.line_starts[line] as usize;
108        let col = src[line_start..byte_offset].chars().count() + 1;
109        (line + 1, col)
110    }
111}
112
113#[cfg(test)]
114mod tests {
115    use super::*;
116
117    #[test]
118    fn merge_overlapping_spans() {
119        // 0..5 and 3..8 -> 0..8
120        let a = Span::new(0, 5);
121        let b = Span::new(3, 5);
122        let m = a.merge(b);
123        assert_eq!(m.start, 0);
124        assert_eq!(m.len, 8);
125    }
126
127    #[test]
128    fn merge_adjacent_spans() {
129        // 0..3 and 3..7 -> 0..7, one contiguous span, no gap
130        let a = Span::new(0, 3);
131        let b = Span::new(3, 4);
132        let m = a.merge(b);
133        assert_eq!(m.start, 0);
134        assert_eq!(m.len, 7);
135    }
136
137    #[test]
138    fn merge_gapped_spans() {
139        // 0..2 and 5..7 -> 0..7, the gap (2..5) is included
140        let a = Span::new(0, 2);
141        let b = Span::new(5, 2);
142        let m = a.merge(b);
143        assert_eq!(m.start, 0);
144        assert_eq!(m.len, 7);
145    }
146
147    #[test]
148    fn merge_is_order_independent() {
149        let a = Span::new(2, 3); // 2..5
150        let b = Span::new(7, 4); // 7..11
151        assert_eq!(a.merge(b), b.merge(a));
152        let m = a.merge(b);
153        assert_eq!(m.start, 2);
154        assert_eq!(m.len, 9);
155    }
156
157    #[test]
158    fn to_is_an_alias_for_merge() {
159        let a = Span::new(1, 2);
160        let b = Span::new(10, 3);
161        assert_eq!(a.to(b), a.merge(b));
162        assert_eq!(b.to(a), a.merge(b));
163    }
164
165    #[test]
166    fn slice_returns_the_covered_source() {
167        let src = "let x = 42;";
168        let span = Span::new(4, 1); // the `x`
169        assert_eq!(span.slice(src), "x");
170        let span = Span::new(8, 2); // the `42`
171        assert_eq!(span.slice(src), "42");
172        // a token's span round-trips: start/len slice back to the right text.
173        let whole = Span::new(0, src.len());
174        assert_eq!(whole.slice(src), src);
175    }
176
177    #[test]
178    fn end_is_start_plus_len() {
179        let s = Span::new(5, 3);
180        assert_eq!(s.end(), 8);
181        assert_eq!(Span::new(0, 0).end(), 0);
182    }
183
184    #[test]
185    fn location_at_offset_zero_is_line_1_col_1() {
186        let src = "abc\ndef";
187        let idx = LineIndex::new(src);
188        assert_eq!(idx.location(src, 0), (1, 1));
189    }
190
191    #[test]
192    fn location_at_a_newline_is_on_the_line_it_ends() {
193        // "abc\ndef": the '\n' is byte 3, on line 1, column 4.
194        let src = "abc\ndef";
195        let idx = LineIndex::new(src);
196        assert_eq!(idx.location(src, 3), (1, 4));
197    }
198
199    #[test]
200    fn location_just_after_a_newline_is_next_line_col_1() {
201        // byte 4 is 'd', the first char of line 2.
202        let src = "abc\ndef";
203        let idx = LineIndex::new(src);
204        assert_eq!(idx.location(src, 4), (2, 1));
205    }
206
207    #[test]
208    fn location_at_end_of_file_is_valid_and_does_not_panic() {
209        let src = "abc\ndef";
210        let idx = LineIndex::new(src);
211        // src.len() == 7: end of line 2, column 4 (one past 'f').
212        assert_eq!(idx.location(src, src.len()), (2, 4));
213        // an empty file: offset 0 == len 0, line 1 column 1.
214        let empty = "";
215        let eidx = LineIndex::new(empty);
216        assert_eq!(eidx.location(empty, 0), (1, 1));
217    }
218
219    #[test]
220    fn location_clamps_out_of_range_offset_instead_of_panicking() {
221        // an offset exactly at src.len() is valid (end-of-file position).
222        let src = "abc\ndef";
223        let idx = LineIndex::new(src);
224        let at_eof = idx.location(src, src.len());
225        // an offset past the end must return the same position, not panic.
226        assert_eq!(idx.location(src, src.len() + 1), at_eof);
227        assert_eq!(idx.location(src, src.len() + 100), at_eof);
228        // empty source: offset 0 and offset 5 both map to (1, 1).
229        let empty = "";
230        let eidx = LineIndex::new(empty);
231        assert_eq!(eidx.location(empty, 5), (1, 1));
232    }
233
234    #[test]
235    fn location_counts_columns_as_characters_so_tabs_are_one_column() {
236        // line 2 is "\t\tx": two tabs then 'x'. the 'x' is byte 6, column 3
237        // (char count from line start: tab, tab, then x is the 3rd char), not
238        // column 17 or anything tab-expanded. this is LEX-06.
239        let src = "fn f() {\n\t\tx\n}";
240        let idx = LineIndex::new(src);
241        let x_offset = src.find('x').unwrap();
242        assert_eq!(idx.location(src, x_offset), (2, 3));
243        // a space-indented line gives the same column for the same character
244        // position: "  y" -> 'y' at column 3.
245        let src2 = "fn f() {\n  y\n}";
246        let idx2 = LineIndex::new(src2);
247        let y_offset = src2.find('y').unwrap();
248        assert_eq!(idx2.location(src2, y_offset), (2, 3));
249    }
250
251    #[test]
252    fn crlf_line_endings_break_on_the_newline_not_the_carriage_return() {
253        // "a\r\nb": '\r' is byte 1 (still line 1), '\n' is byte 2 (line 1),
254        // 'b' is byte 3 (line 2, column 1). a lone '\r' is not a line break.
255        let src = "a\r\nb";
256        let idx = LineIndex::new(src);
257        assert_eq!(idx.location(src, 1), (1, 2)); // the '\r'
258        assert_eq!(idx.location(src, 3), (2, 1)); // the 'b'
259    }
260}