qala_compiler/span.rs
1//! source locations: a [`Span`] is a byte offset plus a length into the source
2//! string, and [`LineIndex`] turns a byte offset into a 1-based line and column.
3//!
4//! the scanner only ever advances byte offsets, so spans stay two `u32`s with no
5//! line or column fields. line and column are computed on demand from a line-start
6//! table; the cost is paid only when a diagnostic is actually rendered.
7
8/// a region of the source: `start` is a byte offset, `len` is a byte length.
9///
10/// `u32` fields cap a source file at 4 GiB, which is fine for a single-file
11/// teaching language and halves the struct size against `usize`. `Copy` matters
12/// because spans are attached to every token and every AST node and get cloned
13/// constantly.
14#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
15pub struct Span {
16 /// byte offset of the first byte of this region, from the start of the source.
17 pub start: u32,
18 /// length of this region in bytes.
19 pub len: u32,
20}
21
22impl Span {
23 /// construct a span from a byte offset and a byte length.
24 pub fn new(start: usize, len: usize) -> Self {
25 Span {
26 start: start as u32,
27 len: len as u32,
28 }
29 }
30
31 /// the byte offset one past the last byte of this region (`start + len`).
32 pub fn end(self) -> usize {
33 (self.start + self.len) as usize
34 }
35
36 /// the smallest span covering both inputs.
37 ///
38 /// uses `min` on the start and `max` on the end, so the result is correct
39 /// regardless of argument order and whether the two spans overlap, touch, or
40 /// have a gap between them. the parser uses this to span a whole expression
41 /// from its first token to its last.
42 pub fn merge(self, other: Span) -> Span {
43 let start = self.start.min(other.start);
44 let end = (self.end().max(other.end())) as u32;
45 Span {
46 start,
47 len: end - start,
48 }
49 }
50
51 /// alias for [`Span::merge`]; reads better at parser call sites
52 /// (`first.to(last)`).
53 pub fn to(self, other: Span) -> Span {
54 self.merge(other)
55 }
56
57 /// the source text this span covers.
58 ///
59 /// the caller must pass the same source the span was produced from; the span
60 /// indexes it with its own `start` and `end`, both of which the scanner
61 /// produced on UTF-8 boundaries.
62 pub fn slice(self, src: &str) -> &str {
63 &src[self.start as usize..self.end()]
64 }
65}
66
67/// a table of line-start byte offsets, built once from the source, used to map a
68/// byte offset to a 1-based line and column.
69///
70/// `line_starts[0]` is always `0` (the first line starts at the first byte); an
71/// entry `offset + 1` is pushed after each `\n` byte. `\r\n` needs no special
72/// case: the `\n` ends the line and a lone `\r` is just whitespace inside the
73/// preceding line.
74pub struct LineIndex {
75 /// byte offset where each line begins; one entry per line, strictly increasing.
76 line_starts: Vec<u32>,
77}
78
79impl LineIndex {
80 /// scan the source once and record where each line starts.
81 pub fn new(src: &str) -> Self {
82 let mut line_starts = vec![0u32];
83 for (i, b) in src.bytes().enumerate() {
84 if b == b'\n' {
85 line_starts.push(i as u32 + 1);
86 }
87 }
88 LineIndex { line_starts }
89 }
90
91 /// the 1-based line and 1-based column of `byte_offset` in `src`.
92 ///
93 /// the column is a count of characters from the start of the line, so a tab
94 /// counts as one column, not eight. this is deliberate: the logical column is
95 /// character-based, and any tab-expanded display column is the renderer's job
96 /// later. an offset at end of file returns a valid pair and never panics.
97 /// an offset beyond `src.len()` is clamped to `src.len()` rather than
98 /// panicking; this keeps WASM alive when a downstream caller (e.g. the
99 /// diagnostics layer) passes a span-derived offset that is slightly wrong.
100 pub fn location(&self, src: &str, byte_offset: usize) -> (usize, usize) {
101 let byte_offset = byte_offset.min(src.len());
102 let off = byte_offset as u32;
103 // index of the last line start that is <= off, i.e. the line containing
104 // off. line_starts[0] == 0 and off >= 0, so partition_point returns at
105 // least 1 here and the subtraction never underflows.
106 let line = self.line_starts.partition_point(|&s| s <= off) - 1;
107 let line_start = self.line_starts[line] as usize;
108 let col = src[line_start..byte_offset].chars().count() + 1;
109 (line + 1, col)
110 }
111}
112
113#[cfg(test)]
114mod tests {
115 use super::*;
116
117 #[test]
118 fn merge_overlapping_spans() {
119 // 0..5 and 3..8 -> 0..8
120 let a = Span::new(0, 5);
121 let b = Span::new(3, 5);
122 let m = a.merge(b);
123 assert_eq!(m.start, 0);
124 assert_eq!(m.len, 8);
125 }
126
127 #[test]
128 fn merge_adjacent_spans() {
129 // 0..3 and 3..7 -> 0..7, one contiguous span, no gap
130 let a = Span::new(0, 3);
131 let b = Span::new(3, 4);
132 let m = a.merge(b);
133 assert_eq!(m.start, 0);
134 assert_eq!(m.len, 7);
135 }
136
137 #[test]
138 fn merge_gapped_spans() {
139 // 0..2 and 5..7 -> 0..7, the gap (2..5) is included
140 let a = Span::new(0, 2);
141 let b = Span::new(5, 2);
142 let m = a.merge(b);
143 assert_eq!(m.start, 0);
144 assert_eq!(m.len, 7);
145 }
146
147 #[test]
148 fn merge_is_order_independent() {
149 let a = Span::new(2, 3); // 2..5
150 let b = Span::new(7, 4); // 7..11
151 assert_eq!(a.merge(b), b.merge(a));
152 let m = a.merge(b);
153 assert_eq!(m.start, 2);
154 assert_eq!(m.len, 9);
155 }
156
157 #[test]
158 fn to_is_an_alias_for_merge() {
159 let a = Span::new(1, 2);
160 let b = Span::new(10, 3);
161 assert_eq!(a.to(b), a.merge(b));
162 assert_eq!(b.to(a), a.merge(b));
163 }
164
165 #[test]
166 fn slice_returns_the_covered_source() {
167 let src = "let x = 42;";
168 let span = Span::new(4, 1); // the `x`
169 assert_eq!(span.slice(src), "x");
170 let span = Span::new(8, 2); // the `42`
171 assert_eq!(span.slice(src), "42");
172 // a token's span round-trips: start/len slice back to the right text.
173 let whole = Span::new(0, src.len());
174 assert_eq!(whole.slice(src), src);
175 }
176
177 #[test]
178 fn end_is_start_plus_len() {
179 let s = Span::new(5, 3);
180 assert_eq!(s.end(), 8);
181 assert_eq!(Span::new(0, 0).end(), 0);
182 }
183
184 #[test]
185 fn location_at_offset_zero_is_line_1_col_1() {
186 let src = "abc\ndef";
187 let idx = LineIndex::new(src);
188 assert_eq!(idx.location(src, 0), (1, 1));
189 }
190
191 #[test]
192 fn location_at_a_newline_is_on_the_line_it_ends() {
193 // "abc\ndef": the '\n' is byte 3, on line 1, column 4.
194 let src = "abc\ndef";
195 let idx = LineIndex::new(src);
196 assert_eq!(idx.location(src, 3), (1, 4));
197 }
198
199 #[test]
200 fn location_just_after_a_newline_is_next_line_col_1() {
201 // byte 4 is 'd', the first char of line 2.
202 let src = "abc\ndef";
203 let idx = LineIndex::new(src);
204 assert_eq!(idx.location(src, 4), (2, 1));
205 }
206
207 #[test]
208 fn location_at_end_of_file_is_valid_and_does_not_panic() {
209 let src = "abc\ndef";
210 let idx = LineIndex::new(src);
211 // src.len() == 7: end of line 2, column 4 (one past 'f').
212 assert_eq!(idx.location(src, src.len()), (2, 4));
213 // an empty file: offset 0 == len 0, line 1 column 1.
214 let empty = "";
215 let eidx = LineIndex::new(empty);
216 assert_eq!(eidx.location(empty, 0), (1, 1));
217 }
218
219 #[test]
220 fn location_clamps_out_of_range_offset_instead_of_panicking() {
221 // an offset exactly at src.len() is valid (end-of-file position).
222 let src = "abc\ndef";
223 let idx = LineIndex::new(src);
224 let at_eof = idx.location(src, src.len());
225 // an offset past the end must return the same position, not panic.
226 assert_eq!(idx.location(src, src.len() + 1), at_eof);
227 assert_eq!(idx.location(src, src.len() + 100), at_eof);
228 // empty source: offset 0 and offset 5 both map to (1, 1).
229 let empty = "";
230 let eidx = LineIndex::new(empty);
231 assert_eq!(eidx.location(empty, 5), (1, 1));
232 }
233
234 #[test]
235 fn location_counts_columns_as_characters_so_tabs_are_one_column() {
236 // line 2 is "\t\tx": two tabs then 'x'. the 'x' is byte 6, column 3
237 // (char count from line start: tab, tab, then x is the 3rd char), not
238 // column 17 or anything tab-expanded. this is LEX-06.
239 let src = "fn f() {\n\t\tx\n}";
240 let idx = LineIndex::new(src);
241 let x_offset = src.find('x').unwrap();
242 assert_eq!(idx.location(src, x_offset), (2, 3));
243 // a space-indented line gives the same column for the same character
244 // position: " y" -> 'y' at column 3.
245 let src2 = "fn f() {\n y\n}";
246 let idx2 = LineIndex::new(src2);
247 let y_offset = src2.find('y').unwrap();
248 assert_eq!(idx2.location(src2, y_offset), (2, 3));
249 }
250
251 #[test]
252 fn crlf_line_endings_break_on_the_newline_not_the_carriage_return() {
253 // "a\r\nb": '\r' is byte 1 (still line 1), '\n' is byte 2 (line 1),
254 // 'b' is byte 3 (line 2, column 1). a lone '\r' is not a line break.
255 let src = "a\r\nb";
256 let idx = LineIndex::new(src);
257 assert_eq!(idx.location(src, 1), (1, 2)); // the '\r'
258 assert_eq!(idx.location(src, 3), (2, 1)); // the 'b'
259 }
260}