reading_liner/index.rs
1use std::ops;
2
3use crate::location::{Offset, line_column};
4
5/// An index built for fast convertion between byte offsets and line-column locations.
6///
7/// NOTE: the last element of `line_offsets` should be considered as a fake offset (sentinel):
8///
9/// `self.line_offsets: [line0, line1, ..., EOF]`
10///
11/// By the word 'fake', we means that if some offset >= the last offset,
12/// it should be seen as exceding the index since the last element only marks ending.
13///
14/// # Invariants
15///
16/// - `index` is never empty.
17/// - `index[0] == 0`.
18/// - `index` is monotonically increasing.
19/// - `index` is append-only (no removals or mutations).
20/// - A sentinel EOF offset is always present as the last element.
21///
22/// Therefore:
23/// - Valid logical line indices are `0..self.count()`,
24/// where `count() = index.len() - 1`.
25/// - For any valid line `i`, the byte range is:
26/// `[index[i], index[i + 1])`.
27#[derive(Debug)]
28pub struct Index {
29 /// vector of offsets whose element records the beginning offset of source UTF8 string
30 line_offsets: Vec<Offset>,
31}
32
33impl Index {
34 /// An index with the first line starting at offset 0, which is the most common usage.
35 ///
36 /// The zero is safe here since it just means an ending, which also means empty.
37 pub fn new() -> Self {
38 Self {
39 line_offsets: vec![0.into()],
40 }
41 }
42
43 /// length of index
44 /// The API has guaranteed that self.line_offsets.len() > 0
45 #[inline]
46 pub fn count(&self) -> usize {
47 debug_assert!(!self.line_offsets.is_empty());
48 self.line_offsets.len() - 1
49 }
50
51 /// ending offset of the source
52 #[inline]
53 pub fn end(&self) -> Option<Offset> {
54 self.line_offsets.last().copied()
55 }
56
57 /// into vector of offsets
58 #[inline]
59 pub fn into_offsets(self) -> Vec<Offset> {
60 self.line_offsets
61 }
62}
63
64impl Index {
65 /// Get the query and freeze index when querying
66 pub fn query(&self) -> Query<'_> {
67 Query::from(&self.line_offsets[..])
68 }
69
70 pub fn get_line_offset_mut(&mut self, line_no: usize) -> Option<&mut Offset> {
71 self.line_offsets.get_mut(line_no)
72 }
73
74 /// Add next line offset to index
75 pub fn add_next_line(&mut self, offset: Offset) {
76 self.line_offsets.push(offset);
77 }
78
79 /// Reset the index
80 pub fn clear(&mut self) {
81 self.line_offsets.clear();
82 self.add_next_line(0.into());
83 }
84}
85
86/// Query line and offset information.
87///
88/// NOTE: Since the `Query` can be sliced, we carefully store an extra beginning offset to trace slice beginning:
89///
90/// `self.slice: [line0, line1, ..., EOF]`
91///
92/// One should keep in mind that all line numbers passed into query methods should be numbers **from the original [Index]**
93/// and the slice is *non-empty*.
94#[derive(Debug)]
95pub struct Query<'index> {
96 /// the beginning line number,
97 /// used to recover line number since `slice` lacks beginning offset from the original [Index]
98 begin: usize,
99
100 slice: &'index [Offset],
101}
102
103impl<'index> Query<'index> {
104 /// This builder is internal since we don't want users to accidentally build a Query with empty slice
105 fn new(begin: usize, slice: &'index [Offset]) -> Self {
106 Self { begin, slice }
107 }
108
109 /// build from raw slice, assuming the beginning is zero
110 fn from(slice: &'index [Offset]) -> Self {
111 Self { begin: 0, slice }
112 }
113
114 /// Returns a zero-copy view over a subrange of lines.
115 ///
116 /// The input `range` is interpreted over the *logical* line indices,
117 /// i.e. `0..self.count()`, where `count() = self.slice.len() - 1`.
118 ///
119 /// Internally, `self.slice` stores a sentinel EOF offset as the last element:
120 /// `[line0, line1, ..., EOF]`.
121 ///
122 /// To preserve this invariant, the returned slice includes one extra element
123 /// at the end (the sentinel), so the actual slice is:
124 ///
125 /// ```text
126 /// slice[range.start .. range.end + 1]
127 /// ```
128 /// This ensures that every line `i` in the resulting view satisfies:
129 /// ```text
130 /// line i = [slice[i], slice[i+1])
131 /// ```
132 ///
133 /// # Panics
134 /// Panics if:
135 /// - `range.start > range.end`
136 /// - `range.end > self.count()`
137 ///
138 /// These conditions indicate a violation of the API contract.
139 ///
140 /// # Performance
141 /// This operation is zero-copy and does not allocate.
142 ///
143 /// Invariant:
144 /// - self.slice.len() >= 1
145 /// - last element is EOF
146 /// - valid line indices: 0..self.slice.len()-1
147 pub fn range(&self, range: ops::Range<usize>) -> Self {
148 assert!(range.start <= range.end);
149 assert!(range.end <= self.count());
150
151 let range = range.start..range.end + 1;
152 Self::new(range.start, &self.slice[range])
153 }
154
155 /// Returns a zero-copy view over lines starting from `range_from.start`
156 /// to the end.
157 ///
158 /// The input is interpreted over the *logical* line indices,
159 /// i.e. `0..self.count()`.
160 ///
161 /// Internally, `self.slice` always ends with a sentinel EOF offset:
162 /// `[line0, line1, ..., EOF]`.
163 ///
164 /// Therefore, slicing with `slice[start..]` naturally preserves the sentinel,
165 /// and no additional adjustment is needed.
166 ///
167 /// The resulting view satisfies:
168 /// ```text
169 /// line i = [slice[i], slice[i+1])
170 /// ```
171 ///
172 /// # Panics
173 /// Panics if:
174 /// - `range_from.start > self.count()`
175 ///
176 /// # Edge Cases
177 /// - If `range_from.start == self.count()`, the returned slice contains only
178 /// the EOF sentinel. This represents an empty range of lines.
179 ///
180 /// # Performance
181 /// This operation is zero-copy and does not allocate.
182 ///
183 /// invariant: self.slice always ends with EOF
184 /// so slice[start..] always contains a valid sentinel
185 pub fn range_from(&self, range_from: ops::RangeFrom<usize>) -> Self {
186 assert!(range_from.start <= self.count());
187
188 Self::new(range_from.start, &self.slice[range_from])
189 }
190
191 /// Count the total lines
192 pub fn count(&self) -> usize {
193 debug_assert!(!self.slice.is_empty());
194 self.slice.len() - 1
195 }
196}
197
198impl Query<'_> {
199 /// Given the number of line `line_no`, returns its start offset.
200 #[inline]
201 pub fn line_offset(&self, line_no: usize) -> Option<Offset> {
202 if line_no < self.begin {
203 return None;
204 }
205 let line_no = line_no - self.begin;
206 self.slice.get(line_no).copied()
207 }
208
209 /// Given the number of line `line_no`,
210 /// Returns the byte range of the given line.
211 ///
212 /// The returned range is half-open: `[start, end)`, where `start` is the
213 /// beginning of the line and `end` is the beginning of the next line
214 /// (or EOF for the last line).
215 ///
216 /// # Returns
217 /// - `Some(range)` if the line exists.
218 /// - `None` if the line index is out of bounds.
219 ///
220 /// # Invariants
221 /// - The internal index always contains a sentinel EOF offset,
222 /// so `line_offset(line_no + 1)` is valid for the last line.
223 ///
224 /// # Notes
225 /// - The range is expressed in **byte offsets**, not character indices.
226 pub fn line_span(&self, line_no: usize) -> Option<ops::Range<Offset>> {
227 let start = self.line_offset(line_no)?;
228 let end = self.line_offset(line_no + 1)?; // it's safe here since we have a fake ending
229
230 Some(start..end)
231 }
232
233 /// The beginning of the whole query range
234 #[inline]
235 pub fn beginning(&self) -> Option<Offset> {
236 self.line_offset(0)
237 }
238
239 /// The ending of the whole query range
240 #[inline]
241 pub fn ending(&self) -> Option<Offset> {
242 self.slice.last().copied()
243 }
244
245 /// check if contains the given offset
246 pub fn contains(&self, offset: Offset) -> bool {
247 let Some(begin) = self.beginning() else {
248 return false;
249 };
250 let Some(end) = self.ending() else {
251 return false;
252 };
253
254 offset >= begin && offset < end
255 }
256
257 /// Locate the line index for a given byte `offset`.
258 ///
259 /// This method performs a binary search over the internal line index to find
260 /// the line whose span contains the given offset.
261 ///
262 /// # Returns
263 /// - `Some(line)` if the offset lies within a known line.
264 /// - `None` if:
265 /// - the offset is before the beginning of the first line, or
266 /// - the offset is at or beyond the sentinel EOF offset.
267 ///
268 /// # Invariants
269 /// - `self.slice` is a sorted list of line starting offsets.
270 /// - The last element of `self.slice` is a sentinel EOF offset.
271 /// - Each line `i` corresponds to the half-open interval:
272 /// `[slice[i], slice[i + 1])`.
273 ///
274 /// # Notes
275 /// - The returned line index is zero-based.
276 /// - If `offset == EOF`, this method returns `None`, since EOF is not
277 /// considered part of any line.
278 #[inline]
279 pub fn locate_line(&self, offset: Offset) -> Option<usize> {
280 binary_search_between(&self.slice, offset).map(|n| self.begin + n)
281 }
282
283 /// Locate the (line, column) position for a given byte `offset`.
284 ///
285 /// This method uses the existing line index without performing any I/O.
286 /// It resolves the line containing the offset, then computes the column
287 /// as the byte distance from the beginning of that line.
288 ///
289 /// # Returns
290 /// - `Some(ZeroBased(line, column))` if the offset lies within a known range.
291 /// - `None` if the offset is out of bounds or beyond the indexed data.
292 ///
293 /// # Invariants
294 /// - The internal index contains valid starting offsets for all indexed lines.
295 /// - Therefore, `line_offset(line)` is guaranteed to succeed for any line
296 /// returned by `locate_line`.
297 ///
298 /// # Notes
299 /// - Both line and column are zero-based.
300 /// - Column is measured in **bytes**, not characters.
301 /// - This method does not attempt to extend the index; for streaming input,
302 /// use the mutable variant instead.
303 pub fn locate(&self, offset: Offset) -> Option<line_column::ZeroBased> {
304 let line = self.locate_line(offset)?;
305 let line_offset = self.line_offset(line).unwrap();
306 let col = offset - line_offset;
307
308 Some((line, col.raw()).into())
309 }
310
311 /// Encode a (line, column) location into a byte `Offset`.
312 ///
313 /// This method uses the existing line index without performing any I/O.
314 /// It validates that the resulting offset lies within the bounds of the line.
315 ///
316 /// # Returns
317 /// - `Some(offset)` if the position is valid.
318 /// - `None` if:
319 /// - the line does not exist, or
320 /// - the column is out of bounds.
321 ///
322 /// # Invariants
323 /// - `line_span(line)` returns a half-open range `[start, end)`.
324 ///
325 /// # Notes
326 /// - Column is interpreted as a **byte offset** relative to the start of the line.
327 /// - No UTF-8 character boundary checks are performed.
328 pub fn encode(&self, location: line_column::ZeroBased) -> Option<Offset> {
329 let (line, col) = location.raw();
330
331 let range = self.line_span(line)?;
332 let offset = range.start + col;
333 range.contains(&offset).then_some(offset)
334 }
335}
336
337/// SAFETY: Assuming `xs` is ordered, try to find a interval where `x` lies.
338/// returns the start index of interval
339///
340/// NOTE: if the input `x` equals the last element of `xs`, this function still returns `None`.
341/// This is because the last element is regarded as an fake ending.
342fn binary_search_between<A: Ord + Copy>(xs: &[A], x: A) -> Option<usize> {
343 if xs.len() <= 1 {
344 return None;
345 }
346 if x == xs[0] {
347 return Some(0);
348 }
349 if x < xs[0] {
350 return None;
351 }
352
353 let mut start = 0;
354 let mut end = xs.len() - 1;
355 while start < end {
356 // base case
357 if start == end - 1 && xs[start] <= x && x < xs[end] {
358 return Some(start);
359 }
360
361 // binary search
362 let mid = start + ((end - start) >> 1);
363 let y = xs[mid];
364 if x == y {
365 return Some(mid);
366 }
367
368 if x < y {
369 end = mid;
370 continue;
371 }
372 // x > y
373 if start == mid {
374 return None;
375 }
376 start = mid;
377 }
378
379 None
380}
381
382#[cfg(test)]
383mod test {
384 use super::*;
385 use quickcheck_macros::quickcheck;
386
387 fn linear_search_between<A: Ord + Copy>(xs: &[A], x: A) -> Option<usize> {
388 if xs.len() <= 1 {
389 return None;
390 }
391
392 for i in 0..xs.len() - 1 {
393 if xs[i] <= x && x < xs[i + 1] {
394 return Some(i);
395 }
396 }
397 None
398 }
399
400 #[quickcheck]
401 fn prop_binary_search_between(mut xs: Vec<i64>, x: i64) -> bool {
402 xs.sort();
403 xs.dedup();
404
405 if xs.len() < 2 {
406 return true;
407 }
408
409 let res0 = linear_search_between(&xs, x);
410 let res1 = binary_search_between(&xs, x);
411
412 res0 == res1
413 }
414
415 #[test]
416 fn test_binary_search() {
417 let xs = [2, 4, 6];
418 let i = binary_search_between(&xs, 3);
419 assert_eq!(i, Some(0));
420
421 let i = binary_search_between(&xs, 4);
422 assert_eq!(i, Some(1));
423
424 let i = binary_search_between(&xs, 1);
425 assert_eq!(i, None);
426
427 let i = binary_search_between(&xs, 7);
428 assert_eq!(i, None);
429
430 let i = binary_search_between(&xs, 6);
431 assert_eq!(i, None);
432 }
433}