reading_liner/index.rs
1//! The whole design of this crate is based on this module.
2//! The core types are [Index] and [Query].
3//!
4//! Type [Index] is an index for fast locating line-column location of offset and
5//! encoding line-column location to offset.
6//!
7//! Type [Index] alone cannot do any search or locatings. one must use [Index::query] to locate offset.
8//! The core algorithm is based on binary search.
9//! [Query] can also be 'sliced' safely.
10
11use std::ops;
12
13use crate::location::{Offset, line_column};
14
15/// An index built for fast convertion between byte offsets and line-column locations.
16///
17/// NOTE: the last element of `line_offsets` should be considered as a fake offset (sentinel):
18///
19/// `self.line_offsets: [line0, line1, ..., EOF]`
20///
21/// By the word 'fake', we means that if some offset >= the last offset,
22/// it should be seen as exceding the index since the last element only marks ending.
23///
24/// # Invariants
25///
26/// - `index` is never empty.
27/// - `index[0] == 0`.
28/// - `index` is monotonically increasing.
29/// - `index` is append-only (no removals or mutations).
30/// - A sentinel EOF offset is always present as the last element.
31///
32/// Therefore:
33/// - Valid logical line indices are `0..self.count()`,
34/// where `count() = index.len() - 1`.
35/// - For any valid line `i`, the byte range is:
36/// `[index[i], index[i + 1])`.
37#[derive(Debug)]
38pub struct Index {
39 /// vector of offsets whose element records the beginning offset of source UTF8 string
40 line_offsets: Vec<Offset>,
41}
42
43impl Index {
44 /// An index with the first line starting at offset 0, which is the most common usage.
45 ///
46 /// The zero is safe here since it just means an ending, which also means empty.
47 pub fn new() -> Self {
48 Self {
49 line_offsets: vec![0.into()],
50 }
51 }
52
53 /// length of index
54 /// The API has guaranteed that self.line_offsets.len() > 0
55 #[inline]
56 pub fn count(&self) -> usize {
57 debug_assert!(!self.line_offsets.is_empty());
58 self.line_offsets.len() - 1
59 }
60
61 /// ending offset of the source
62 #[inline]
63 pub fn end(&self) -> Option<Offset> {
64 self.line_offsets.last().copied()
65 }
66
67 /// into vector of offsets
68 #[inline]
69 pub fn into_offsets(self) -> Vec<Offset> {
70 self.line_offsets
71 }
72}
73
74impl Index {
75 /// Get the query and freeze index when querying
76 pub fn query(&self) -> Query<'_> {
77 Query::from(&self.line_offsets[..])
78 }
79
80 pub fn get_line_offset_mut(&mut self, line_no: usize) -> Option<&mut Offset> {
81 self.line_offsets.get_mut(line_no)
82 }
83
84 /// Add next line offset to index
85 pub fn add_next_line(&mut self, offset: Offset) {
86 self.line_offsets.push(offset);
87 }
88
89 /// Reset the index
90 pub fn clear(&mut self) {
91 self.line_offsets.clear();
92 self.add_next_line(0.into());
93 }
94}
95
96/// Query line and offset information.
97///
98/// NOTE: Since the `Query` can be sliced, we carefully store an extra beginning offset to trace slice beginning:
99///
100/// `self.slice: [line0, line1, ..., EOF]`
101///
102/// One should keep in mind that all line numbers passed into query methods should be numbers **from the original [Index]**
103/// and the slice is *non-empty*.
104#[derive(Debug)]
105pub struct Query<'index> {
106 /// the beginning line number,
107 /// used to recover line number since `slice` lacks beginning offset from the original [Index]
108 begin: usize,
109
110 slice: &'index [Offset],
111}
112
113impl<'index> Query<'index> {
114 /// This builder is internal since we don't want users to accidentally build a Query with empty slice
115 fn new(begin: usize, slice: &'index [Offset]) -> Self {
116 Self { begin, slice }
117 }
118
119 /// build from raw slice, assuming the beginning is zero
120 fn from(slice: &'index [Offset]) -> Self {
121 Self { begin: 0, slice }
122 }
123
124 /// Returns a zero-copy view over a subrange of lines.
125 ///
126 /// The input `range` is interpreted over the *logical* line indices,
127 /// i.e. `0..self.count()`, where `count() = self.slice.len() - 1`.
128 ///
129 /// Internally, `self.slice` stores a sentinel EOF offset as the last element:
130 /// `[line0, line1, ..., EOF]`.
131 ///
132 /// To preserve this invariant, the returned slice includes one extra element
133 /// at the end (the sentinel), so the actual slice is:
134 ///
135 /// ```text
136 /// slice[range.start .. range.end + 1]
137 /// ```
138 /// This ensures that every line `i` in the resulting view satisfies:
139 /// ```text
140 /// line i = [slice[i], slice[i+1])
141 /// ```
142 ///
143 /// # Panics
144 /// Panics if:
145 /// - `range.start > range.end`
146 /// - `range.end > self.count()`
147 ///
148 /// These conditions indicate a violation of the API contract.
149 ///
150 /// # Performance
151 /// This operation is zero-copy and does not allocate.
152 ///
153 /// Invariant:
154 /// - self.slice.len() >= 1
155 /// - last element is EOF
156 /// - valid line indices: 0..self.slice.len()-1
157 pub fn range(&self, range: ops::Range<usize>) -> Self {
158 assert!(range.start <= range.end);
159 assert!(range.end <= self.count());
160
161 let range = range.start..range.end + 1;
162 Self::new(range.start, &self.slice[range])
163 }
164
165 /// Returns a zero-copy view over lines starting from `range_from.start`
166 /// to the end.
167 ///
168 /// The input is interpreted over the *logical* line indices,
169 /// i.e. `0..self.count()`.
170 ///
171 /// Internally, `self.slice` always ends with a sentinel EOF offset:
172 /// `[line0, line1, ..., EOF]`.
173 ///
174 /// Therefore, slicing with `slice[start..]` naturally preserves the sentinel,
175 /// and no additional adjustment is needed.
176 ///
177 /// The resulting view satisfies:
178 /// ```text
179 /// line i = [slice[i], slice[i+1])
180 /// ```
181 ///
182 /// # Panics
183 /// Panics if:
184 /// - `range_from.start > self.count()`
185 ///
186 /// # Edge Cases
187 /// - If `range_from.start == self.count()`, the returned slice contains only
188 /// the EOF sentinel. This represents an empty range of lines.
189 ///
190 /// # Performance
191 /// This operation is zero-copy and does not allocate.
192 ///
193 /// invariant: self.slice always ends with EOF
194 /// so slice[start..] always contains a valid sentinel
195 pub fn range_from(&self, range_from: ops::RangeFrom<usize>) -> Self {
196 assert!(range_from.start <= self.count());
197
198 Self::new(range_from.start, &self.slice[range_from])
199 }
200
201 /// Count the total lines
202 pub fn count(&self) -> usize {
203 debug_assert!(!self.slice.is_empty());
204 self.slice.len() - 1
205 }
206}
207
208impl Query<'_> {
209 /// Given the number of line `line_no`, returns its start offset.
210 #[inline]
211 pub fn line_offset(&self, line_no: usize) -> Option<Offset> {
212 if line_no < self.begin {
213 return None;
214 }
215 let line_no = line_no - self.begin;
216 self.slice.get(line_no).copied()
217 }
218
219 /// Given the number of line `line_no`,
220 /// Returns the byte range of the given line.
221 ///
222 /// The returned range is half-open: `[start, end)`, where `start` is the
223 /// beginning of the line and `end` is the beginning of the next line
224 /// (or EOF for the last line).
225 ///
226 /// # Returns
227 /// - `Some(range)` if the line exists.
228 /// - `None` if the line index is out of bounds.
229 ///
230 /// # Invariants
231 /// - The internal index always contains a sentinel EOF offset,
232 /// so `line_offset(line_no + 1)` is valid for the last line.
233 ///
234 /// # Notes
235 /// - The range is expressed in **byte offsets**, not character indices.
236 pub fn line_span(&self, line_no: usize) -> Option<ops::Range<Offset>> {
237 let start = self.line_offset(line_no)?;
238 let end = self.line_offset(line_no + 1)?; // it's safe here since we have a fake ending
239
240 Some(start..end)
241 }
242
243 /// The beginning of the whole query range
244 #[inline]
245 pub fn beginning(&self) -> Option<Offset> {
246 self.line_offset(0)
247 }
248
249 /// The ending of the whole query range
250 #[inline]
251 pub fn ending(&self) -> Option<Offset> {
252 self.slice.last().copied()
253 }
254
255 /// check if contains the given offset
256 pub fn contains(&self, offset: Offset) -> bool {
257 let Some(begin) = self.beginning() else {
258 return false;
259 };
260 let Some(end) = self.ending() else {
261 return false;
262 };
263
264 offset >= begin && offset < end
265 }
266
267 /// Locate the line index for a given byte `offset`.
268 ///
269 /// This method performs a binary search over the internal line index to find
270 /// the line whose span contains the given offset.
271 ///
272 /// # Returns
273 /// - `Some(line)` if the offset lies within a known line.
274 /// - `None` if:
275 /// - the offset is before the beginning of the first line, or
276 /// - the offset is at or beyond the sentinel EOF offset.
277 ///
278 /// # Invariants
279 /// - `self.slice` is a sorted list of line starting offsets.
280 /// - The last element of `self.slice` is a sentinel EOF offset.
281 /// - Each line `i` corresponds to the half-open interval:
282 /// `[slice[i], slice[i + 1])`.
283 ///
284 /// # Notes
285 /// - The returned line index is zero-based.
286 /// - If `offset == EOF`, this method returns `None`, since EOF is not
287 /// considered part of any line.
288 #[inline]
289 pub fn locate_line(&self, offset: Offset) -> Option<usize> {
290 binary_search_between(&self.slice, offset).map(|n| self.begin + n)
291 }
292
293 /// Locate the (line, column) position for a given byte `offset`.
294 ///
295 /// This method uses the existing line index without performing any I/O.
296 /// It resolves the line containing the offset, then computes the column
297 /// as the byte distance from the beginning of that line.
298 ///
299 /// # Returns
300 /// - `Some(ZeroBased(line, column))` if the offset lies within a known range.
301 /// - `None` if the offset is out of bounds or beyond the indexed data.
302 ///
303 /// # Invariants
304 /// - The internal index contains valid starting offsets for all indexed lines.
305 /// - Therefore, `line_offset(line)` is guaranteed to succeed for any line
306 /// returned by `locate_line`.
307 ///
308 /// # Notes
309 /// - Both line and column are zero-based.
310 /// - Column is measured in **bytes**, not characters.
311 /// - This method does not attempt to extend the index; for streaming input,
312 /// use the mutable variant instead.
313 pub fn locate(&self, offset: Offset) -> Option<line_column::ZeroBased> {
314 let line = self.locate_line(offset)?;
315 let line_offset = self.line_offset(line).unwrap();
316 let col = offset - line_offset;
317
318 Some((line, col.raw()).into())
319 }
320
321 /// Encode a (line, column) location into a byte `Offset`.
322 ///
323 /// This method uses the existing line index without performing any I/O.
324 /// It validates that the resulting offset lies within the bounds of the line.
325 ///
326 /// # Returns
327 /// - `Some(offset)` if the position is valid.
328 /// - `None` if:
329 /// - the line does not exist, or
330 /// - the column is out of bounds.
331 ///
332 /// # Invariants
333 /// - `line_span(line)` returns a half-open range `[start, end)`.
334 ///
335 /// # Notes
336 /// - Column is interpreted as a **byte offset** relative to the start of the line.
337 /// - No UTF-8 character boundary checks are performed.
338 pub fn encode(&self, location: line_column::ZeroBased) -> Option<Offset> {
339 let (line, col) = location.raw();
340
341 let range = self.line_span(line)?;
342 let offset = range.start + col;
343 range.contains(&offset).then_some(offset)
344 }
345}
346
347/// SAFETY: Assuming `xs` is ordered, try to find a interval where `x` lies.
348/// returns the start index of interval
349///
350/// NOTE: if the input `x` equals the last element of `xs`, this function still returns `None`.
351/// This is because the last element is regarded as an fake ending.
352fn binary_search_between<A: Ord + Copy>(xs: &[A], x: A) -> Option<usize> {
353 if xs.len() <= 1 {
354 return None;
355 }
356 if x == xs[0] {
357 return Some(0);
358 }
359 if x < xs[0] {
360 return None;
361 }
362
363 let mut start = 0;
364 let mut end = xs.len() - 1;
365 while start < end {
366 // base case
367 if start == end - 1 && xs[start] <= x && x < xs[end] {
368 return Some(start);
369 }
370
371 // binary search
372 let mid = start + ((end - start) >> 1);
373 let y = xs[mid];
374 if x == y {
375 return Some(mid);
376 }
377
378 if x < y {
379 end = mid;
380 continue;
381 }
382 // x > y
383 if start == mid {
384 return None;
385 }
386 start = mid;
387 }
388
389 None
390}
391
392#[cfg(test)]
393mod test {
394 use super::*;
395 use quickcheck_macros::quickcheck;
396
397 fn linear_search_between<A: Ord + Copy>(xs: &[A], x: A) -> Option<usize> {
398 if xs.len() <= 1 {
399 return None;
400 }
401
402 for i in 0..xs.len() - 1 {
403 if xs[i] <= x && x < xs[i + 1] {
404 return Some(i);
405 }
406 }
407 None
408 }
409
410 #[quickcheck]
411 fn prop_binary_search_between(mut xs: Vec<i64>, x: i64) -> bool {
412 xs.sort();
413 xs.dedup();
414
415 if xs.len() < 2 {
416 return true;
417 }
418
419 let res0 = linear_search_between(&xs, x);
420 let res1 = binary_search_between(&xs, x);
421
422 res0 == res1
423 }
424
425 #[test]
426 fn test_binary_search() {
427 let xs = [2, 4, 6];
428 let i = binary_search_between(&xs, 3);
429 assert_eq!(i, Some(0));
430
431 let i = binary_search_between(&xs, 4);
432 assert_eq!(i, Some(1));
433
434 let i = binary_search_between(&xs, 1);
435 assert_eq!(i, None);
436
437 let i = binary_search_between(&xs, 7);
438 assert_eq!(i, None);
439
440 let i = binary_search_between(&xs, 6);
441 assert_eq!(i, None);
442 }
443}