Skip to main content

reading_liner/
index.rs

1//! The whole design of this crate is based on this module.
2//! The core types are [Index] and [Query].
3//!
4//! Type [Index] is an index for fast locating line-column location of offset and
5//! encoding line-column location to offset.
6//!
7//! Type [Index] alone cannot do any search or locatings. one must use [Index::query] to locate offset.
8//! The core algorithm is based on binary search.
9//! [Query] can also be 'sliced' safely.
10
11use std::ops;
12
13use crate::location::{Offset, line_column};
14
15/// An index built for fast convertion between byte offsets and line-column locations.
16///
17/// NOTE: the last element of `line_offsets` should be considered as a fake offset (sentinel):
18///
19/// `self.line_offsets: [line0, line1, ..., EOF]`
20///
21/// By the word 'fake', we means that if some offset >= the last offset,
22/// it should be seen as exceding the index since the last element only marks ending.
23///
24/// # Invariants
25///
26/// - `index` is never empty.
27/// - `index[0] == 0`.
28/// - `index` is monotonically increasing.
29/// - `index` is append-only (no removals or mutations).
30/// - A sentinel EOF offset is always present as the last element.
31///
32/// Therefore:
33/// - Valid logical line indices are `0..self.count()`,
34///   where `count() = index.len() - 1`.
35/// - For any valid line `i`, the byte range is:
36///   `[index[i], index[i + 1])`.
37#[derive(Debug)]
38pub struct Index {
39    /// vector of offsets whose element records the beginning offset of source UTF8 string
40    line_offsets: Vec<Offset>,
41}
42
43impl Index {
44    /// An index with the first line starting at offset 0, which is the most common usage.
45    ///
46    /// The zero is safe here since it just means an ending, which also means empty.
47    pub fn new() -> Self {
48        Self {
49            line_offsets: vec![0.into()],
50        }
51    }
52
53    /// length of index
54    /// The API has guaranteed that self.line_offsets.len() > 0
55    #[inline]
56    pub fn count(&self) -> usize {
57        debug_assert!(!self.line_offsets.is_empty());
58        self.line_offsets.len() - 1
59    }
60
61    /// ending offset of the source
62    #[inline]
63    pub fn end(&self) -> Option<Offset> {
64        self.line_offsets.last().copied()
65    }
66
67    /// into vector of offsets
68    #[inline]
69    pub fn into_offsets(self) -> Vec<Offset> {
70        self.line_offsets
71    }
72}
73
74impl Index {
75    /// Get the query and freeze index when querying
76    pub fn query(&self) -> Query<'_> {
77        Query::from(&self.line_offsets[..])
78    }
79
80    pub fn get_line_offset_mut(&mut self, line_no: usize) -> Option<&mut Offset> {
81        self.line_offsets.get_mut(line_no)
82    }
83
84    /// Add next line offset to index
85    pub fn add_next_line(&mut self, offset: Offset) {
86        self.line_offsets.push(offset);
87    }
88
89    /// Reset the index
90    pub fn clear(&mut self) {
91        self.line_offsets.clear();
92        self.add_next_line(0.into());
93    }
94}
95
96/// Query line and offset information.
97///
98/// NOTE: Since the `Query` can be sliced, we carefully store an extra beginning offset to trace slice beginning:
99///
100/// `self.slice: [line0, line1, ..., EOF]`
101///
102/// One should keep in mind that all line numbers passed into query methods should be numbers **from the original [Index]**
103/// and the slice is *non-empty*.
104#[derive(Debug)]
105pub struct Query<'index> {
106    /// the beginning line number,
107    /// used to recover line number since `slice` lacks beginning offset from the original [Index]
108    begin: usize,
109
110    slice: &'index [Offset],
111}
112
113impl<'index> Query<'index> {
114    /// This builder is internal since we don't want users to accidentally build a Query with empty slice
115    fn new(begin: usize, slice: &'index [Offset]) -> Self {
116        Self { begin, slice }
117    }
118
119    /// build from raw slice, assuming the beginning is zero
120    fn from(slice: &'index [Offset]) -> Self {
121        Self { begin: 0, slice }
122    }
123
124    /// Returns a zero-copy view over a subrange of lines.
125    ///
126    /// The input `range` is interpreted over the *logical* line indices,
127    /// i.e. `0..self.count()`, where `count() = self.slice.len() - 1`.
128    ///
129    /// Internally, `self.slice` stores a sentinel EOF offset as the last element:
130    /// `[line0, line1, ..., EOF]`.
131    ///
132    /// To preserve this invariant, the returned slice includes one extra element
133    /// at the end (the sentinel), so the actual slice is:
134    ///
135    /// ```text
136    /// slice[range.start .. range.end + 1]
137    /// ```
138    /// This ensures that every line `i` in the resulting view satisfies:
139    /// ```text
140    /// line i = [slice[i], slice[i+1])
141    /// ```
142    ///
143    /// # Panics
144    /// Panics if:
145    /// - `range.start > range.end`
146    /// - `range.end > self.count()`
147    ///
148    /// These conditions indicate a violation of the API contract.
149    ///
150    /// # Performance
151    /// This operation is zero-copy and does not allocate.
152    ///
153    /// Invariant:
154    /// - self.slice.len() >= 1
155    /// - last element is EOF
156    /// - valid line indices: 0..self.slice.len()-1
157    pub fn range(&self, range: ops::Range<usize>) -> Self {
158        assert!(range.start <= range.end);
159        assert!(range.end <= self.count());
160
161        let range = range.start..range.end + 1;
162        Self::new(range.start, &self.slice[range])
163    }
164
165    /// Returns a zero-copy view over lines starting from `range_from.start`
166    /// to the end.
167    ///
168    /// The input is interpreted over the *logical* line indices,
169    /// i.e. `0..self.count()`.
170    ///
171    /// Internally, `self.slice` always ends with a sentinel EOF offset:
172    /// `[line0, line1, ..., EOF]`.
173    ///
174    /// Therefore, slicing with `slice[start..]` naturally preserves the sentinel,
175    /// and no additional adjustment is needed.
176    ///
177    /// The resulting view satisfies:
178    /// ```text
179    /// line i = [slice[i], slice[i+1])
180    /// ```
181    ///
182    /// # Panics
183    /// Panics if:
184    /// - `range_from.start > self.count()`
185    ///
186    /// # Edge Cases
187    /// - If `range_from.start == self.count()`, the returned slice contains only
188    ///   the EOF sentinel. This represents an empty range of lines.
189    ///
190    /// # Performance
191    /// This operation is zero-copy and does not allocate.
192    ///
193    /// invariant: self.slice always ends with EOF
194    /// so slice[start..] always contains a valid sentinel
195    pub fn range_from(&self, range_from: ops::RangeFrom<usize>) -> Self {
196        assert!(range_from.start <= self.count());
197
198        Self::new(range_from.start, &self.slice[range_from])
199    }
200
201    /// Count the total lines
202    pub fn count(&self) -> usize {
203        debug_assert!(!self.slice.is_empty());
204        self.slice.len() - 1
205    }
206}
207
208impl Query<'_> {
209    /// Given the number of line `line_no`, returns its start offset.
210    #[inline]
211    pub fn line_offset(&self, line_no: usize) -> Option<Offset> {
212        if line_no < self.begin {
213            return None;
214        }
215        let line_no = line_no - self.begin;
216        self.slice.get(line_no).copied()
217    }
218
219    /// Given the number of line `line_no`,
220    /// Returns the byte range of the given line.
221    ///
222    /// The returned range is half-open: `[start, end)`, where `start` is the
223    /// beginning of the line and `end` is the beginning of the next line
224    /// (or EOF for the last line).
225    ///
226    /// # Returns
227    /// - `Some(range)` if the line exists.
228    /// - `None` if the line index is out of bounds.
229    ///
230    /// # Invariants
231    /// - The internal index always contains a sentinel EOF offset,
232    ///   so `line_offset(line_no + 1)` is valid for the last line.
233    ///
234    /// # Notes
235    /// - The range is expressed in **byte offsets**, not character indices.
236    pub fn line_span(&self, line_no: usize) -> Option<ops::Range<Offset>> {
237        let start = self.line_offset(line_no)?;
238        let end = self.line_offset(line_no + 1)?; // it's safe here since we have a fake ending
239
240        Some(start..end)
241    }
242
243    /// The beginning of the whole query range
244    #[inline]
245    pub fn beginning(&self) -> Option<Offset> {
246        self.line_offset(0)
247    }
248
249    /// The ending of the whole query range
250    #[inline]
251    pub fn ending(&self) -> Option<Offset> {
252        self.slice.last().copied()
253    }
254
255    /// check if contains the given offset
256    pub fn contains(&self, offset: Offset) -> bool {
257        let Some(begin) = self.beginning() else {
258            return false;
259        };
260        let Some(end) = self.ending() else {
261            return false;
262        };
263
264        offset >= begin && offset < end
265    }
266
267    /// Locate the line index for a given byte `offset`.
268    ///
269    /// This method performs a binary search over the internal line index to find
270    /// the line whose span contains the given offset.
271    ///
272    /// # Returns
273    /// - `Some(line)` if the offset lies within a known line.
274    /// - `None` if:
275    ///   - the offset is before the beginning of the first line, or
276    ///   - the offset is at or beyond the sentinel EOF offset.
277    ///
278    /// # Invariants
279    /// - `self.slice` is a sorted list of line starting offsets.
280    /// - The last element of `self.slice` is a sentinel EOF offset.
281    /// - Each line `i` corresponds to the half-open interval:
282    ///   `[slice[i], slice[i + 1])`.
283    ///
284    /// # Notes
285    /// - The returned line index is zero-based.
286    /// - If `offset == EOF`, this method returns `None`, since EOF is not
287    ///   considered part of any line.
288    #[inline]
289    pub fn locate_line(&self, offset: Offset) -> Option<usize> {
290        binary_search_between(&self.slice, offset).map(|n| self.begin + n)
291    }
292
293    /// Locate the (line, column) position for a given byte `offset`.
294    ///
295    /// This method uses the existing line index without performing any I/O.
296    /// It resolves the line containing the offset, then computes the column
297    /// as the byte distance from the beginning of that line.
298    ///
299    /// # Returns
300    /// - `Some(ZeroBased(line, column))` if the offset lies within a known range.
301    /// - `None` if the offset is out of bounds or beyond the indexed data.
302    ///
303    /// # Invariants
304    /// - The internal index contains valid starting offsets for all indexed lines.
305    /// - Therefore, `line_offset(line)` is guaranteed to succeed for any line
306    ///   returned by `locate_line`.
307    ///
308    /// # Notes
309    /// - Both line and column are zero-based.
310    /// - Column is measured in **bytes**, not characters.
311    /// - This method does not attempt to extend the index; for streaming input,
312    ///   use the mutable variant instead.
313    pub fn locate(&self, offset: Offset) -> Option<line_column::ZeroBased> {
314        let line = self.locate_line(offset)?;
315        let line_offset = self.line_offset(line).unwrap();
316        let col = offset - line_offset;
317
318        Some((line, col.raw()).into())
319    }
320
321    /// Encode a (line, column) location into a byte `Offset`.
322    ///
323    /// This method uses the existing line index without performing any I/O.
324    /// It validates that the resulting offset lies within the bounds of the line.
325    ///
326    /// # Returns
327    /// - `Some(offset)` if the position is valid.
328    /// - `None` if:
329    ///   - the line does not exist, or
330    ///   - the column is out of bounds.
331    ///
332    /// # Invariants
333    /// - `line_span(line)` returns a half-open range `[start, end)`.
334    ///
335    /// # Notes
336    /// - Column is interpreted as a **byte offset** relative to the start of the line.
337    /// - No UTF-8 character boundary checks are performed.
338    pub fn encode(&self, location: line_column::ZeroBased) -> Option<Offset> {
339        let (line, col) = location.raw();
340
341        let range = self.line_span(line)?;
342        let offset = range.start + col;
343        range.contains(&offset).then_some(offset)
344    }
345}
346
347/// SAFETY: Assuming `xs` is ordered, try to find a interval where `x` lies.  
348/// returns the start index of interval
349///
350/// NOTE: if the input `x` equals the last element of `xs`, this function still returns `None`.
351/// This is because the last element is regarded as an fake ending.
352fn binary_search_between<A: Ord + Copy>(xs: &[A], x: A) -> Option<usize> {
353    if xs.len() <= 1 {
354        return None;
355    }
356    if x == xs[0] {
357        return Some(0);
358    }
359    if x < xs[0] {
360        return None;
361    }
362
363    let mut start = 0;
364    let mut end = xs.len() - 1;
365    while start < end {
366        // base case
367        if start == end - 1 && xs[start] <= x && x < xs[end] {
368            return Some(start);
369        }
370
371        // binary search
372        let mid = start + ((end - start) >> 1);
373        let y = xs[mid];
374        if x == y {
375            return Some(mid);
376        }
377
378        if x < y {
379            end = mid;
380            continue;
381        }
382        // x > y
383        if start == mid {
384            return None;
385        }
386        start = mid;
387    }
388
389    None
390}
391
392#[cfg(test)]
393mod test {
394    use super::*;
395    use quickcheck_macros::quickcheck;
396
397    fn linear_search_between<A: Ord + Copy>(xs: &[A], x: A) -> Option<usize> {
398        if xs.len() <= 1 {
399            return None;
400        }
401
402        for i in 0..xs.len() - 1 {
403            if xs[i] <= x && x < xs[i + 1] {
404                return Some(i);
405            }
406        }
407        None
408    }
409
410    #[quickcheck]
411    fn prop_binary_search_between(mut xs: Vec<i64>, x: i64) -> bool {
412        xs.sort();
413        xs.dedup();
414
415        if xs.len() < 2 {
416            return true;
417        }
418
419        let res0 = linear_search_between(&xs, x);
420        let res1 = binary_search_between(&xs, x);
421
422        res0 == res1
423    }
424
425    #[test]
426    fn test_binary_search() {
427        let xs = [2, 4, 6];
428        let i = binary_search_between(&xs, 3);
429        assert_eq!(i, Some(0));
430
431        let i = binary_search_between(&xs, 4);
432        assert_eq!(i, Some(1));
433
434        let i = binary_search_between(&xs, 1);
435        assert_eq!(i, None);
436
437        let i = binary_search_between(&xs, 7);
438        assert_eq!(i, None);
439
440        let i = binary_search_between(&xs, 6);
441        assert_eq!(i, None);
442    }
443}