Skip to main content

reading_liner/
index.rs

1use std::ops;
2
3use crate::location::{Offset, line_column};
4
5/// An index built for fast convertion between byte offsets and line-column locations.
6///
7/// NOTE: the last element of `line_offsets` should be considered as a fake offset (sentinel):
8///
9/// `self.line_offsets: [line0, line1, ..., EOF]`
10///
11/// By the word 'fake', we means that if some offset >= the last offset,
12/// it should be seen as exceding the index since the last element only marks ending.
13///
14/// # Invariants
15///
16/// - `index` is never empty.
17/// - `index[0] == 0`.
18/// - `index` is monotonically increasing.
19/// - `index` is append-only (no removals or mutations).
20/// - A sentinel EOF offset is always present as the last element.
21///
22/// Therefore:
23/// - Valid logical line indices are `0..self.count()`,
24///   where `count() = index.len() - 1`.
25/// - For any valid line `i`, the byte range is:
26///   `[index[i], index[i + 1])`.
27#[derive(Debug)]
28pub struct Index {
29    /// vector of offsets whose element records the beginning offset of source UTF8 string
30    line_offsets: Vec<Offset>,
31}
32
33impl Index {
34    /// An index with the first line starting at offset 0, which is the most common usage.
35    ///
36    /// The zero is safe here since it just means an ending, which also means empty.
37    pub fn new() -> Self {
38        Self {
39            line_offsets: vec![0.into()],
40        }
41    }
42
43    /// length of index
44    /// The API has guaranteed that self.line_offsets.len() > 0
45    #[inline]
46    pub fn count(&self) -> usize {
47        debug_assert!(!self.line_offsets.is_empty());
48        self.line_offsets.len() - 1
49    }
50
51    /// ending offset of the source
52    #[inline]
53    pub fn end(&self) -> Option<Offset> {
54        self.line_offsets.last().copied()
55    }
56
57    /// into vector of offsets
58    #[inline]
59    pub fn into_offsets(self) -> Vec<Offset> {
60        self.line_offsets
61    }
62}
63
64impl Index {
65    /// Get the query and freeze index when querying
66    pub fn query(&self) -> Query<'_> {
67        Query::from(&self.line_offsets[..])
68    }
69
70    pub fn get_line_offset_mut(&mut self, line_no: usize) -> Option<&mut Offset> {
71        self.line_offsets.get_mut(line_no)
72    }
73
74    /// Add next line offset to index
75    pub fn add_next_line(&mut self, offset: Offset) {
76        self.line_offsets.push(offset);
77    }
78
79    /// Reset the index
80    pub fn clear(&mut self) {
81        self.line_offsets.clear();
82        self.add_next_line(0.into());
83    }
84}
85
86/// Query line and offset information.
87///
88/// NOTE: Since the `Query` can be sliced, we carefully store an extra beginning offset to trace slice beginning:
89///
90/// `self.slice: [line0, line1, ..., EOF]`
91///
92/// One should keep in mind that all line numbers passed into query methods should be numbers **from the original [Index]**
93/// and the slice is *non-empty*.
94#[derive(Debug)]
95pub struct Query<'index> {
96    /// the beginning line number,
97    /// used to recover line number since `slice` lacks beginning offset from the original [Index]
98    begin: usize,
99
100    slice: &'index [Offset],
101}
102
103impl<'index> Query<'index> {
104    /// This builder is internal since we don't want users to accidentally build a Query with empty slice
105    fn new(begin: usize, slice: &'index [Offset]) -> Self {
106        Self { begin, slice }
107    }
108
109    /// build from raw slice, assuming the beginning is zero
110    fn from(slice: &'index [Offset]) -> Self {
111        Self { begin: 0, slice }
112    }
113
114    /// Returns a zero-copy view over a subrange of lines.
115    ///
116    /// The input `range` is interpreted over the *logical* line indices,
117    /// i.e. `0..self.count()`, where `count() = self.slice.len() - 1`.
118    ///
119    /// Internally, `self.slice` stores a sentinel EOF offset as the last element:
120    /// `[line0, line1, ..., EOF]`.
121    ///
122    /// To preserve this invariant, the returned slice includes one extra element
123    /// at the end (the sentinel), so the actual slice is:
124    ///
125    /// ```text
126    /// slice[range.start .. range.end + 1]
127    /// ```
128    /// This ensures that every line `i` in the resulting view satisfies:
129    /// ```text
130    /// line i = [slice[i], slice[i+1])
131    /// ```
132    ///
133    /// # Panics
134    /// Panics if:
135    /// - `range.start > range.end`
136    /// - `range.end > self.count()`
137    ///
138    /// These conditions indicate a violation of the API contract.
139    ///
140    /// # Performance
141    /// This operation is zero-copy and does not allocate.
142    ///
143    /// Invariant:
144    /// - self.slice.len() >= 1
145    /// - last element is EOF
146    /// - valid line indices: 0..self.slice.len()-1
147    pub fn range(&self, range: ops::Range<usize>) -> Self {
148        assert!(range.start <= range.end);
149        assert!(range.end <= self.count());
150
151        let range = range.start..range.end + 1;
152        Self::new(range.start, &self.slice[range])
153    }
154
155    /// Returns a zero-copy view over lines starting from `range_from.start`
156    /// to the end.
157    ///
158    /// The input is interpreted over the *logical* line indices,
159    /// i.e. `0..self.count()`.
160    ///
161    /// Internally, `self.slice` always ends with a sentinel EOF offset:
162    /// `[line0, line1, ..., EOF]`.
163    ///
164    /// Therefore, slicing with `slice[start..]` naturally preserves the sentinel,
165    /// and no additional adjustment is needed.
166    ///
167    /// The resulting view satisfies:
168    /// ```text
169    /// line i = [slice[i], slice[i+1])
170    /// ```
171    ///
172    /// # Panics
173    /// Panics if:
174    /// - `range_from.start > self.count()`
175    ///
176    /// # Edge Cases
177    /// - If `range_from.start == self.count()`, the returned slice contains only
178    ///   the EOF sentinel. This represents an empty range of lines.
179    ///
180    /// # Performance
181    /// This operation is zero-copy and does not allocate.
182    ///
183    /// invariant: self.slice always ends with EOF
184    /// so slice[start..] always contains a valid sentinel
185    pub fn range_from(&self, range_from: ops::RangeFrom<usize>) -> Self {
186        assert!(range_from.start <= self.count());
187
188        Self::new(range_from.start, &self.slice[range_from])
189    }
190
191    /// Count the total lines
192    pub fn count(&self) -> usize {
193        debug_assert!(!self.slice.is_empty());
194        self.slice.len() - 1
195    }
196}
197
198impl Query<'_> {
199    /// Given the number of line `line_no`, returns its start offset.
200    #[inline]
201    pub fn line_offset(&self, line_no: usize) -> Option<Offset> {
202        if line_no < self.begin {
203            return None;
204        }
205        let line_no = line_no - self.begin;
206        self.slice.get(line_no).copied()
207    }
208
209    /// Given the number of line `line_no`,
210    /// Returns the byte range of the given line.
211    ///
212    /// The returned range is half-open: `[start, end)`, where `start` is the
213    /// beginning of the line and `end` is the beginning of the next line
214    /// (or EOF for the last line).
215    ///
216    /// # Returns
217    /// - `Some(range)` if the line exists.
218    /// - `None` if the line index is out of bounds.
219    ///
220    /// # Invariants
221    /// - The internal index always contains a sentinel EOF offset,
222    ///   so `line_offset(line_no + 1)` is valid for the last line.
223    ///
224    /// # Notes
225    /// - The range is expressed in **byte offsets**, not character indices.
226    pub fn line_span(&self, line_no: usize) -> Option<ops::Range<Offset>> {
227        let start = self.line_offset(line_no)?;
228        let end = self.line_offset(line_no + 1)?; // it's safe here since we have a fake ending
229
230        Some(start..end)
231    }
232
233    /// The beginning of the whole query range
234    #[inline]
235    pub fn beginning(&self) -> Option<Offset> {
236        self.line_offset(0)
237    }
238
239    /// The ending of the whole query range
240    #[inline]
241    pub fn ending(&self) -> Option<Offset> {
242        self.slice.last().copied()
243    }
244
245    /// check if contains the given offset
246    pub fn contains(&self, offset: Offset) -> bool {
247        let Some(begin) = self.beginning() else {
248            return false;
249        };
250        let Some(end) = self.ending() else {
251            return false;
252        };
253
254        offset >= begin && offset < end
255    }
256
257    /// Locate the line index for a given byte `offset`.
258    ///
259    /// This method performs a binary search over the internal line index to find
260    /// the line whose span contains the given offset.
261    ///
262    /// # Returns
263    /// - `Some(line)` if the offset lies within a known line.
264    /// - `None` if:
265    ///   - the offset is before the beginning of the first line, or
266    ///   - the offset is at or beyond the sentinel EOF offset.
267    ///
268    /// # Invariants
269    /// - `self.slice` is a sorted list of line starting offsets.
270    /// - The last element of `self.slice` is a sentinel EOF offset.
271    /// - Each line `i` corresponds to the half-open interval:
272    ///   `[slice[i], slice[i + 1])`.
273    ///
274    /// # Notes
275    /// - The returned line index is zero-based.
276    /// - If `offset == EOF`, this method returns `None`, since EOF is not
277    ///   considered part of any line.
278    #[inline]
279    pub fn locate_line(&self, offset: Offset) -> Option<usize> {
280        binary_search_between(&self.slice, offset).map(|n| self.begin + n)
281    }
282
283    /// Locate the (line, column) position for a given byte `offset`.
284    ///
285    /// This method uses the existing line index without performing any I/O.
286    /// It resolves the line containing the offset, then computes the column
287    /// as the byte distance from the beginning of that line.
288    ///
289    /// # Returns
290    /// - `Some(ZeroBased(line, column))` if the offset lies within a known range.
291    /// - `None` if the offset is out of bounds or beyond the indexed data.
292    ///
293    /// # Invariants
294    /// - The internal index contains valid starting offsets for all indexed lines.
295    /// - Therefore, `line_offset(line)` is guaranteed to succeed for any line
296    ///   returned by `locate_line`.
297    ///
298    /// # Notes
299    /// - Both line and column are zero-based.
300    /// - Column is measured in **bytes**, not characters.
301    /// - This method does not attempt to extend the index; for streaming input,
302    ///   use the mutable variant instead.
303    pub fn locate(&self, offset: Offset) -> Option<line_column::ZeroBased> {
304        let line = self.locate_line(offset)?;
305        let line_offset = self.line_offset(line).unwrap();
306        let col = offset - line_offset;
307
308        Some((line, col.raw()).into())
309    }
310
311    /// Encode a (line, column) location into a byte `Offset`.
312    ///
313    /// This method uses the existing line index without performing any I/O.
314    /// It validates that the resulting offset lies within the bounds of the line.
315    ///
316    /// # Returns
317    /// - `Some(offset)` if the position is valid.
318    /// - `None` if:
319    ///   - the line does not exist, or
320    ///   - the column is out of bounds.
321    ///
322    /// # Invariants
323    /// - `line_span(line)` returns a half-open range `[start, end)`.
324    ///
325    /// # Notes
326    /// - Column is interpreted as a **byte offset** relative to the start of the line.
327    /// - No UTF-8 character boundary checks are performed.
328    pub fn encode(&self, location: line_column::ZeroBased) -> Option<Offset> {
329        let (line, col) = location.raw();
330
331        let range = self.line_span(line)?;
332        let offset = range.start + col;
333        range.contains(&offset).then_some(offset)
334    }
335}
336
337/// SAFETY: Assuming `xs` is ordered, try to find a interval where `x` lies.  
338/// returns the start index of interval
339///
340/// NOTE: if the input `x` equals the last element of `xs`, this function still returns `None`.
341/// This is because the last element is regarded as an fake ending.
342fn binary_search_between<A: Ord + Copy>(xs: &[A], x: A) -> Option<usize> {
343    if xs.len() <= 1 {
344        return None;
345    }
346    if x == xs[0] {
347        return Some(0);
348    }
349    if x < xs[0] {
350        return None;
351    }
352
353    let mut start = 0;
354    let mut end = xs.len() - 1;
355    while start < end {
356        // base case
357        if start == end - 1 && xs[start] <= x && x < xs[end] {
358            return Some(start);
359        }
360
361        // binary search
362        let mid = start + ((end - start) >> 1);
363        let y = xs[mid];
364        if x == y {
365            return Some(mid);
366        }
367
368        if x < y {
369            end = mid;
370            continue;
371        }
372        // x > y
373        if start == mid {
374            return None;
375        }
376        start = mid;
377    }
378
379    None
380}
381
382#[cfg(test)]
383mod test {
384    use super::*;
385    use quickcheck_macros::quickcheck;
386
387    fn linear_search_between<A: Ord + Copy>(xs: &[A], x: A) -> Option<usize> {
388        if xs.len() <= 1 {
389            return None;
390        }
391
392        for i in 0..xs.len() - 1 {
393            if xs[i] <= x && x < xs[i + 1] {
394                return Some(i);
395            }
396        }
397        None
398    }
399
400    #[quickcheck]
401    fn prop_binary_search_between(mut xs: Vec<i64>, x: i64) -> bool {
402        xs.sort();
403        xs.dedup();
404
405        if xs.len() < 2 {
406            return true;
407        }
408
409        let res0 = linear_search_between(&xs, x);
410        let res1 = binary_search_between(&xs, x);
411
412        res0 == res1
413    }
414
415    #[test]
416    fn test_binary_search() {
417        let xs = [2, 4, 6];
418        let i = binary_search_between(&xs, 3);
419        assert_eq!(i, Some(0));
420
421        let i = binary_search_between(&xs, 4);
422        assert_eq!(i, Some(1));
423
424        let i = binary_search_between(&xs, 1);
425        assert_eq!(i, None);
426
427        let i = binary_search_between(&xs, 7);
428        assert_eq!(i, None);
429
430        let i = binary_search_between(&xs, 6);
431        assert_eq!(i, None);
432    }
433}