Skip to main content

tess/
line_index.rs

1use crate::source::Source;
2use regex::bytes::Regex;
3use std::ops::Range;
4
5pub struct LineIndex {
6    starts: Vec<usize>,
7    record_starts: Vec<usize>,
8    record_start_regex: Option<Regex>,
9    scanned_through: usize,
10    start_byte: usize,
11    pending_line_start: bool,
12    head_cap: Option<usize>,
13    /// True once we've committed the first record (either a real match
14    /// or the synthetic record-0 absorbing orphan-head lines). Always
15    /// true when no record_start_regex is set.
16    record_zero_committed: bool,
17}
18
19impl Default for LineIndex {
20    fn default() -> Self {
21        Self::new()
22    }
23}
24
25impl LineIndex {
26    pub fn new() -> Self {
27        Self::new_starting_at(0)
28    }
29
30    /// Construct a line index that begins at the given byte offset. Bytes
31    /// before `start_byte` are never scanned and never appear in `line_range`.
32    /// Used by `--tail` to skip past the head of the source.
33    pub fn new_starting_at(start_byte: usize) -> Self {
34        Self {
35            starts: vec![start_byte],
36            record_starts: vec![start_byte],
37            record_start_regex: None,
38            scanned_through: start_byte,
39            start_byte,
40            pending_line_start: false,
41            head_cap: None,
42            record_zero_committed: true,
43        }
44    }
45
46    /// Limit the index to the first N logical lines from the start point.
47    /// `line_count` clamps to this and `extend_to_byte` stops scanning
48    /// past it. Used by `--head N`.
49    pub fn set_head_cap(&mut self, cap: usize) {
50        self.head_cap = Some(cap);
51    }
52
53    /// Enable records mode using the supplied regex. Must be called before
54    /// any scanning has begun. Re-calling panics in debug builds.
55    pub fn set_record_start(&mut self, re: Regex) {
56        debug_assert!(
57            self.scanned_through == self.start_byte && self.starts.len() == 1,
58            "set_record_start must be called before scanning"
59        );
60        self.record_start_regex = Some(re);
61        self.record_zero_committed = false;
62    }
63
64    /// True iff records mode is active (a regex was set).
65    pub fn records_mode(&self) -> bool {
66        self.record_start_regex.is_some()
67    }
68
69    pub fn line_count(&self) -> usize {
70        let raw = if self.scanned_through == self.start_byte && self.starts.len() == 1 {
71            0
72        } else {
73            self.starts.len()
74        };
75        match self.head_cap {
76            Some(cap) => raw.min(cap),
77            None => raw,
78        }
79    }
80
81    /// True once we've scanned one entry past the cap. We always keep a
82    /// "sentinel" entry beyond the cap so that `line_range(cap-1)` knows
83    /// where the last visible line ends.
84    fn at_scan_cap(&self) -> bool {
85        matches!(self.head_cap, Some(cap) if self.starts.len() > cap)
86    }
87
88    /// Scan `src` from `scanned_through` to at least byte position `target_byte`.
89    fn extend_to_byte(&mut self, src: &dyn Source, target_byte: usize) {
90        if self.at_scan_cap() {
91            return;
92        }
93        if matches!(self.head_cap, Some(0)) {
94            return;
95        }
96        let total = src.len();
97        let stop = target_byte.min(total);
98        if self.scanned_through >= stop {
99            return;
100        }
101
102        if self.pending_line_start {
103            let line_start = self.scanned_through;
104            self.starts.push(line_start);
105            self.maybe_push_record_start(line_start, src);
106            self.pending_line_start = false;
107            if self.at_scan_cap() {
108                return;
109            }
110        }
111
112        let chunk = src.bytes(self.scanned_through..total);
113        let mut pos = self.scanned_through;
114        for &b in chunk.iter() {
115            pos += 1;
116            if b == b'\n' {
117                if pos < total {
118                    let new_line_start = pos;
119                    self.starts.push(new_line_start);
120                    self.maybe_push_record_start(new_line_start, src);
121                    if self.at_scan_cap() {
122                        self.scanned_through = pos;
123                        return;
124                    }
125                } else {
126                    self.pending_line_start = true;
127                }
128            }
129            if pos >= stop && b == b'\n' {
130                self.scanned_through = pos;
131                return;
132            }
133        }
134        self.scanned_through = total;
135    }
136
137    /// Decide whether a newly-discovered line at `line_start` should also
138    /// start a new record. In line-per-record mode, every line is a record.
139    /// In records mode, we test the line's bytes against the regex.
140    fn maybe_push_record_start(&mut self, line_start: usize, src: &dyn Source) {
141        match &self.record_start_regex {
142            None => {
143                // Line-per-record: every line-start push is also a record-start.
144                self.record_starts.push(line_start);
145            }
146            Some(re) => {
147                let line_end = self.find_line_end(line_start, src);
148                let line_bytes = src.bytes(line_start..line_end);
149                let is_match = re.is_match(&line_bytes);
150                if is_match {
151                    if !self.record_zero_committed {
152                        // First time we've seen a real record-start match.
153                        if line_start == self.start_byte {
154                            // The bootstrap entry at start_byte was a placeholder;
155                            // this line is record 0. No second push needed —
156                            // record_starts[0] already equals start_byte.
157                        } else {
158                            // Lines before this one form synthetic record 0; the
159                            // bootstrap entry stays as record 0's start. Push the
160                            // new line's offset as record 1's start.
161                            self.record_starts.push(line_start);
162                        }
163                        self.record_zero_committed = true;
164                    } else {
165                        self.record_starts.push(line_start);
166                    }
167                } else if !self.record_zero_committed && line_start == self.start_byte {
168                    // Very first line is a non-match: synthetic record 0 will
169                    // absorb it and subsequent continuations. The bootstrap
170                    // entry stays; mark the synthetic record committed.
171                    self.record_zero_committed = true;
172                }
173                // Otherwise: continuation line. No push.
174            }
175        }
176    }
177
178    /// Find the byte position one past the end of the line starting at
179    /// `line_start`. Reads forward until the next `\n` or EOF. Used by
180    /// `maybe_push_record_start` to extract the line bytes for regex testing.
181    fn find_line_end(&self, line_start: usize, src: &dyn Source) -> usize {
182        let total = src.len();
183        let chunk = src.bytes(line_start..total);
184        for (i, &b) in chunk.iter().enumerate() {
185            if b == b'\n' {
186                return line_start + i;
187            }
188        }
189        total
190    }
191
192    pub fn extend_to_line(&mut self, n: usize, src: &dyn Source) {
193        while self.starts.len() <= n && self.scanned_through < src.len() {
194            if self.at_scan_cap() {
195                // head_cap is set and we've already scanned the sentinel past
196                // the cap; no further progress is possible.
197                return;
198            }
199            self.extend_to_byte(src, src.len());
200        }
201    }
202
203    pub fn extend_to_end(&mut self, src: &dyn Source) {
204        self.extend_to_byte(src, src.len());
205    }
206
207    pub fn notice_new_bytes(&mut self, src: &dyn Source) {
208        self.extend_to_byte(src, src.len());
209    }
210
211    /// Byte position up to which the index has scanned.
212    pub fn scanned_through(&self) -> usize {
213        self.scanned_through
214    }
215
216    /// Extend the index until `byte` is covered (or EOF). Public wrapper
217    /// around the private `extend_to_byte`, for callers that need to query
218    /// a specific byte position.
219    pub fn extend_to_byte_for_query(&mut self, src: &dyn Source, byte: usize) {
220        self.extend_to_byte(src, byte);
221    }
222
223    /// Find the physical-line index containing the byte at position `byte`.
224    /// Returns `None` if `byte` is before `start_byte` or at/after
225    /// `scanned_through`.
226    pub fn line_at_byte(&self, byte: usize) -> Option<usize> {
227        if byte < self.start_byte || byte >= self.scanned_through {
228            return None;
229        }
230        match self.starts.binary_search(&byte) {
231            Ok(idx) => Some(idx),
232            Err(0) => None,
233            Err(idx) => Some(idx - 1),
234        }
235    }
236
237    /// Byte range of line `n` (excluding the trailing newline).
238    /// Caller must ensure n < line_count() and the index has scanned through the line.
239    pub fn line_range(&self, n: usize, src: &dyn Source) -> Range<usize> {
240        let start = self.starts[n];
241        // When head_cap is set, line `cap-1` is the last visible line. Its end
242        // is the byte just before line `cap`'s start (a real line break exists
243        // there, otherwise the cap wouldn't have been reached). The "last line
244        // unbounded" branch below should not be entered for a capped line.
245        let next_known = self.starts.get(n + 1).copied();
246        let end = if let Some(next_start) = next_known {
247            // Drop the trailing newline preceding the next line start.
248            next_start - 1
249        } else {
250            // Last line: from start to current scanned end (minus trailing \n if present).
251            let total_scanned = src.len().min(self.scanned_through.max(start));
252            if total_scanned > start && src.bytes(total_scanned - 1..total_scanned)[0] == b'\n' {
253                total_scanned - 1
254            } else {
255                total_scanned
256            }
257        };
258        start..end
259    }
260
261    /// Number of records exposed by this index. Equals `line_count()` when
262    /// records mode is inactive. May be less than `line_count()` when
263    /// records mode is active and records span multiple physical lines.
264    pub fn record_count(&self) -> usize {
265        let raw = if self.scanned_through == self.start_byte && self.record_starts.len() == 1
266            && self.record_start_regex.is_none()
267        {
268            // Empty source, no records mode: zero records.
269            0
270        } else if self.scanned_through == self.start_byte && self.record_starts.len() == 1
271            && self.record_start_regex.is_some() && !self.record_zero_committed
272        {
273            // Empty source, records mode set but nothing scanned yet: zero records.
274            0
275        } else {
276            self.record_starts.len()
277        };
278        match self.head_cap {
279            Some(0) => 0,
280            Some(cap) => {
281                let visible_lines = raw.min(self.starts.len()).min(cap);
282                self.line_to_record_inner(visible_lines.saturating_sub(1))
283                    .map(|r| r + 1)
284                    .unwrap_or(0)
285            }
286            None => raw,
287        }
288    }
289
290    /// Byte range of record `n` including embedded `\n`s. Excludes the
291    /// trailing `\n` after the record's last physical line (if any).
292    pub fn record_range(&self, n: usize, src: &dyn Source) -> Range<usize> {
293        let start = self.record_starts[n];
294        let end = if n + 1 < self.record_starts.len() {
295            // End is just before the next record-start; that byte is a `\n`.
296            self.record_starts[n + 1] - 1
297        } else {
298            // Last record: extend to scanned_through, trimming trailing `\n`.
299            let total_scanned = src.len().min(self.scanned_through.max(start));
300            if total_scanned > start && src.bytes(total_scanned - 1..total_scanned)[0] == b'\n' {
301                total_scanned - 1
302            } else {
303                total_scanned
304            }
305        };
306        start..end
307    }
308
309    /// Range of physical line indices `[first..last)` covered by record `n`.
310    pub fn record_line_range(&self, n: usize) -> Range<usize> {
311        let first_line = self.starts.binary_search(&self.record_starts[n])
312            .expect("record start is always a line start");
313        let last_line = if n + 1 < self.record_starts.len() {
314            self.starts.binary_search(&self.record_starts[n + 1])
315                .expect("record start is always a line start")
316        } else {
317            self.starts.len()
318        };
319        first_line..last_line
320    }
321
322    /// Record index that contains physical line `line_n`. O(log records).
323    pub fn line_to_record(&self, line_n: usize) -> usize {
324        self.line_to_record_inner(line_n).unwrap_or(0)
325    }
326
327    fn line_to_record_inner(&self, line_n: usize) -> Option<usize> {
328        if self.starts.len() <= line_n {
329            return None;
330        }
331        let line_start = self.starts[line_n];
332        match self.record_starts.binary_search(&line_start) {
333            Ok(idx) => Some(idx),
334            Err(0) => Some(0),
335            Err(idx) => Some(idx - 1),
336        }
337    }
338
339    /// Contiguous byte slice covering record `n`. Embedded `\n`s present.
340    pub fn record_bytes<'a>(&self, n: usize, src: &'a dyn Source) -> std::borrow::Cow<'a, [u8]> {
341        let r = self.record_range(n, src);
342        src.bytes(r)
343    }
344}
345
346#[cfg(test)]
347mod tests {
348    use super::*;
349    use crate::source::MockSource;
350    use regex::bytes::Regex;
351
352    #[test]
353    fn empty_source_zero_lines() {
354        let m = MockSource::new();
355        let mut idx = LineIndex::new();
356        idx.extend_to_end(&m);
357        assert_eq!(idx.line_count(), 0);
358    }
359
360    #[test]
361    fn single_line_no_newline() {
362        let m = MockSource::new();
363        m.append(b"hello");
364        let mut idx = LineIndex::new();
365        idx.extend_to_end(&m);
366        assert_eq!(idx.line_count(), 1);
367        assert_eq!(idx.line_range(0, &m), 0..5);
368    }
369
370    #[test]
371    fn single_line_trailing_newline() {
372        let m = MockSource::new();
373        m.append(b"hello\n");
374        let mut idx = LineIndex::new();
375        idx.extend_to_end(&m);
376        assert_eq!(idx.line_count(), 1);
377        assert_eq!(idx.line_range(0, &m), 0..5);
378    }
379
380    #[test]
381    fn multiple_lines() {
382        let m = MockSource::new();
383        m.append(b"a\nbb\nccc\n");
384        let mut idx = LineIndex::new();
385        idx.extend_to_end(&m);
386        assert_eq!(idx.line_count(), 3);
387        assert_eq!(idx.line_range(0, &m), 0..1);
388        assert_eq!(idx.line_range(1, &m), 2..4);
389        assert_eq!(idx.line_range(2, &m), 5..8);
390    }
391
392    #[test]
393    fn head_cap_truncates_line_count() {
394        let m = MockSource::new();
395        m.append(b"1\n2\n3\n4\n5\n6\n7\n8\n");  // 8 lines
396        let mut idx = LineIndex::new();
397        idx.set_head_cap(3);
398        idx.extend_to_end(&m);
399        assert_eq!(idx.line_count(), 3, "should be capped to 3 lines");
400        // Lines 0..2 inclusive must have correct ranges.
401        assert_eq!(idx.line_range(0, &m), 0..1);
402        assert_eq!(idx.line_range(1, &m), 2..3);
403        assert_eq!(idx.line_range(2, &m), 4..5);
404    }
405
406    #[test]
407    fn head_cap_extend_to_line_terminates() {
408        // Regression: extend_to_line(n) used to spin forever when head_cap
409        // had already been hit, because extend_to_byte returned without
410        // advancing scanned_through.
411        let m = MockSource::new();
412        m.append(b"1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n");
413        let mut idx = LineIndex::new();
414        idx.set_head_cap(3);
415        idx.extend_to_line(20, &m);  // far past the cap
416        assert_eq!(idx.line_count(), 3);
417    }
418
419    #[test]
420    fn head_cap_zero_yields_empty() {
421        let m = MockSource::new();
422        m.append(b"1\n2\n3\n");
423        let mut idx = LineIndex::new();
424        idx.set_head_cap(0);
425        idx.extend_to_end(&m);
426        assert_eq!(idx.line_count(), 0);
427    }
428
429    #[test]
430    fn start_byte_skips_head_of_source() {
431        let m = MockSource::new();
432        // 5 lines: alpha\nbeta\ngamma\ndelta\nepsilon\n
433        m.append(b"alpha\nbeta\ngamma\ndelta\nepsilon\n");
434        // gamma starts at byte 11.
435        let mut idx = LineIndex::new_starting_at(11);
436        idx.extend_to_end(&m);
437        assert_eq!(idx.line_count(), 3, "from byte 11 there are 3 lines: gamma, delta, epsilon");
438        assert_eq!(idx.line_range(0, &m), 11..16); // gamma
439        assert_eq!(idx.line_range(1, &m), 17..22); // delta
440        assert_eq!(idx.line_range(2, &m), 23..30); // epsilon
441    }
442
443    #[test]
444    fn start_byte_with_empty_remainder() {
445        let m = MockSource::new();
446        m.append(b"alpha\n");
447        let mut idx = LineIndex::new_starting_at(6);
448        idx.extend_to_end(&m);
449        assert_eq!(idx.line_count(), 0);
450    }
451
452    #[test]
453    fn incremental_growth_via_notice_new_bytes() {
454        let m = MockSource::new();
455        let mut idx = LineIndex::new();
456
457        m.append(b"alpha\n");
458        idx.notice_new_bytes(&m);
459        assert_eq!(idx.line_count(), 1);
460
461        m.append(b"beta\ngamm");
462        idx.notice_new_bytes(&m);
463        assert_eq!(idx.line_count(), 3); // alpha, beta, gamm (partial, but counted)
464
465        m.append(b"a\n");
466        idx.notice_new_bytes(&m);
467        assert_eq!(idx.line_count(), 3);
468        // "alpha\n" = bytes 0-5, "beta\n" = bytes 6-10, "gamma" = bytes 11-15
469        assert_eq!(idx.line_range(2, &m), 11..16); // "gamma"
470    }
471
472    fn re(pat: &str) -> Regex {
473        Regex::new(pat).unwrap()
474    }
475
476    #[test]
477    fn records_mirror_lines_when_no_regex() {
478        let m = MockSource::new();
479        m.append(b"a\nb\nc\n");
480        let mut idx = LineIndex::new();
481        idx.extend_to_end(&m);
482        assert_eq!(idx.line_count(), 3);
483        assert_eq!(idx.record_count(), 3);
484        for i in 0..3 {
485            assert_eq!(idx.record_range(i, &m), idx.line_range(i, &m));
486        }
487    }
488
489    #[test]
490    fn record_count_zero_for_empty_source_records_mode() {
491        let m = MockSource::new();
492        let mut idx = LineIndex::new();
493        idx.set_record_start(re(r"^\["));
494        idx.extend_to_end(&m);
495        assert_eq!(idx.record_count(), 0);
496    }
497
498    #[test]
499    fn records_group_continuations() {
500        let m = MockSource::new();
501        m.append(b"[1] head\n  more\n  more\n[2] head\n  more\n");
502        let mut idx = LineIndex::new();
503        idx.set_record_start(re(r"^\["));
504        idx.extend_to_end(&m);
505        assert_eq!(idx.line_count(), 5);
506        assert_eq!(idx.record_count(), 2);
507        let r0 = idx.record_range(0, &m);
508        assert_eq!(&m.bytes(r0)[..], b"[1] head\n  more\n  more");
509        let r1 = idx.record_range(1, &m);
510        assert_eq!(&m.bytes(r1)[..3], b"[2]");
511    }
512
513    #[test]
514    fn synthetic_record_zero_absorbs_orphan_head() {
515        let m = MockSource::new();
516        m.append(b"banner line 1\nbanner line 2\n[1] first real record\n");
517        let mut idx = LineIndex::new();
518        idx.set_record_start(re(r"^\["));
519        idx.extend_to_end(&m);
520        assert_eq!(idx.line_count(), 3);
521        assert_eq!(idx.record_count(), 2);
522        let r0 = idx.record_range(0, &m);
523        assert_eq!(&m.bytes(r0)[..], b"banner line 1\nbanner line 2");
524        assert_eq!(idx.record_line_range(0), 0..2);
525        assert_eq!(idx.record_line_range(1), 2..3);
526    }
527
528    #[test]
529    fn line_to_record_round_trips() {
530        let m = MockSource::new();
531        m.append(b"[1] a\n  cont\n[2] b\n  cont\n  cont\n");
532        let mut idx = LineIndex::new();
533        idx.set_record_start(re(r"^\["));
534        idx.extend_to_end(&m);
535        assert_eq!(idx.line_to_record(0), 0);  // "[1] a"
536        assert_eq!(idx.line_to_record(1), 0);  // "  cont"
537        assert_eq!(idx.line_to_record(2), 1);  // "[2] b"
538        assert_eq!(idx.line_to_record(3), 1);
539        assert_eq!(idx.line_to_record(4), 1);
540    }
541
542    #[test]
543    fn record_bytes_contains_embedded_newlines() {
544        let m = MockSource::new();
545        m.append(b"[1] head\n  more\n[2] next\n");
546        let mut idx = LineIndex::new();
547        idx.set_record_start(re(r"^\["));
548        idx.extend_to_end(&m);
549        let bytes = idx.record_bytes(0, &m);
550        assert_eq!(&*bytes, b"[1] head\n  more");
551    }
552
553    #[test]
554    fn no_match_at_all_is_one_synthetic_record() {
555        let m = MockSource::new();
556        m.append(b"plain text\nmore plain\nno brackets here\n");
557        let mut idx = LineIndex::new();
558        idx.set_record_start(re(r"^\["));
559        idx.extend_to_end(&m);
560        assert_eq!(idx.line_count(), 3);
561        assert_eq!(idx.record_count(), 1);
562        assert_eq!(idx.record_line_range(0), 0..3);
563    }
564
565    #[test]
566    fn pending_record_start_handles_growing_input() {
567        let m = MockSource::new();
568        let mut idx = LineIndex::new();
569        idx.set_record_start(re(r"^\["));
570        m.append(b"[1] head\n  more\n");
571        idx.notice_new_bytes(&m);
572        assert_eq!(idx.record_count(), 1);
573        m.append(b"[2] head\n");
574        idx.notice_new_bytes(&m);
575        assert_eq!(idx.record_count(), 2);
576    }
577
578    #[test]
579    fn empty_continuation_lines_are_continuations() {
580        let m = MockSource::new();
581        m.append(b"[1] head\n\n  after blank\n[2] next\n");
582        let mut idx = LineIndex::new();
583        idx.set_record_start(re(r"^\["));
584        idx.extend_to_end(&m);
585        assert_eq!(idx.line_count(), 4);
586        assert_eq!(idx.record_count(), 2);
587        assert_eq!(idx.record_line_range(0), 0..3);
588    }
589
590    #[test]
591    fn records_mode_reports_true_only_when_regex_set() {
592        let mut idx = LineIndex::new();
593        assert!(!idx.records_mode());
594        idx.set_record_start(re(r"^\["));
595        assert!(idx.records_mode());
596    }
597
598    #[test]
599    fn record_range_handles_unterminated_last_record() {
600        let m = MockSource::new();
601        m.append(b"[1] head\n[2] last line no newline");
602        let mut idx = LineIndex::new();
603        idx.set_record_start(re(r"^\["));
604        idx.extend_to_end(&m);
605        assert_eq!(idx.record_count(), 2);
606        let r1 = idx.record_range(1, &m);
607        assert_eq!(&m.bytes(r1)[..], b"[2] last line no newline");
608    }
609
610    #[test]
611    fn record_count_with_head_cap_zero_returns_zero_in_records_mode() {
612        let m = MockSource::new();
613        m.append(b"[1] head\n[2] next\n");
614        let mut idx = LineIndex::new();
615        idx.set_record_start(re(r"^\["));
616        idx.set_head_cap(0);
617        idx.extend_to_end(&m);
618        assert_eq!(idx.line_count(), 0);
619        assert_eq!(idx.record_count(), 0);
620    }
621}