Skip to main content

tess/
line_index.rs

1use crate::source::Source;
2use regex::bytes::Regex;
3use std::ops::Range;
4
5pub struct LineIndex {
6    starts: Vec<usize>,
7    record_starts: Vec<usize>,
8    record_start_regex: Option<Regex>,
9    scanned_through: usize,
10    start_byte: usize,
11    pending_line_start: bool,
12    head_cap: Option<usize>,
13    /// True once we've committed the first record (either a real match
14    /// or the synthetic record-0 absorbing orphan-head lines). Always
15    /// true when no record_start_regex is set.
16    record_zero_committed: bool,
17}
18
19impl Default for LineIndex {
20    fn default() -> Self {
21        Self::new()
22    }
23}
24
25impl LineIndex {
26    pub fn new() -> Self {
27        Self::new_starting_at(0)
28    }
29
30    /// Construct a line index that begins at the given byte offset. Bytes
31    /// before `start_byte` are never scanned and never appear in `line_range`.
32    /// Used by `--tail` to skip past the head of the source.
33    pub fn new_starting_at(start_byte: usize) -> Self {
34        Self {
35            starts: vec![start_byte],
36            record_starts: vec![start_byte],
37            record_start_regex: None,
38            scanned_through: start_byte,
39            start_byte,
40            pending_line_start: false,
41            head_cap: None,
42            record_zero_committed: true,
43        }
44    }
45
46    /// Limit the index to the first N logical lines from the start point.
47    /// `line_count` clamps to this and `extend_to_byte` stops scanning
48    /// past it. Used by `--head N`.
49    pub fn set_head_cap(&mut self, cap: usize) {
50        self.head_cap = Some(cap);
51    }
52
53    /// Enable records mode using the supplied regex. Must be called before
54    /// any scanning has begun. Re-calling panics in debug builds.
55    pub fn set_record_start(&mut self, re: Regex) {
56        debug_assert!(
57            self.scanned_through == self.start_byte && self.starts.len() == 1,
58            "set_record_start must be called before scanning"
59        );
60        self.record_start_regex = Some(re);
61        self.record_zero_committed = false;
62    }
63
64    /// True iff records mode is active (a regex was set).
65    pub fn records_mode(&self) -> bool {
66        self.record_start_regex.is_some()
67    }
68
69    pub fn line_count(&self) -> usize {
70        let raw = if self.scanned_through == self.start_byte && self.starts.len() == 1 {
71            0
72        } else {
73            self.starts.len()
74        };
75        match self.head_cap {
76            Some(cap) => raw.min(cap),
77            None => raw,
78        }
79    }
80
81    /// True once we've scanned one entry past the cap. We always keep a
82    /// "sentinel" entry beyond the cap so that `line_range(cap-1)` knows
83    /// where the last visible line ends.
84    fn at_scan_cap(&self) -> bool {
85        matches!(self.head_cap, Some(cap) if self.starts.len() > cap)
86    }
87
88    /// Scan `src` from `scanned_through` to at least byte position `target_byte`.
89    fn extend_to_byte(&mut self, src: &dyn Source, target_byte: usize) {
90        if self.at_scan_cap() {
91            return;
92        }
93        if matches!(self.head_cap, Some(0)) {
94            return;
95        }
96        let total = src.len();
97        let stop = target_byte.min(total);
98        if self.scanned_through >= stop {
99            return;
100        }
101
102        if self.pending_line_start {
103            let line_start = self.scanned_through;
104            self.starts.push(line_start);
105            self.maybe_push_record_start(line_start, src);
106            self.pending_line_start = false;
107            if self.at_scan_cap() {
108                return;
109            }
110        }
111
112        let chunk = src.bytes(self.scanned_through..total);
113        let mut pos = self.scanned_through;
114        for &b in chunk.iter() {
115            pos += 1;
116            if b == b'\n' {
117                if pos < total {
118                    let new_line_start = pos;
119                    self.starts.push(new_line_start);
120                    self.maybe_push_record_start(new_line_start, src);
121                    if self.at_scan_cap() {
122                        self.scanned_through = pos;
123                        return;
124                    }
125                } else {
126                    self.pending_line_start = true;
127                }
128            }
129            if pos >= stop && b == b'\n' {
130                self.scanned_through = pos;
131                return;
132            }
133        }
134        self.scanned_through = total;
135    }
136
137    /// Decide whether a newly-discovered line at `line_start` should also
138    /// start a new record. In line-per-record mode, every line is a record.
139    /// In records mode, we test the line's bytes against the regex.
140    fn maybe_push_record_start(&mut self, line_start: usize, src: &dyn Source) {
141        match &self.record_start_regex {
142            None => {
143                // Line-per-record: every line-start push is also a record-start.
144                self.record_starts.push(line_start);
145            }
146            Some(re) => {
147                let line_end = self.find_line_end(line_start, src);
148                let line_bytes = src.bytes(line_start..line_end);
149                let is_match = re.is_match(&line_bytes);
150                if is_match {
151                    if !self.record_zero_committed {
152                        // First time we've seen a real record-start match.
153                        if line_start == self.start_byte {
154                            // The bootstrap entry at start_byte was a placeholder;
155                            // this line is record 0. No second push needed —
156                            // record_starts[0] already equals start_byte.
157                        } else {
158                            // Lines before this one form synthetic record 0; the
159                            // bootstrap entry stays as record 0's start. Push the
160                            // new line's offset as record 1's start.
161                            self.record_starts.push(line_start);
162                        }
163                        self.record_zero_committed = true;
164                    } else {
165                        self.record_starts.push(line_start);
166                    }
167                } else if !self.record_zero_committed && line_start == self.start_byte {
168                    // Very first line is a non-match: synthetic record 0 will
169                    // absorb it and subsequent continuations. The bootstrap
170                    // entry stays; mark the synthetic record committed.
171                    self.record_zero_committed = true;
172                }
173                // Otherwise: continuation line. No push.
174            }
175        }
176    }
177
178    /// Find the byte position one past the end of the line starting at
179    /// `line_start`. Reads forward until the next `\n` or EOF. Used by
180    /// `maybe_push_record_start` to extract the line bytes for regex testing.
181    fn find_line_end(&self, line_start: usize, src: &dyn Source) -> usize {
182        let total = src.len();
183        let chunk = src.bytes(line_start..total);
184        for (i, &b) in chunk.iter().enumerate() {
185            if b == b'\n' {
186                return line_start + i;
187            }
188        }
189        total
190    }
191
192    pub fn extend_to_line(&mut self, n: usize, src: &dyn Source) {
193        while self.starts.len() <= n && self.scanned_through < src.len() {
194            if self.at_scan_cap() {
195                // head_cap is set and we've already scanned the sentinel past
196                // the cap; no further progress is possible.
197                return;
198            }
199            self.extend_to_byte(src, src.len());
200        }
201    }
202
203    pub fn extend_to_end(&mut self, src: &dyn Source) {
204        self.extend_to_byte(src, src.len());
205    }
206
207    pub fn notice_new_bytes(&mut self, src: &dyn Source) {
208        self.extend_to_byte(src, src.len());
209    }
210
211    /// Byte position up to which the index has scanned.
212    pub fn scanned_through(&self) -> usize {
213        self.scanned_through
214    }
215
216    /// Extend the index until `byte` is covered (or EOF). Public wrapper
217    /// around the private `extend_to_byte`, for callers that need to query
218    /// a specific byte position.
219    pub fn extend_to_byte_for_query(&mut self, src: &dyn Source, byte: usize) {
220        self.extend_to_byte(src, byte);
221    }
222
223    /// Find the physical-line index containing the byte at position `byte`.
224    /// Returns `None` if `byte` is before `start_byte` or at/after
225    /// `scanned_through`.
226    pub fn line_at_byte(&self, byte: usize) -> Option<usize> {
227        if byte < self.start_byte || byte >= self.scanned_through {
228            return None;
229        }
230        match self.starts.binary_search(&byte) {
231            Ok(idx) => Some(idx),
232            Err(0) => None,
233            Err(idx) => Some(idx - 1),
234        }
235    }
236
237    /// Byte range of line `n` (excluding the trailing newline).
238    /// Caller must ensure n < line_count() and the index has scanned through the line.
239    pub fn line_range(&self, n: usize, src: &dyn Source) -> Range<usize> {
240        let start = self.starts[n];
241        // When head_cap is set, line `cap-1` is the last visible line. Its end
242        // is the byte just before line `cap`'s start (a real line break exists
243        // there, otherwise the cap wouldn't have been reached). The "last line
244        // unbounded" branch below should not be entered for a capped line.
245        let next_known = self.starts.get(n + 1).copied();
246        let end = if let Some(next_start) = next_known {
247            // Drop the trailing newline preceding the next line start.
248            next_start - 1
249        } else {
250            // Last line: from start to current scanned end (minus trailing \n if present).
251            let total_scanned = src.len().min(self.scanned_through.max(start));
252            if total_scanned > start && src.bytes(total_scanned - 1..total_scanned)[0] == b'\n' {
253                total_scanned - 1
254            } else {
255                total_scanned
256            }
257        };
258        start..end
259    }
260
261    /// Number of records exposed by this index. Equals `line_count()` when
262    /// records mode is inactive. May be less than `line_count()` when
263    /// records mode is active and records span multiple physical lines.
264    pub fn record_count(&self) -> usize {
265        let raw = if self.scanned_through == self.start_byte && self.record_starts.len() == 1
266            && self.record_start_regex.is_none()
267        {
268            // Empty source, no records mode: zero records.
269            0
270        } else if self.scanned_through == self.start_byte && self.record_starts.len() == 1
271            && self.record_start_regex.is_some() && !self.record_zero_committed
272        {
273            // Empty source, records mode set but nothing scanned yet: zero records.
274            0
275        } else {
276            self.record_starts.len()
277        };
278        match self.head_cap {
279            Some(0) => 0,
280            Some(cap) => {
281                let visible_lines = raw.min(self.starts.len()).min(cap);
282                self.line_to_record_inner(visible_lines.saturating_sub(1))
283                    .map(|r| r + 1)
284                    .unwrap_or(0)
285            }
286            None => raw,
287        }
288    }
289
290    /// Byte range of record `n` including embedded `\n`s. Excludes the
291    /// trailing `\n` after the record's last physical line (if any).
292    pub fn record_range(&self, n: usize, src: &dyn Source) -> Range<usize> {
293        let start = self.record_starts[n];
294        let end = if n + 1 < self.record_starts.len() {
295            // End is just before the next record-start; that byte is a `\n`.
296            self.record_starts[n + 1] - 1
297        } else {
298            // Last record: extend to scanned_through, trimming trailing `\n`.
299            let total_scanned = src.len().min(self.scanned_through.max(start));
300            if total_scanned > start && src.bytes(total_scanned - 1..total_scanned)[0] == b'\n' {
301                total_scanned - 1
302            } else {
303                total_scanned
304            }
305        };
306        start..end
307    }
308
309    /// Range of physical line indices `[first..last)` covered by record `n`.
310    pub fn record_line_range(&self, n: usize) -> Range<usize> {
311        let first_line = self.starts.binary_search(&self.record_starts[n])
312            .expect("record start is always a line start");
313        let last_line = if n + 1 < self.record_starts.len() {
314            self.starts.binary_search(&self.record_starts[n + 1])
315                .expect("record start is always a line start")
316        } else {
317            self.starts.len()
318        };
319        first_line..last_line
320    }
321
322    /// Record index that contains physical line `line_n`. O(log records).
323    pub fn line_to_record(&self, line_n: usize) -> usize {
324        self.line_to_record_inner(line_n).unwrap_or(0)
325    }
326
327    fn line_to_record_inner(&self, line_n: usize) -> Option<usize> {
328        if self.starts.len() <= line_n {
329            return None;
330        }
331        let line_start = self.starts[line_n];
332        match self.record_starts.binary_search(&line_start) {
333            Ok(idx) => Some(idx),
334            Err(0) => Some(0),
335            Err(idx) => Some(idx - 1),
336        }
337    }
338
339    /// Contiguous byte slice covering record `n`. Embedded `\n`s present.
340    pub fn record_bytes<'a>(&self, n: usize, src: &'a dyn Source) -> std::borrow::Cow<'a, [u8]> {
341        let r = self.record_range(n, src);
342        src.bytes(r)
343    }
344
345    /// Return the bytes of line `n` with SGR/CSI/OSC sequences stripped.
346    /// Borrows the source's bytes when no escape sequences are present
347    /// (common case); owns a new buffer otherwise.
348    pub fn line_bytes_stripped<'a>(
349        &self,
350        n: usize,
351        src: &'a dyn Source,
352    ) -> std::borrow::Cow<'a, [u8]> {
353        let range = self.line_range(n, src);
354        let raw = src.bytes(range);
355        crate::ansi::strip_sgr(&raw).into_owned().into()
356    }
357
358    /// Like `line_bytes_stripped` but for records (multi-line mode).
359    pub fn record_bytes_stripped<'a>(
360        &self,
361        n: usize,
362        src: &'a dyn Source,
363    ) -> std::borrow::Cow<'a, [u8]> {
364        let range = self.record_range(n, src);
365        let raw = src.bytes(range);
366        crate::ansi::strip_sgr(&raw).into_owned().into()
367    }
368}
369
370#[cfg(test)]
371mod tests {
372    use super::*;
373    use crate::source::MockSource;
374    use regex::bytes::Regex;
375
376    #[test]
377    fn empty_source_zero_lines() {
378        let m = MockSource::new();
379        let mut idx = LineIndex::new();
380        idx.extend_to_end(&m);
381        assert_eq!(idx.line_count(), 0);
382    }
383
384    #[test]
385    fn single_line_no_newline() {
386        let m = MockSource::new();
387        m.append(b"hello");
388        let mut idx = LineIndex::new();
389        idx.extend_to_end(&m);
390        assert_eq!(idx.line_count(), 1);
391        assert_eq!(idx.line_range(0, &m), 0..5);
392    }
393
394    #[test]
395    fn single_line_trailing_newline() {
396        let m = MockSource::new();
397        m.append(b"hello\n");
398        let mut idx = LineIndex::new();
399        idx.extend_to_end(&m);
400        assert_eq!(idx.line_count(), 1);
401        assert_eq!(idx.line_range(0, &m), 0..5);
402    }
403
404    #[test]
405    fn multiple_lines() {
406        let m = MockSource::new();
407        m.append(b"a\nbb\nccc\n");
408        let mut idx = LineIndex::new();
409        idx.extend_to_end(&m);
410        assert_eq!(idx.line_count(), 3);
411        assert_eq!(idx.line_range(0, &m), 0..1);
412        assert_eq!(idx.line_range(1, &m), 2..4);
413        assert_eq!(idx.line_range(2, &m), 5..8);
414    }
415
416    #[test]
417    fn head_cap_truncates_line_count() {
418        let m = MockSource::new();
419        m.append(b"1\n2\n3\n4\n5\n6\n7\n8\n");  // 8 lines
420        let mut idx = LineIndex::new();
421        idx.set_head_cap(3);
422        idx.extend_to_end(&m);
423        assert_eq!(idx.line_count(), 3, "should be capped to 3 lines");
424        // Lines 0..2 inclusive must have correct ranges.
425        assert_eq!(idx.line_range(0, &m), 0..1);
426        assert_eq!(idx.line_range(1, &m), 2..3);
427        assert_eq!(idx.line_range(2, &m), 4..5);
428    }
429
430    #[test]
431    fn head_cap_extend_to_line_terminates() {
432        // Regression: extend_to_line(n) used to spin forever when head_cap
433        // had already been hit, because extend_to_byte returned without
434        // advancing scanned_through.
435        let m = MockSource::new();
436        m.append(b"1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n");
437        let mut idx = LineIndex::new();
438        idx.set_head_cap(3);
439        idx.extend_to_line(20, &m);  // far past the cap
440        assert_eq!(idx.line_count(), 3);
441    }
442
443    #[test]
444    fn head_cap_zero_yields_empty() {
445        let m = MockSource::new();
446        m.append(b"1\n2\n3\n");
447        let mut idx = LineIndex::new();
448        idx.set_head_cap(0);
449        idx.extend_to_end(&m);
450        assert_eq!(idx.line_count(), 0);
451    }
452
453    #[test]
454    fn start_byte_skips_head_of_source() {
455        let m = MockSource::new();
456        // 5 lines: alpha\nbeta\ngamma\ndelta\nepsilon\n
457        m.append(b"alpha\nbeta\ngamma\ndelta\nepsilon\n");
458        // gamma starts at byte 11.
459        let mut idx = LineIndex::new_starting_at(11);
460        idx.extend_to_end(&m);
461        assert_eq!(idx.line_count(), 3, "from byte 11 there are 3 lines: gamma, delta, epsilon");
462        assert_eq!(idx.line_range(0, &m), 11..16); // gamma
463        assert_eq!(idx.line_range(1, &m), 17..22); // delta
464        assert_eq!(idx.line_range(2, &m), 23..30); // epsilon
465    }
466
467    #[test]
468    fn start_byte_with_empty_remainder() {
469        let m = MockSource::new();
470        m.append(b"alpha\n");
471        let mut idx = LineIndex::new_starting_at(6);
472        idx.extend_to_end(&m);
473        assert_eq!(idx.line_count(), 0);
474    }
475
476    #[test]
477    fn incremental_growth_via_notice_new_bytes() {
478        let m = MockSource::new();
479        let mut idx = LineIndex::new();
480
481        m.append(b"alpha\n");
482        idx.notice_new_bytes(&m);
483        assert_eq!(idx.line_count(), 1);
484
485        m.append(b"beta\ngamm");
486        idx.notice_new_bytes(&m);
487        assert_eq!(idx.line_count(), 3); // alpha, beta, gamm (partial, but counted)
488
489        m.append(b"a\n");
490        idx.notice_new_bytes(&m);
491        assert_eq!(idx.line_count(), 3);
492        // "alpha\n" = bytes 0-5, "beta\n" = bytes 6-10, "gamma" = bytes 11-15
493        assert_eq!(idx.line_range(2, &m), 11..16); // "gamma"
494    }
495
496    fn re(pat: &str) -> Regex {
497        Regex::new(pat).unwrap()
498    }
499
500    #[test]
501    fn records_mirror_lines_when_no_regex() {
502        let m = MockSource::new();
503        m.append(b"a\nb\nc\n");
504        let mut idx = LineIndex::new();
505        idx.extend_to_end(&m);
506        assert_eq!(idx.line_count(), 3);
507        assert_eq!(idx.record_count(), 3);
508        for i in 0..3 {
509            assert_eq!(idx.record_range(i, &m), idx.line_range(i, &m));
510        }
511    }
512
513    #[test]
514    fn record_count_zero_for_empty_source_records_mode() {
515        let m = MockSource::new();
516        let mut idx = LineIndex::new();
517        idx.set_record_start(re(r"^\["));
518        idx.extend_to_end(&m);
519        assert_eq!(idx.record_count(), 0);
520    }
521
522    #[test]
523    fn records_group_continuations() {
524        let m = MockSource::new();
525        m.append(b"[1] head\n  more\n  more\n[2] head\n  more\n");
526        let mut idx = LineIndex::new();
527        idx.set_record_start(re(r"^\["));
528        idx.extend_to_end(&m);
529        assert_eq!(idx.line_count(), 5);
530        assert_eq!(idx.record_count(), 2);
531        let r0 = idx.record_range(0, &m);
532        assert_eq!(&m.bytes(r0)[..], b"[1] head\n  more\n  more");
533        let r1 = idx.record_range(1, &m);
534        assert_eq!(&m.bytes(r1)[..3], b"[2]");
535    }
536
537    #[test]
538    fn synthetic_record_zero_absorbs_orphan_head() {
539        let m = MockSource::new();
540        m.append(b"banner line 1\nbanner line 2\n[1] first real record\n");
541        let mut idx = LineIndex::new();
542        idx.set_record_start(re(r"^\["));
543        idx.extend_to_end(&m);
544        assert_eq!(idx.line_count(), 3);
545        assert_eq!(idx.record_count(), 2);
546        let r0 = idx.record_range(0, &m);
547        assert_eq!(&m.bytes(r0)[..], b"banner line 1\nbanner line 2");
548        assert_eq!(idx.record_line_range(0), 0..2);
549        assert_eq!(idx.record_line_range(1), 2..3);
550    }
551
552    #[test]
553    fn line_to_record_round_trips() {
554        let m = MockSource::new();
555        m.append(b"[1] a\n  cont\n[2] b\n  cont\n  cont\n");
556        let mut idx = LineIndex::new();
557        idx.set_record_start(re(r"^\["));
558        idx.extend_to_end(&m);
559        assert_eq!(idx.line_to_record(0), 0);  // "[1] a"
560        assert_eq!(idx.line_to_record(1), 0);  // "  cont"
561        assert_eq!(idx.line_to_record(2), 1);  // "[2] b"
562        assert_eq!(idx.line_to_record(3), 1);
563        assert_eq!(idx.line_to_record(4), 1);
564    }
565
566    #[test]
567    fn record_bytes_contains_embedded_newlines() {
568        let m = MockSource::new();
569        m.append(b"[1] head\n  more\n[2] next\n");
570        let mut idx = LineIndex::new();
571        idx.set_record_start(re(r"^\["));
572        idx.extend_to_end(&m);
573        let bytes = idx.record_bytes(0, &m);
574        assert_eq!(&*bytes, b"[1] head\n  more");
575    }
576
577    #[test]
578    fn no_match_at_all_is_one_synthetic_record() {
579        let m = MockSource::new();
580        m.append(b"plain text\nmore plain\nno brackets here\n");
581        let mut idx = LineIndex::new();
582        idx.set_record_start(re(r"^\["));
583        idx.extend_to_end(&m);
584        assert_eq!(idx.line_count(), 3);
585        assert_eq!(idx.record_count(), 1);
586        assert_eq!(idx.record_line_range(0), 0..3);
587    }
588
589    #[test]
590    fn pending_record_start_handles_growing_input() {
591        let m = MockSource::new();
592        let mut idx = LineIndex::new();
593        idx.set_record_start(re(r"^\["));
594        m.append(b"[1] head\n  more\n");
595        idx.notice_new_bytes(&m);
596        assert_eq!(idx.record_count(), 1);
597        m.append(b"[2] head\n");
598        idx.notice_new_bytes(&m);
599        assert_eq!(idx.record_count(), 2);
600    }
601
602    #[test]
603    fn empty_continuation_lines_are_continuations() {
604        let m = MockSource::new();
605        m.append(b"[1] head\n\n  after blank\n[2] next\n");
606        let mut idx = LineIndex::new();
607        idx.set_record_start(re(r"^\["));
608        idx.extend_to_end(&m);
609        assert_eq!(idx.line_count(), 4);
610        assert_eq!(idx.record_count(), 2);
611        assert_eq!(idx.record_line_range(0), 0..3);
612    }
613
614    #[test]
615    fn line_bytes_stripped_returns_visible_text() {
616        let m = MockSource::new();
617        m.append(b"\x1b[31merror\x1b[0m\n");
618        let mut idx = LineIndex::new();
619        idx.extend_to_end(&m);
620        let stripped = idx.line_bytes_stripped(0, &m);
621        assert_eq!(stripped.as_ref(), b"error");
622    }
623
624    #[test]
625    fn line_bytes_stripped_plain_input() {
626        let m = MockSource::new();
627        m.append(b"plain\n");
628        let mut idx = LineIndex::new();
629        idx.extend_to_end(&m);
630        let stripped = idx.line_bytes_stripped(0, &m);
631        assert_eq!(stripped.as_ref(), b"plain");
632    }
633
634    #[test]
635    fn records_mode_reports_true_only_when_regex_set() {
636        let mut idx = LineIndex::new();
637        assert!(!idx.records_mode());
638        idx.set_record_start(re(r"^\["));
639        assert!(idx.records_mode());
640    }
641
642    #[test]
643    fn record_range_handles_unterminated_last_record() {
644        let m = MockSource::new();
645        m.append(b"[1] head\n[2] last line no newline");
646        let mut idx = LineIndex::new();
647        idx.set_record_start(re(r"^\["));
648        idx.extend_to_end(&m);
649        assert_eq!(idx.record_count(), 2);
650        let r1 = idx.record_range(1, &m);
651        assert_eq!(&m.bytes(r1)[..], b"[2] last line no newline");
652    }
653
654    #[test]
655    fn record_count_with_head_cap_zero_returns_zero_in_records_mode() {
656        let m = MockSource::new();
657        m.append(b"[1] head\n[2] next\n");
658        let mut idx = LineIndex::new();
659        idx.set_record_start(re(r"^\["));
660        idx.set_head_cap(0);
661        idx.extend_to_end(&m);
662        assert_eq!(idx.line_count(), 0);
663        assert_eq!(idx.record_count(), 0);
664    }
665}