grep_searcher/
lines.rs

1/*!
2A collection of routines for performing operations on lines.
3*/
4
5use {
6    bstr::ByteSlice,
7    grep_matcher::{LineTerminator, Match},
8};
9
10/// An iterator over lines in a particular slice of bytes.
11///
12/// Line terminators are considered part of the line they terminate. All lines
13/// yielded by the iterator are guaranteed to be non-empty.
14///
15/// `'b` refers to the lifetime of the underlying bytes.
16#[derive(Debug)]
17pub struct LineIter<'b> {
18    bytes: &'b [u8],
19    stepper: LineStep,
20}
21
22impl<'b> LineIter<'b> {
23    /// Create a new line iterator that yields lines in the given bytes that
24    /// are terminated by `line_term`.
25    pub fn new(line_term: u8, bytes: &'b [u8]) -> LineIter<'b> {
26        let stepper = LineStep::new(line_term, 0, bytes.len());
27        LineIter { bytes, stepper }
28    }
29}
30
31impl<'b> Iterator for LineIter<'b> {
32    type Item = &'b [u8];
33
34    fn next(&mut self) -> Option<&'b [u8]> {
35        self.stepper.next_match(self.bytes).map(|m| &self.bytes[m])
36    }
37}
38
39/// An explicit iterator over lines in a particular slice of bytes.
40///
41/// This iterator avoids borrowing the bytes themselves, and instead requires
42/// callers to explicitly provide the bytes when moving through the iterator.
43/// While not idiomatic, this provides a simple way of iterating over lines
44/// that doesn't require borrowing the slice itself, which can be convenient.
45///
46/// Line terminators are considered part of the line they terminate. All lines
47/// yielded by the iterator are guaranteed to be non-empty.
48#[derive(Debug)]
49pub struct LineStep {
50    line_term: u8,
51    pos: usize,
52    end: usize,
53}
54
55impl LineStep {
56    /// Create a new line iterator over the given range of bytes using the
57    /// given line terminator.
58    ///
59    /// Callers should provide the actual bytes for each call to `next`. The
60    /// same slice must be provided to each call.
61    ///
62    /// This panics if `start` is not less than or equal to `end`.
63    pub fn new(line_term: u8, start: usize, end: usize) -> LineStep {
64        LineStep { line_term, pos: start, end }
65    }
66
67    /// Return the start and end position of the next line in the given bytes.
68    ///
69    /// The caller must past exactly the same slice of bytes for each call to
70    /// `next`.
71    ///
72    /// The range returned includes the line terminator. Ranges are always
73    /// non-empty.
74    pub fn next(&mut self, bytes: &[u8]) -> Option<(usize, usize)> {
75        self.next_impl(bytes)
76    }
77
78    /// Like next, but returns a `Match` instead of a tuple.
79    #[inline(always)]
80    pub(crate) fn next_match(&mut self, bytes: &[u8]) -> Option<Match> {
81        self.next_impl(bytes).map(|(s, e)| Match::new(s, e))
82    }
83
84    #[inline(always)]
85    fn next_impl(&mut self, mut bytes: &[u8]) -> Option<(usize, usize)> {
86        bytes = &bytes[..self.end];
87        match bytes[self.pos..].find_byte(self.line_term) {
88            None => {
89                if self.pos < bytes.len() {
90                    let m = (self.pos, bytes.len());
91                    assert!(m.0 <= m.1);
92
93                    self.pos = m.1;
94                    Some(m)
95                } else {
96                    None
97                }
98            }
99            Some(line_end) => {
100                let m = (self.pos, self.pos + line_end + 1);
101                assert!(m.0 <= m.1);
102
103                self.pos = m.1;
104                Some(m)
105            }
106        }
107    }
108}
109
110/// Count the number of occurrences of `line_term` in `bytes`.
111pub(crate) fn count(bytes: &[u8], line_term: u8) -> u64 {
112    memchr::memchr_iter(line_term, bytes).count() as u64
113}
114
115/// Given a line that possibly ends with a terminator, return that line without
116/// the terminator.
117#[inline(always)]
118pub(crate) fn without_terminator(
119    bytes: &[u8],
120    line_term: LineTerminator,
121) -> &[u8] {
122    let line_term = line_term.as_bytes();
123    let start = bytes.len().saturating_sub(line_term.len());
124    if bytes.get(start..) == Some(line_term) {
125        return &bytes[..bytes.len() - line_term.len()];
126    }
127    bytes
128}
129
130/// Return the start and end offsets of the lines containing the given range
131/// of bytes.
132///
133/// Line terminators are considered part of the line they terminate.
134#[inline(always)]
135pub(crate) fn locate(bytes: &[u8], line_term: u8, range: Match) -> Match {
136    let line_start =
137        bytes[..range.start()].rfind_byte(line_term).map_or(0, |i| i + 1);
138    let line_end =
139        if range.end() > line_start && bytes[range.end() - 1] == line_term {
140            range.end()
141        } else {
142            bytes[range.end()..]
143                .find_byte(line_term)
144                .map_or(bytes.len(), |i| range.end() + i + 1)
145        };
146    Match::new(line_start, line_end)
147}
148
149/// Returns the minimal starting offset of the line that occurs `count` lines
150/// before the last line in `bytes`.
151///
152/// Lines are terminated by `line_term`. If `count` is zero, then this returns
153/// the starting offset of the last line in `bytes`.
154///
155/// If `bytes` ends with a line terminator, then the terminator itself is
156/// considered part of the last line.
157pub(crate) fn preceding(bytes: &[u8], line_term: u8, count: usize) -> usize {
158    preceding_by_pos(bytes, bytes.len(), line_term, count)
159}
160
161/// Returns the minimal starting offset of the line that occurs `count` lines
162/// before the line containing `pos`. Lines are terminated by `line_term`.
163/// If `count` is zero, then this returns the starting offset of the line
164/// containing `pos`.
165///
166/// If `pos` points just past a line terminator, then it is considered part of
167/// the line that it terminates. For example, given `bytes = b"abc\nxyz\n"`
168/// and `pos = 7`, `preceding(bytes, pos, b'\n', 0)` returns `4` (as does `pos
169/// = 8`) and `preceding(bytes, pos, `b'\n', 1)` returns `0`.
170fn preceding_by_pos(
171    bytes: &[u8],
172    mut pos: usize,
173    line_term: u8,
174    mut count: usize,
175) -> usize {
176    if pos == 0 {
177        return 0;
178    } else if bytes[pos - 1] == line_term {
179        pos -= 1;
180    }
181    loop {
182        match bytes[..pos].rfind_byte(line_term) {
183            None => {
184                return 0;
185            }
186            Some(i) => {
187                if count == 0 {
188                    return i + 1;
189                } else if i == 0 {
190                    return 0;
191                }
192                count -= 1;
193                pos = i;
194            }
195        }
196    }
197}
198
199#[cfg(test)]
200mod tests {
201    use super::*;
202
203    const SHERLOCK: &'static str = "\
204For the Doctor Watsons of this world, as opposed to the Sherlock
205Holmeses, success in the province of detective work must always
206be, to a very large extent, the result of luck. Sherlock Holmes
207can extract a clew from a wisp of straw or a flake of cigar ash;
208but Doctor Watson has to have it taken out for him and dusted,
209and exhibited clearly, with a label attached.\
210";
211
212    fn m(start: usize, end: usize) -> Match {
213        Match::new(start, end)
214    }
215
216    fn lines(text: &str) -> Vec<&str> {
217        let mut results = vec![];
218        let mut it = LineStep::new(b'\n', 0, text.len());
219        while let Some(m) = it.next_match(text.as_bytes()) {
220            results.push(&text[m]);
221        }
222        results
223    }
224
225    fn line_ranges(text: &str) -> Vec<std::ops::Range<usize>> {
226        let mut results = vec![];
227        let mut it = LineStep::new(b'\n', 0, text.len());
228        while let Some(m) = it.next_match(text.as_bytes()) {
229            results.push(m.start()..m.end());
230        }
231        results
232    }
233
234    fn prev(text: &str, pos: usize, count: usize) -> usize {
235        preceding_by_pos(text.as_bytes(), pos, b'\n', count)
236    }
237
238    fn loc(text: &str, start: usize, end: usize) -> Match {
239        locate(text.as_bytes(), b'\n', Match::new(start, end))
240    }
241
242    #[test]
243    fn line_count() {
244        assert_eq!(0, count(b"", b'\n'));
245        assert_eq!(1, count(b"\n", b'\n'));
246        assert_eq!(2, count(b"\n\n", b'\n'));
247        assert_eq!(2, count(b"a\nb\nc", b'\n'));
248    }
249
250    #[test]
251    fn line_locate() {
252        let t = SHERLOCK;
253        let lines = line_ranges(t);
254
255        assert_eq!(
256            loc(t, lines[0].start, lines[0].end),
257            m(lines[0].start, lines[0].end)
258        );
259        assert_eq!(
260            loc(t, lines[0].start + 1, lines[0].end),
261            m(lines[0].start, lines[0].end)
262        );
263        assert_eq!(
264            loc(t, lines[0].end - 1, lines[0].end),
265            m(lines[0].start, lines[0].end)
266        );
267        assert_eq!(
268            loc(t, lines[0].end, lines[0].end),
269            m(lines[1].start, lines[1].end)
270        );
271
272        assert_eq!(
273            loc(t, lines[5].start, lines[5].end),
274            m(lines[5].start, lines[5].end)
275        );
276        assert_eq!(
277            loc(t, lines[5].start + 1, lines[5].end),
278            m(lines[5].start, lines[5].end)
279        );
280        assert_eq!(
281            loc(t, lines[5].end - 1, lines[5].end),
282            m(lines[5].start, lines[5].end)
283        );
284        assert_eq!(
285            loc(t, lines[5].end, lines[5].end),
286            m(lines[5].start, lines[5].end)
287        );
288    }
289
290    #[test]
291    fn line_locate_weird() {
292        assert_eq!(loc("", 0, 0), m(0, 0));
293
294        assert_eq!(loc("\n", 0, 1), m(0, 1));
295        assert_eq!(loc("\n", 1, 1), m(1, 1));
296
297        assert_eq!(loc("\n\n", 0, 0), m(0, 1));
298        assert_eq!(loc("\n\n", 0, 1), m(0, 1));
299        assert_eq!(loc("\n\n", 1, 1), m(1, 2));
300        assert_eq!(loc("\n\n", 1, 2), m(1, 2));
301        assert_eq!(loc("\n\n", 2, 2), m(2, 2));
302
303        assert_eq!(loc("a\nb\nc", 0, 1), m(0, 2));
304        assert_eq!(loc("a\nb\nc", 1, 2), m(0, 2));
305        assert_eq!(loc("a\nb\nc", 2, 3), m(2, 4));
306        assert_eq!(loc("a\nb\nc", 3, 4), m(2, 4));
307        assert_eq!(loc("a\nb\nc", 4, 5), m(4, 5));
308        assert_eq!(loc("a\nb\nc", 5, 5), m(4, 5));
309    }
310
311    #[test]
312    fn line_iter() {
313        assert_eq!(lines("abc"), vec!["abc"]);
314
315        assert_eq!(lines("abc\n"), vec!["abc\n"]);
316        assert_eq!(lines("abc\nxyz"), vec!["abc\n", "xyz"]);
317        assert_eq!(lines("abc\nxyz\n"), vec!["abc\n", "xyz\n"]);
318
319        assert_eq!(lines("abc\n\n"), vec!["abc\n", "\n"]);
320        assert_eq!(lines("abc\n\n\n"), vec!["abc\n", "\n", "\n"]);
321        assert_eq!(lines("abc\n\nxyz"), vec!["abc\n", "\n", "xyz"]);
322        assert_eq!(lines("abc\n\nxyz\n"), vec!["abc\n", "\n", "xyz\n"]);
323        assert_eq!(lines("abc\nxyz\n\n"), vec!["abc\n", "xyz\n", "\n"]);
324
325        assert_eq!(lines("\n"), vec!["\n"]);
326        assert_eq!(lines(""), Vec::<&str>::new());
327    }
328
329    #[test]
330    fn line_iter_empty() {
331        let mut it = LineStep::new(b'\n', 0, 0);
332        assert_eq!(it.next(b"abc"), None);
333    }
334
335    #[test]
336    fn preceding_lines_doc() {
337        // These are the examples mentions in the documentation of `preceding`.
338        let bytes = b"abc\nxyz\n";
339        assert_eq!(4, preceding_by_pos(bytes, 7, b'\n', 0));
340        assert_eq!(4, preceding_by_pos(bytes, 8, b'\n', 0));
341        assert_eq!(0, preceding_by_pos(bytes, 7, b'\n', 1));
342        assert_eq!(0, preceding_by_pos(bytes, 8, b'\n', 1));
343    }
344
345    #[test]
346    fn preceding_lines_sherlock() {
347        let t = SHERLOCK;
348        let lines = line_ranges(t);
349
350        // The following tests check the count == 0 case, i.e., finding the
351        // beginning of the line containing the given position.
352        assert_eq!(0, prev(t, 0, 0));
353        assert_eq!(0, prev(t, 1, 0));
354        // The line terminator is addressed by `end-1` and terminates the line
355        // it is part of.
356        assert_eq!(0, prev(t, lines[0].end - 1, 0));
357        assert_eq!(lines[0].start, prev(t, lines[0].end, 0));
358        // The end position of line addresses the byte immediately following a
359        // line terminator, which puts it on the following line.
360        assert_eq!(lines[1].start, prev(t, lines[0].end + 1, 0));
361
362        // Now tests for count > 0.
363        assert_eq!(0, prev(t, 0, 1));
364        assert_eq!(0, prev(t, 0, 2));
365        assert_eq!(0, prev(t, 1, 1));
366        assert_eq!(0, prev(t, 1, 2));
367        assert_eq!(0, prev(t, lines[0].end - 1, 1));
368        assert_eq!(0, prev(t, lines[0].end - 1, 2));
369        assert_eq!(0, prev(t, lines[0].end, 1));
370        assert_eq!(0, prev(t, lines[0].end, 2));
371        assert_eq!(lines[3].start, prev(t, lines[4].end - 1, 1));
372        assert_eq!(lines[3].start, prev(t, lines[4].end, 1));
373        assert_eq!(lines[4].start, prev(t, lines[4].end + 1, 1));
374
375        // The last line has no line terminator.
376        assert_eq!(lines[5].start, prev(t, lines[5].end, 0));
377        assert_eq!(lines[5].start, prev(t, lines[5].end - 1, 0));
378        assert_eq!(lines[4].start, prev(t, lines[5].end, 1));
379        assert_eq!(lines[0].start, prev(t, lines[5].end, 5));
380    }
381
382    #[test]
383    fn preceding_lines_short() {
384        let t = "a\nb\nc\nd\ne\nf\n";
385        let lines = line_ranges(t);
386        assert_eq!(12, t.len());
387
388        assert_eq!(lines[5].start, prev(t, lines[5].end, 0));
389        assert_eq!(lines[4].start, prev(t, lines[5].end, 1));
390        assert_eq!(lines[3].start, prev(t, lines[5].end, 2));
391        assert_eq!(lines[2].start, prev(t, lines[5].end, 3));
392        assert_eq!(lines[1].start, prev(t, lines[5].end, 4));
393        assert_eq!(lines[0].start, prev(t, lines[5].end, 5));
394        assert_eq!(lines[0].start, prev(t, lines[5].end, 6));
395
396        assert_eq!(lines[5].start, prev(t, lines[5].end - 1, 0));
397        assert_eq!(lines[4].start, prev(t, lines[5].end - 1, 1));
398        assert_eq!(lines[3].start, prev(t, lines[5].end - 1, 2));
399        assert_eq!(lines[2].start, prev(t, lines[5].end - 1, 3));
400        assert_eq!(lines[1].start, prev(t, lines[5].end - 1, 4));
401        assert_eq!(lines[0].start, prev(t, lines[5].end - 1, 5));
402        assert_eq!(lines[0].start, prev(t, lines[5].end - 1, 6));
403
404        assert_eq!(lines[4].start, prev(t, lines[5].start, 0));
405        assert_eq!(lines[3].start, prev(t, lines[5].start, 1));
406        assert_eq!(lines[2].start, prev(t, lines[5].start, 2));
407        assert_eq!(lines[1].start, prev(t, lines[5].start, 3));
408        assert_eq!(lines[0].start, prev(t, lines[5].start, 4));
409        assert_eq!(lines[0].start, prev(t, lines[5].start, 5));
410
411        assert_eq!(lines[3].start, prev(t, lines[4].end - 1, 1));
412        assert_eq!(lines[2].start, prev(t, lines[4].start, 1));
413
414        assert_eq!(lines[2].start, prev(t, lines[3].end - 1, 1));
415        assert_eq!(lines[1].start, prev(t, lines[3].start, 1));
416
417        assert_eq!(lines[1].start, prev(t, lines[2].end - 1, 1));
418        assert_eq!(lines[0].start, prev(t, lines[2].start, 1));
419
420        assert_eq!(lines[0].start, prev(t, lines[1].end - 1, 1));
421        assert_eq!(lines[0].start, prev(t, lines[1].start, 1));
422
423        assert_eq!(lines[0].start, prev(t, lines[0].end - 1, 1));
424        assert_eq!(lines[0].start, prev(t, lines[0].start, 1));
425    }
426
427    #[test]
428    fn preceding_lines_empty1() {
429        let t = "\n\n\nd\ne\nf\n";
430        let lines = line_ranges(t);
431        assert_eq!(9, t.len());
432
433        assert_eq!(lines[0].start, prev(t, lines[0].end, 0));
434        assert_eq!(lines[0].start, prev(t, lines[0].end, 1));
435        assert_eq!(lines[1].start, prev(t, lines[1].end, 0));
436        assert_eq!(lines[0].start, prev(t, lines[1].end, 1));
437
438        assert_eq!(lines[5].start, prev(t, lines[5].end, 0));
439        assert_eq!(lines[4].start, prev(t, lines[5].end, 1));
440        assert_eq!(lines[3].start, prev(t, lines[5].end, 2));
441        assert_eq!(lines[2].start, prev(t, lines[5].end, 3));
442        assert_eq!(lines[1].start, prev(t, lines[5].end, 4));
443        assert_eq!(lines[0].start, prev(t, lines[5].end, 5));
444        assert_eq!(lines[0].start, prev(t, lines[5].end, 6));
445    }
446
447    #[test]
448    fn preceding_lines_empty2() {
449        let t = "a\n\n\nd\ne\nf\n";
450        let lines = line_ranges(t);
451        assert_eq!(10, t.len());
452
453        assert_eq!(lines[0].start, prev(t, lines[0].end, 0));
454        assert_eq!(lines[0].start, prev(t, lines[0].end, 1));
455        assert_eq!(lines[1].start, prev(t, lines[1].end, 0));
456        assert_eq!(lines[0].start, prev(t, lines[1].end, 1));
457
458        assert_eq!(lines[5].start, prev(t, lines[5].end, 0));
459        assert_eq!(lines[4].start, prev(t, lines[5].end, 1));
460        assert_eq!(lines[3].start, prev(t, lines[5].end, 2));
461        assert_eq!(lines[2].start, prev(t, lines[5].end, 3));
462        assert_eq!(lines[1].start, prev(t, lines[5].end, 4));
463        assert_eq!(lines[0].start, prev(t, lines[5].end, 5));
464        assert_eq!(lines[0].start, prev(t, lines[5].end, 6));
465    }
466}