Skip to main content

fff_grep/
lines.rs

1/*!
2A collection of routines for performing operations on lines.
3*/
4
5use {
6    bstr::ByteSlice,
7    grep_matcher::{LineTerminator, Match},
8};
9
/// An explicit iterator over lines in a particular slice of bytes.
///
/// This iterator avoids borrowing the bytes themselves, and instead requires
/// callers to explicitly provide the bytes when moving through the iterator.
/// Only the line terminator and a pair of offsets are stored.
///
/// Line terminators are considered part of the line they terminate. All lines
/// yielded by the iterator are guaranteed to be non-empty.
#[derive(Debug)]
pub struct LineStep {
    /// The single byte that marks the end of a line.
    line_term: u8,
    /// Offset at which the next line begins. It is moved forward to the end
    /// of each line as that line is yielded.
    pos: usize,
    /// Exclusive upper bound of the region being stepped through. The bytes
    /// handed to the iterator are truncated to this length before searching.
    end: usize,
}
23
24impl LineStep {
25    /// Create a new line iterator over the given range of bytes using the
26    /// given line terminator.
27    pub fn new(line_term: u8, start: usize, end: usize) -> LineStep {
28        LineStep {
29            line_term,
30            pos: start,
31            end,
32        }
33    }
34
35    /// Like next, but returns a `Match` instead of a tuple.
36    #[inline(always)]
37    pub fn next_match(&mut self, bytes: &[u8]) -> Option<Match> {
38        self.next_impl(bytes).map(|(s, e)| Match::new(s, e))
39    }
40
41    #[inline(always)]
42    fn next_impl(&mut self, mut bytes: &[u8]) -> Option<(usize, usize)> {
43        bytes = &bytes[..self.end];
44        match bytes[self.pos..].find_byte(self.line_term) {
45            None => {
46                if self.pos < bytes.len() {
47                    let m = (self.pos, bytes.len());
48                    assert!(m.0 <= m.1);
49
50                    self.pos = m.1;
51                    Some(m)
52                } else {
53                    None
54                }
55            }
56            Some(line_end) => {
57                let m = (self.pos, self.pos + line_end + 1);
58                assert!(m.0 <= m.1);
59
60                self.pos = m.1;
61                Some(m)
62            }
63        }
64    }
65}
66
67/// Count the number of occurrences of `line_term` in `bytes`.
68pub fn count(bytes: &[u8], line_term: u8) -> u64 {
69    memchr::memchr_iter(line_term, bytes).count() as u64
70}
71
72/// Given a line that possibly ends with a terminator, return that line without
73/// the terminator.
74#[inline(always)]
75pub fn without_terminator(bytes: &[u8], line_term: LineTerminator) -> &[u8] {
76    let line_term = line_term.as_bytes();
77    let start = bytes.len().saturating_sub(line_term.len());
78    if bytes.get(start..) == Some(line_term) {
79        return &bytes[..bytes.len() - line_term.len()];
80    }
81    bytes
82}
83
84/// Return the start and end offsets of the lines containing the given range
85/// of bytes.
86///
87/// Line terminators are considered part of the line they terminate.
88#[inline(always)]
89pub fn locate(bytes: &[u8], line_term: u8, range: Match) -> Match {
90    let line_start = bytes[..range.start()]
91        .rfind_byte(line_term)
92        .map_or(0, |i| i + 1);
93    let line_end = if range.end() > line_start && bytes[range.end() - 1] == line_term {
94        range.end()
95    } else {
96        bytes[range.end()..]
97            .find_byte(line_term)
98            .map_or(bytes.len(), |i| range.end() + i + 1)
99    };
100    Match::new(line_start, line_end)
101}
102
#[cfg(test)]
mod tests {
    use super::*;

    // Six lines of text; the last one has no trailing line terminator.
    const SHERLOCK: &str = "\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.\
";

    /// Shorthand for building a `Match`.
    fn m(start: usize, end: usize) -> Match {
        Match::new(start, end)
    }

    /// Step through all of `text` with `\n` as the terminator and collect the
    /// span of every line that is yielded.
    fn line_matches(text: &str) -> Vec<Match> {
        let mut stepper = LineStep::new(b'\n', 0, text.len());
        std::iter::from_fn(|| stepper.next_match(text.as_bytes())).collect()
    }

    /// The text of every line in `text`, terminators included.
    fn lines(text: &str) -> Vec<&str> {
        line_matches(text).into_iter().map(|span| &text[span]).collect()
    }

    /// The byte range of every line in `text`, terminators included.
    fn line_ranges(text: &str) -> Vec<std::ops::Range<usize>> {
        line_matches(text)
            .into_iter()
            .map(|span| span.start()..span.end())
            .collect()
    }

    /// Run `locate` over `text` for the range `start..end` using `\n`.
    fn loc(text: &str, start: usize, end: usize) -> Match {
        locate(text.as_bytes(), b'\n', Match::new(start, end))
    }

    #[test]
    fn line_count() {
        let cases: [(&str, u64); 4] =
            [("", 0), ("\n", 1), ("\n\n", 2), ("a\nb\nc", 2)];
        for (input, expected) in cases {
            assert_eq!(
                expected,
                count(input.as_bytes(), b'\n'),
                "input: {:?}",
                input
            );
        }
    }

    #[test]
    fn line_locate() {
        let t = SHERLOCK;
        let spans = line_ranges(t);
        let (first, second, last) = (&spans[0], &spans[1], &spans[5]);

        // Any range ending at the end of the first line maps to that line.
        for start in [first.start, first.start + 1, first.end - 1] {
            assert_eq!(loc(t, start, first.end), m(first.start, first.end));
        }
        // An empty range just past the terminator belongs to the next line.
        assert_eq!(
            loc(t, first.end, first.end),
            m(second.start, second.end)
        );

        // The last line has no terminator, so even the empty range at the
        // very end of the text still maps to it.
        for start in [last.start, last.start + 1, last.end - 1, last.end] {
            assert_eq!(loc(t, start, last.end), m(last.start, last.end));
        }
    }

    #[test]
    fn line_locate_weird() {
        // (text, input range, expected line span)
        let cases = [
            ("", (0, 0), (0, 0)),
            ("\n", (0, 1), (0, 1)),
            ("\n", (1, 1), (1, 1)),
            ("\n\n", (0, 0), (0, 1)),
            ("\n\n", (0, 1), (0, 1)),
            ("\n\n", (1, 1), (1, 2)),
            ("\n\n", (1, 2), (1, 2)),
            ("\n\n", (2, 2), (2, 2)),
            ("a\nb\nc", (0, 1), (0, 2)),
            ("a\nb\nc", (1, 2), (0, 2)),
            ("a\nb\nc", (2, 3), (2, 4)),
            ("a\nb\nc", (3, 4), (2, 4)),
            ("a\nb\nc", (4, 5), (4, 5)),
            ("a\nb\nc", (5, 5), (4, 5)),
        ];
        for (text, (start, end), (want_start, want_end)) in cases {
            assert_eq!(
                loc(text, start, end),
                m(want_start, want_end),
                "text: {:?}, range: {}..{}",
                text,
                start,
                end
            );
        }
    }

    #[test]
    fn line_iter() {
        let cases: Vec<(&str, Vec<&str>)> = vec![
            ("abc", vec!["abc"]),
            ("abc\n", vec!["abc\n"]),
            ("abc\nxyz", vec!["abc\n", "xyz"]),
            ("abc\nxyz\n", vec!["abc\n", "xyz\n"]),
            ("abc\n\n", vec!["abc\n", "\n"]),
            ("abc\n\n\n", vec!["abc\n", "\n", "\n"]),
            ("abc\n\nxyz", vec!["abc\n", "\n", "xyz"]),
            ("abc\n\nxyz\n", vec!["abc\n", "\n", "xyz\n"]),
            ("abc\nxyz\n\n", vec!["abc\n", "xyz\n", "\n"]),
            ("\n", vec!["\n"]),
            ("", vec![]),
        ];
        for (text, expected) in cases {
            assert_eq!(lines(text), expected, "input: {:?}", text);
        }
    }

    #[test]
    fn line_iter_empty() {
        // A zero-length range yields nothing even when bytes are supplied.
        let mut stepper = LineStep::new(b'\n', 0, 0);
        assert_eq!(stepper.next_match(b"abc"), None);
    }
}