Skip to main content

fff_grep/
lines.rs

1/*!
2A collection of routines for performing operations on lines.
3*/
4
5use bstr::ByteSlice;
6
7use crate::matcher::{LineTerminator, Match};
8
/// An explicit iterator over lines in a particular slice of bytes.
///
/// This iterator avoids borrowing the bytes themselves, and instead requires
/// callers to explicitly provide the bytes when moving through the iterator.
///
/// Line terminators are considered part of the line they terminate. All lines
/// yielded by the iterator are guaranteed to be non-empty.
#[derive(Debug)]
pub struct LineStep {
    /// The byte that marks the end of a line.
    line_term: u8,
    /// Offset at which the next yielded line begins. It is moved to the end
    /// of each line as that line is yielded.
    pos: usize,
    /// Exclusive upper bound of the range being stepped through; the provided
    /// bytes are truncated to this length before any searching happens.
    end: usize,
}
22
23impl LineStep {
24    /// Create a new line iterator over the given range of bytes using the
25    /// given line terminator.
26    pub fn new(line_term: u8, start: usize, end: usize) -> LineStep {
27        LineStep {
28            line_term,
29            pos: start,
30            end,
31        }
32    }
33
34    /// Like next, but returns a `Match` instead of a tuple.
35    #[inline(always)]
36    pub fn next_match(&mut self, bytes: &[u8]) -> Option<Match> {
37        self.next_impl(bytes).map(|(s, e)| Match::new(s, e))
38    }
39
40    #[inline(always)]
41    fn next_impl(&mut self, mut bytes: &[u8]) -> Option<(usize, usize)> {
42        bytes = &bytes[..self.end];
43        match bytes[self.pos..].find_byte(self.line_term) {
44            None => {
45                if self.pos < bytes.len() {
46                    let m = (self.pos, bytes.len());
47                    assert!(m.0 <= m.1);
48
49                    self.pos = m.1;
50                    Some(m)
51                } else {
52                    None
53                }
54            }
55            Some(line_end) => {
56                let m = (self.pos, self.pos + line_end + 1);
57                assert!(m.0 <= m.1);
58
59                self.pos = m.1;
60                Some(m)
61            }
62        }
63    }
64}
65
66/// Count the number of occurrences of `line_term` in `bytes`.
67pub fn count(bytes: &[u8], line_term: u8) -> u64 {
68    memchr::memchr_iter(line_term, bytes).count() as u64
69}
70
71/// Given a line that possibly ends with a terminator, return that line without the terminator.
72#[inline(always)]
73pub fn without_terminator(bytes: &[u8], line_term: LineTerminator) -> &[u8] {
74    let line_term = line_term.as_bytes();
75    let start = bytes.len().saturating_sub(line_term.len());
76    if bytes.get(start..) == Some(line_term) {
77        return &bytes[..bytes.len() - line_term.len()];
78    }
79    bytes
80}
81
82/// Return the start and end offsets of the lines containing the given range
83/// of bytes.
84///
85/// Line terminators are considered part of the line they terminate.
86#[inline(always)]
87pub fn locate(bytes: &[u8], line_term: u8, range: Match) -> Match {
88    let line_start = bytes[..range.start()]
89        .rfind_byte(line_term)
90        .map_or(0, |i| i + 1);
91    let line_end = if range.end() > line_start && bytes[range.end() - 1] == line_term {
92        range.end()
93    } else {
94        bytes[range.end()..]
95            .find_byte(line_term)
96            .map_or(bytes.len(), |i| range.end() + i + 1)
97    };
98    Match::new(line_start, line_end)
99}
100
#[cfg(test)]
mod tests {
    use super::*;

    // Six-line sample text. Every line except the last ends with `\n`; the
    // `\` line continuations keep the literal from beginning or ending with
    // a newline.
    const SHERLOCK: &str = "\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.\
";

    // Shorthand for building the expected `Match` in assertions.
    fn m(start: usize, end: usize) -> Match {
        Match::new(start, end)
    }

    // Step through all of `text` with `\n` as the terminator and collect
    // each yielded line as a string slice (terminator included).
    fn lines(text: &str) -> Vec<&str> {
        let mut results = vec![];
        let mut it = LineStep::new(b'\n', 0, text.len());
        while let Some(m) = it.next_match(text.as_bytes()) {
            results.push(&text[m]);
        }
        results
    }

    // Same walk as `lines`, but collects the byte offsets of each line
    // instead of the text itself.
    fn line_ranges(text: &str) -> Vec<std::ops::Range<usize>> {
        let mut results = vec![];
        let mut it = LineStep::new(b'\n', 0, text.len());
        while let Some(m) = it.next_match(text.as_bytes()) {
            results.push(m.start()..m.end());
        }
        results
    }

    // Run `locate` on `text` for the range `start..end` with `\n` as the
    // terminator.
    fn loc(text: &str, start: usize, end: usize) -> Match {
        locate(text.as_bytes(), b'\n', Match::new(start, end))
    }

    // `count` tallies terminator bytes only, so a trailing unterminated line
    // ("c" in the last case) adds nothing.
    #[test]
    fn line_count() {
        assert_eq!(0, count(b"", b'\n'));
        assert_eq!(1, count(b"\n", b'\n'));
        assert_eq!(2, count(b"\n\n", b'\n'));
        assert_eq!(2, count(b"a\nb\nc", b'\n'));
    }

    // Checks `locate` against line boundaries computed by `LineStep`, for
    // the first line (terminated) and the last line (unterminated).
    #[test]
    fn line_locate() {
        let t = SHERLOCK;
        let lines = line_ranges(t);

        // Ranges that lie within the first line and end at its terminator
        // all resolve to the first line.
        assert_eq!(
            loc(t, lines[0].start, lines[0].end),
            m(lines[0].start, lines[0].end)
        );
        assert_eq!(
            loc(t, lines[0].start + 1, lines[0].end),
            m(lines[0].start, lines[0].end)
        );
        assert_eq!(
            loc(t, lines[0].end - 1, lines[0].end),
            m(lines[0].start, lines[0].end)
        );
        // An empty range sitting right after the first terminator belongs
        // to the second line.
        assert_eq!(
            loc(t, lines[0].end, lines[0].end),
            m(lines[1].start, lines[1].end)
        );

        // The same probes against the final line, which has no terminator.
        assert_eq!(
            loc(t, lines[5].start, lines[5].end),
            m(lines[5].start, lines[5].end)
        );
        assert_eq!(
            loc(t, lines[5].start + 1, lines[5].end),
            m(lines[5].start, lines[5].end)
        );
        assert_eq!(
            loc(t, lines[5].end - 1, lines[5].end),
            m(lines[5].start, lines[5].end)
        );
        // An empty range at the very end of the text still resolves to the
        // last line.
        assert_eq!(
            loc(t, lines[5].end, lines[5].end),
            m(lines[5].start, lines[5].end)
        );
    }

    // Edge cases for `locate`: empty input, inputs made only of terminators,
    // and empty ranges positioned on line boundaries.
    #[test]
    fn line_locate_weird() {
        assert_eq!(loc("", 0, 0), m(0, 0));

        assert_eq!(loc("\n", 0, 1), m(0, 1));
        assert_eq!(loc("\n", 1, 1), m(1, 1));

        assert_eq!(loc("\n\n", 0, 0), m(0, 1));
        assert_eq!(loc("\n\n", 0, 1), m(0, 1));
        assert_eq!(loc("\n\n", 1, 1), m(1, 2));
        assert_eq!(loc("\n\n", 1, 2), m(1, 2));
        assert_eq!(loc("\n\n", 2, 2), m(2, 2));

        assert_eq!(loc("a\nb\nc", 0, 1), m(0, 2));
        assert_eq!(loc("a\nb\nc", 1, 2), m(0, 2));
        assert_eq!(loc("a\nb\nc", 2, 3), m(2, 4));
        assert_eq!(loc("a\nb\nc", 3, 4), m(2, 4));
        assert_eq!(loc("a\nb\nc", 4, 5), m(4, 5));
        assert_eq!(loc("a\nb\nc", 5, 5), m(4, 5));
    }

    // `LineStep` yields each line with its terminator attached, yields a
    // trailing unterminated line, and yields nothing for empty input.
    #[test]
    fn line_iter() {
        assert_eq!(lines("abc"), vec!["abc"]);

        assert_eq!(lines("abc\n"), vec!["abc\n"]);
        assert_eq!(lines("abc\nxyz"), vec!["abc\n", "xyz"]);
        assert_eq!(lines("abc\nxyz\n"), vec!["abc\n", "xyz\n"]);

        assert_eq!(lines("abc\n\n"), vec!["abc\n", "\n"]);
        assert_eq!(lines("abc\n\n\n"), vec!["abc\n", "\n", "\n"]);
        assert_eq!(lines("abc\n\nxyz"), vec!["abc\n", "\n", "xyz"]);
        assert_eq!(lines("abc\n\nxyz\n"), vec!["abc\n", "\n", "xyz\n"]);
        assert_eq!(lines("abc\nxyz\n\n"), vec!["abc\n", "xyz\n", "\n"]);

        assert_eq!(lines("\n"), vec!["\n"]);
        assert_eq!(lines(""), Vec::<&str>::new());
    }

    // An empty range (start == end == 0) yields nothing even when the
    // provided buffer is non-empty.
    #[test]
    fn line_iter_empty() {
        let mut it = LineStep::new(b'\n', 0, 0);
        assert_eq!(it.next_match(b"abc"), None);
    }
}