hyperscan/regex/
re.rs

1use std::ops::Range;
2use std::str::FromStr;
3use std::sync::Arc;
4use std::vec;
5
6use crate::{
7    common::BlockDatabase,
8    compile::{Builder, Flags, Pattern},
9    runtime::Matching,
10    Error, Result,
11};
12
13/// Match represents a single match of a regex in a haystack.
14///
15/// The lifetime parameter `'t` refers to the lifetime of the matched text.
16#[derive(Copy, Clone, Debug, PartialEq, Eq)]
17pub struct Match<'t> {
18    text: &'t str,
19    start: usize,
20    end: usize,
21}
22
23impl<'t> Match<'t> {
24    /// Returns the starting byte offset of the match in the haystack.
25    #[inline]
26    pub fn start(&self) -> usize {
27        self.start
28    }
29
30    /// Returns the ending byte offset of the match in the haystack.
31    #[inline]
32    pub fn end(&self) -> usize {
33        self.end
34    }
35
36    /// Returns the range over the starting and ending byte offsets of the
37    /// match in the haystack.
38    #[inline]
39    pub fn range(&self) -> Range<usize> {
40        self.start..self.end
41    }
42
43    /// Returns the matched text.
44    #[inline]
45    pub fn as_str(&self) -> &'t str {
46        &self.text[self.start..self.end]
47    }
48
49    /// Creates a new match from the given haystack and byte offsets.
50    #[inline]
51    fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> {
52        Match {
53            text: haystack,
54            start,
55            end,
56        }
57    }
58}
59
60impl<'t> From<Match<'t>> for &'t str {
61    fn from(m: Match<'t>) -> &'t str {
62        m.as_str()
63    }
64}
65
66impl<'t> From<Match<'t>> for Range<usize> {
67    fn from(m: Match<'t>) -> Range<usize> {
68        m.range()
69    }
70}
71
72/// An iterator over all non-overlapping matches for a particular string.
73///
74/// The iterator yields a `Match` value. The iterator stops when no more
75/// matches can be found.
76///
77/// `'r` is the lifetime of the compiled regular expression and `'t` is the
78/// lifetime of the matched string.
79pub struct Matches<'t>(&'t str, vec::IntoIter<Range<usize>>);
80
81impl<'t> Matches<'t> {
82    /// Return the text being searched.
83    pub fn text(&self) -> &'t str {
84        self.0
85    }
86}
87
88impl<'t> Iterator for Matches<'t> {
89    type Item = Match<'t>;
90
91    fn next(&mut self) -> Option<Self::Item> {
92        self.1.next().map(|range| Match::new(self.0, range.start, range.end))
93    }
94}
95
96impl<'t> DoubleEndedIterator for Matches<'t> {
97    fn next_back(&mut self) -> Option<Self::Item> {
98        self.1
99            .next_back()
100            .map(|range| Match::new(self.0, range.start, range.end))
101    }
102}
103
104/// A compiled regular expression for matching Unicode strings.
105#[derive(Clone)]
106pub struct Regex(pub(crate) Arc<BlockDatabase>);
107
108impl FromStr for Regex {
109    type Err = Error;
110
111    /// Attempts to parse a string into a regular expression
112    fn from_str(s: &str) -> Result<Regex> {
113        Regex::new(s)
114    }
115}
116
117/// Core regular expression methods.
118impl Regex {
119    /// Compiles a regular expression.
120    /// Once compiled, it can be used repeatedly to search, split or replace text in a string.
121    ///
122    /// If an invalid expression is given, then an error is returned.
123    pub fn new<S: Into<String>>(re: S) -> Result<Regex> {
124        Self::with_flags(re, Flags::empty())
125    }
126
127    pub(crate) fn with_flags<S: Into<String>>(re: S, flags: Flags) -> Result<Regex> {
128        Pattern::with_flags(re, flags | Flags::SOM_LEFTMOST | Flags::UTF8)?
129            .build()
130            .map(|db| Regex(Arc::new(db)))
131    }
132
133    /// Returns true if and only if the regex matches the string given.
134    ///
135    /// It is recommended to use this method if all you need to do is test a match,
136    /// since the underlying matching engine may be able to do less work.
137    ///
138    /// # Examples
139    ///
140    /// Test if some text contains at least one word with exactly 13 Unicode word characters:
141    ///
142    /// ```rust
143    /// # use hyperscan::regex::Regex;
144    /// let text = "I categorically deny having triskaidekaphobia.";
145    /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text));
146    /// ```
147    pub fn is_match(&self, text: &str) -> bool {
148        let mut matched = false;
149
150        let s = self.0.alloc_scratch().unwrap();
151        let _ = self.0.scan(text, &s, |_, _, _, _| {
152            matched = true;
153
154            Matching::Terminate
155        });
156
157        matched
158    }
159
160    /// Returns the start and end byte range of the leftmost-first match in text. If no match exists, then None is returned.
161    ///
162    /// Note that this should only be used if you want to discover the position of the match. Testing the existence of a match is faster if you use is_match.
163    ///
164    /// # Examples
165    ///
166    /// Find the start and end location of the first word with exactly 13 Unicode word characters:
167    ///
168    /// ```rust
169    /// # use hyperscan::regex::Regex;
170    /// let text = "I categorically deny having triskaidekaphobia.";
171    /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap();
172    /// assert_eq!(mat.start(), 2);
173    /// assert_eq!(mat.end(), 15);
174    /// ```
175    pub fn find<'t>(&self, text: &'t str) -> Option<Match<'t>> {
176        let mut matched = vec![];
177
178        let s = self.0.alloc_scratch().unwrap();
179        let _ = self.0.scan(text, &s, |_, from, to, _| {
180            matched.push((from as usize, to as usize));
181
182            Matching::Terminate
183        });
184
185        matched
186            .first()
187            .map(|&(start, end)| Match::new(&text[start..end], start, end))
188    }
189
190    /// Returns an iterator for each successive non-overlapping match in
191    /// `text`, returning the start and end byte indices with respect to
192    /// `text`.
193    ///
194    /// # Examples
195    ///
196    /// Find the start and end location of every word with exactly 13 Unicode
197    /// word characters:
198    ///
199    /// ```rust
200    /// # use hyperscan::regex::Regex;
201    /// let text = "Retroactively relinquishing remunerations is reprehensible.";
202    /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
203    ///     println!("{:?}", mat);
204    /// }
205    /// ```
206    pub fn find_iter<'t>(&self, text: &'t str) -> Matches<'t> {
207        let mut matched = Vec::<Range<usize>>::new();
208
209        let s = self.0.alloc_scratch().unwrap();
210        let _ = self.0.scan(text, &s, |_, from, to, _| {
211            let range = from as usize..to as usize;
212
213            match matched.last() {
214                Some(last) if last.start == range.start && last.end < range.end => {
215                    // only the non-overlapping match should be return
216                    *matched.last_mut().unwrap() = range;
217                }
218                _ => matched.push(range),
219            }
220
221            Matching::Continue
222        });
223
224        Matches(text, matched.into_iter())
225    }
226
227    /// Returns an iterator of substrings of `text` delimited by a match of the
228    /// regular expression. Namely, each element of the iterator corresponds to
229    /// text that *isn't* matched by the regular expression.
230    ///
231    /// This method will *not* copy the text given.
232    ///
233    /// # Examples
234    ///
235    /// To split a string delimited by arbitrary amounts of spaces or tabs:
236    ///
237    /// ```rust
238    /// # use hyperscan::regex::Regex;
239    /// let re = Regex::new(r"[ \t]+").unwrap();
240    /// let fields: Vec<&str> = re.split("a b \t  c\td    e").collect();
241    /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
242    /// ```
243    pub fn split<'t>(&self, text: &'t str) -> Split<'t> {
244        Split {
245            finder: self.find_iter(text),
246            last: 0,
247        }
248    }
249
250    /// Returns an iterator of at most `limit` substrings of `text` delimited
251    /// by a match of the regular expression. (A `limit` of `0` will return no
252    /// substrings.) Namely, each element of the iterator corresponds to text
253    /// that *isn't* matched by the regular expression. The remainder of the
254    /// string that is not split will be the last element in the iterator.
255    ///
256    /// This method will *not* copy the text given.
257    ///
258    /// # Examples
259    ///
260    /// Get the first two words in some text:
261    ///
262    /// ```rust
263    /// # use hyperscan::regex::Regex;
264    /// let re = Regex::new(r"\W+").unwrap();
265    /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect();
266    /// assert_eq!(fields, vec!("Hey", "How", "are you?"));
267    /// ```
268    pub fn splitn<'t>(&self, text: &'t str, limit: usize) -> SplitN<'t> {
269        SplitN {
270            splits: self.split(text),
271            n: limit,
272        }
273    }
274}
275
276/// Yields all substrings delimited by a regular expression match.
277///
278/// `'t` is the lifetime of the string being split.
279pub struct Split<'t> {
280    finder: Matches<'t>,
281    last: usize,
282}
283
284impl<'t> Iterator for Split<'t> {
285    type Item = &'t str;
286
287    fn next(&mut self) -> Option<&'t str> {
288        let text = self.finder.text();
289        match self.finder.next() {
290            None => {
291                if self.last > text.len() {
292                    None
293                } else {
294                    let s = &text[self.last..];
295                    self.last = text.len() + 1; // Next call will return None
296                    Some(s)
297                }
298            }
299            Some(m) => {
300                let matched = &text[self.last..m.start()];
301                self.last = m.end();
302                Some(matched)
303            }
304        }
305    }
306}
307
308/// Yields at most `N` substrings delimited by a regular expression match.
309///
310/// The last substring will be whatever remains after splitting.
311///
312/// `'t` is the lifetime of the string being split.
313pub struct SplitN<'t> {
314    splits: Split<'t>,
315    n: usize,
316}
317
318impl<'t> Iterator for SplitN<'t> {
319    type Item = &'t str;
320
321    fn next(&mut self) -> Option<&'t str> {
322        if self.n == 0 {
323            return None;
324        }
325
326        self.n -= 1;
327        if self.n > 0 {
328            return self.splits.next();
329        }
330
331        let text = self.splits.finder.text();
332        if self.splits.last > text.len() {
333            // We've already returned all substrings.
334            None
335        } else {
336            // self.n == 0, so future calls will return None immediately
337            Some(&text[self.splits.last..])
338        }
339    }
340}
341
342#[cfg(test)]
343mod tests {
344    #[test]
345    fn test_find_iter() {
346        let regex = r"\b\w{13}\b";
347        let text = "Retroactively relinquishing remunerations is reprehensible.";
348
349        assert_eq!(
350            regex::Regex::new(regex)
351                .unwrap()
352                .find_iter(text)
353                .map(|m| m.range())
354                .collect::<Vec<_>>(),
355            super::Regex::new(regex)
356                .unwrap()
357                .find_iter(text)
358                .map(|m| m.range())
359                .collect::<Vec<_>>()
360        );
361    }
362}