hyperscan/regex/re.rs
1use std::ops::Range;
2use std::str::FromStr;
3use std::sync::Arc;
4use std::vec;
5
6use crate::{
7 common::BlockDatabase,
8 compile::{Builder, Flags, Pattern},
9 runtime::Matching,
10 Error, Result,
11};
12
13/// Match represents a single match of a regex in a haystack.
14///
15/// The lifetime parameter `'t` refers to the lifetime of the matched text.
16#[derive(Copy, Clone, Debug, PartialEq, Eq)]
17pub struct Match<'t> {
18 text: &'t str,
19 start: usize,
20 end: usize,
21}
22
23impl<'t> Match<'t> {
24 /// Returns the starting byte offset of the match in the haystack.
25 #[inline]
26 pub fn start(&self) -> usize {
27 self.start
28 }
29
30 /// Returns the ending byte offset of the match in the haystack.
31 #[inline]
32 pub fn end(&self) -> usize {
33 self.end
34 }
35
36 /// Returns the range over the starting and ending byte offsets of the
37 /// match in the haystack.
38 #[inline]
39 pub fn range(&self) -> Range<usize> {
40 self.start..self.end
41 }
42
43 /// Returns the matched text.
44 #[inline]
45 pub fn as_str(&self) -> &'t str {
46 &self.text[self.start..self.end]
47 }
48
49 /// Creates a new match from the given haystack and byte offsets.
50 #[inline]
51 fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> {
52 Match {
53 text: haystack,
54 start,
55 end,
56 }
57 }
58}
59
60impl<'t> From<Match<'t>> for &'t str {
61 fn from(m: Match<'t>) -> &'t str {
62 m.as_str()
63 }
64}
65
66impl<'t> From<Match<'t>> for Range<usize> {
67 fn from(m: Match<'t>) -> Range<usize> {
68 m.range()
69 }
70}
71
72/// An iterator over all non-overlapping matches for a particular string.
73///
74/// The iterator yields a `Match` value. The iterator stops when no more
75/// matches can be found.
76///
77/// `'r` is the lifetime of the compiled regular expression and `'t` is the
78/// lifetime of the matched string.
79pub struct Matches<'t>(&'t str, vec::IntoIter<Range<usize>>);
80
81impl<'t> Matches<'t> {
82 /// Return the text being searched.
83 pub fn text(&self) -> &'t str {
84 self.0
85 }
86}
87
88impl<'t> Iterator for Matches<'t> {
89 type Item = Match<'t>;
90
91 fn next(&mut self) -> Option<Self::Item> {
92 self.1.next().map(|range| Match::new(self.0, range.start, range.end))
93 }
94}
95
96impl<'t> DoubleEndedIterator for Matches<'t> {
97 fn next_back(&mut self) -> Option<Self::Item> {
98 self.1
99 .next_back()
100 .map(|range| Match::new(self.0, range.start, range.end))
101 }
102}
103
104/// A compiled regular expression for matching Unicode strings.
105#[derive(Clone)]
106pub struct Regex(pub(crate) Arc<BlockDatabase>);
107
108impl FromStr for Regex {
109 type Err = Error;
110
111 /// Attempts to parse a string into a regular expression
112 fn from_str(s: &str) -> Result<Regex> {
113 Regex::new(s)
114 }
115}
116
117/// Core regular expression methods.
118impl Regex {
119 /// Compiles a regular expression.
120 /// Once compiled, it can be used repeatedly to search, split or replace text in a string.
121 ///
122 /// If an invalid expression is given, then an error is returned.
123 pub fn new<S: Into<String>>(re: S) -> Result<Regex> {
124 Self::with_flags(re, Flags::empty())
125 }
126
127 pub(crate) fn with_flags<S: Into<String>>(re: S, flags: Flags) -> Result<Regex> {
128 Pattern::with_flags(re, flags | Flags::SOM_LEFTMOST | Flags::UTF8)?
129 .build()
130 .map(|db| Regex(Arc::new(db)))
131 }
132
133 /// Returns true if and only if the regex matches the string given.
134 ///
135 /// It is recommended to use this method if all you need to do is test a match,
136 /// since the underlying matching engine may be able to do less work.
137 ///
138 /// # Examples
139 ///
140 /// Test if some text contains at least one word with exactly 13 Unicode word characters:
141 ///
142 /// ```rust
143 /// # use hyperscan::regex::Regex;
144 /// let text = "I categorically deny having triskaidekaphobia.";
145 /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text));
146 /// ```
147 pub fn is_match(&self, text: &str) -> bool {
148 let mut matched = false;
149
150 let s = self.0.alloc_scratch().unwrap();
151 let _ = self.0.scan(text, &s, |_, _, _, _| {
152 matched = true;
153
154 Matching::Terminate
155 });
156
157 matched
158 }
159
160 /// Returns the start and end byte range of the leftmost-first match in text. If no match exists, then None is returned.
161 ///
162 /// Note that this should only be used if you want to discover the position of the match. Testing the existence of a match is faster if you use is_match.
163 ///
164 /// # Examples
165 ///
166 /// Find the start and end location of the first word with exactly 13 Unicode word characters:
167 ///
168 /// ```rust
169 /// # use hyperscan::regex::Regex;
170 /// let text = "I categorically deny having triskaidekaphobia.";
171 /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap();
172 /// assert_eq!(mat.start(), 2);
173 /// assert_eq!(mat.end(), 15);
174 /// ```
175 pub fn find<'t>(&self, text: &'t str) -> Option<Match<'t>> {
176 let mut matched = vec![];
177
178 let s = self.0.alloc_scratch().unwrap();
179 let _ = self.0.scan(text, &s, |_, from, to, _| {
180 matched.push((from as usize, to as usize));
181
182 Matching::Terminate
183 });
184
185 matched
186 .first()
187 .map(|&(start, end)| Match::new(&text[start..end], start, end))
188 }
189
190 /// Returns an iterator for each successive non-overlapping match in
191 /// `text`, returning the start and end byte indices with respect to
192 /// `text`.
193 ///
194 /// # Examples
195 ///
196 /// Find the start and end location of every word with exactly 13 Unicode
197 /// word characters:
198 ///
199 /// ```rust
200 /// # use hyperscan::regex::Regex;
201 /// let text = "Retroactively relinquishing remunerations is reprehensible.";
202 /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
203 /// println!("{:?}", mat);
204 /// }
205 /// ```
206 pub fn find_iter<'t>(&self, text: &'t str) -> Matches<'t> {
207 let mut matched = Vec::<Range<usize>>::new();
208
209 let s = self.0.alloc_scratch().unwrap();
210 let _ = self.0.scan(text, &s, |_, from, to, _| {
211 let range = from as usize..to as usize;
212
213 match matched.last() {
214 Some(last) if last.start == range.start && last.end < range.end => {
215 // only the non-overlapping match should be return
216 *matched.last_mut().unwrap() = range;
217 }
218 _ => matched.push(range),
219 }
220
221 Matching::Continue
222 });
223
224 Matches(text, matched.into_iter())
225 }
226
227 /// Returns an iterator of substrings of `text` delimited by a match of the
228 /// regular expression. Namely, each element of the iterator corresponds to
229 /// text that *isn't* matched by the regular expression.
230 ///
231 /// This method will *not* copy the text given.
232 ///
233 /// # Examples
234 ///
235 /// To split a string delimited by arbitrary amounts of spaces or tabs:
236 ///
237 /// ```rust
238 /// # use hyperscan::regex::Regex;
239 /// let re = Regex::new(r"[ \t]+").unwrap();
240 /// let fields: Vec<&str> = re.split("a b \t c\td e").collect();
241 /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
242 /// ```
243 pub fn split<'t>(&self, text: &'t str) -> Split<'t> {
244 Split {
245 finder: self.find_iter(text),
246 last: 0,
247 }
248 }
249
250 /// Returns an iterator of at most `limit` substrings of `text` delimited
251 /// by a match of the regular expression. (A `limit` of `0` will return no
252 /// substrings.) Namely, each element of the iterator corresponds to text
253 /// that *isn't* matched by the regular expression. The remainder of the
254 /// string that is not split will be the last element in the iterator.
255 ///
256 /// This method will *not* copy the text given.
257 ///
258 /// # Examples
259 ///
260 /// Get the first two words in some text:
261 ///
262 /// ```rust
263 /// # use hyperscan::regex::Regex;
264 /// let re = Regex::new(r"\W+").unwrap();
265 /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect();
266 /// assert_eq!(fields, vec!("Hey", "How", "are you?"));
267 /// ```
268 pub fn splitn<'t>(&self, text: &'t str, limit: usize) -> SplitN<'t> {
269 SplitN {
270 splits: self.split(text),
271 n: limit,
272 }
273 }
274}
275
276/// Yields all substrings delimited by a regular expression match.
277///
278/// `'t` is the lifetime of the string being split.
279pub struct Split<'t> {
280 finder: Matches<'t>,
281 last: usize,
282}
283
284impl<'t> Iterator for Split<'t> {
285 type Item = &'t str;
286
287 fn next(&mut self) -> Option<&'t str> {
288 let text = self.finder.text();
289 match self.finder.next() {
290 None => {
291 if self.last > text.len() {
292 None
293 } else {
294 let s = &text[self.last..];
295 self.last = text.len() + 1; // Next call will return None
296 Some(s)
297 }
298 }
299 Some(m) => {
300 let matched = &text[self.last..m.start()];
301 self.last = m.end();
302 Some(matched)
303 }
304 }
305 }
306}
307
308/// Yields at most `N` substrings delimited by a regular expression match.
309///
310/// The last substring will be whatever remains after splitting.
311///
312/// `'t` is the lifetime of the string being split.
313pub struct SplitN<'t> {
314 splits: Split<'t>,
315 n: usize,
316}
317
318impl<'t> Iterator for SplitN<'t> {
319 type Item = &'t str;
320
321 fn next(&mut self) -> Option<&'t str> {
322 if self.n == 0 {
323 return None;
324 }
325
326 self.n -= 1;
327 if self.n > 0 {
328 return self.splits.next();
329 }
330
331 let text = self.splits.finder.text();
332 if self.splits.last > text.len() {
333 // We've already returned all substrings.
334 None
335 } else {
336 // self.n == 0, so future calls will return None immediately
337 Some(&text[self.splits.last..])
338 }
339 }
340}
341
342#[cfg(test)]
343mod tests {
344 #[test]
345 fn test_find_iter() {
346 let regex = r"\b\w{13}\b";
347 let text = "Retroactively relinquishing remunerations is reprehensible.";
348
349 assert_eq!(
350 regex::Regex::new(regex)
351 .unwrap()
352 .find_iter(text)
353 .map(|m| m.range())
354 .collect::<Vec<_>>(),
355 super::Regex::new(regex)
356 .unwrap()
357 .find_iter(text)
358 .map(|m| m.range())
359 .collect::<Vec<_>>()
360 );
361 }
362}