onig_regset/
find.rs

1use super::{Regex, Region, SearchOptions};
2use std::iter::FusedIterator;
3
4impl Regex {
5    /// Returns the capture groups corresponding to the leftmost-first match
6    /// in text. Capture group `0` always corresponds to the entire match.
7    /// If no match is found, then `None` is returned.
8    pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
9        let mut region = Region::new();
10        self.search_with_options(
11            text,
12            0,
13            text.len(),
14            SearchOptions::SEARCH_OPTION_NONE,
15            Some(&mut region),
16        )
17        .map(|pos| Captures {
18            text,
19            region,
20            offset: pos,
21        })
22    }
23
24    /// Returns an iterator for each successive non-overlapping match in `text`,
25    /// returning the start and end byte indices with respect to `text`.
26    ///
27    /// # Example
28    ///
29    /// Find the start and end location of every word with exactly 13
30    /// characters:
31    ///
32    /// ```rust
33    /// # use onig::Regex;
34    /// # fn main() {
35    /// let text = "Retroactively relinquishing remunerations is reprehensible.";
36    /// for pos in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
37    ///     println!("{:?}", pos);
38    /// }
39    /// // Output:
40    /// // (0, 13)
41    /// // (14, 27)
42    /// // (28, 41)
43    /// // (45, 58)
44    /// # }
45    /// ```
46    pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
47        FindMatches {
48            regex: self,
49            region: Region::new(),
50            text,
51            last_end: 0,
52            last_match_end: None,
53        }
54    }
55
56    /// Returns an iterator over all the non-overlapping capture groups matched
57    /// in `text`. This is operationally the same as `find_iter` (except it
58    /// yields information about submatches).
59    ///
60    /// # Example
61    ///
62    /// We can use this to find all movie titles and their release years in
63    /// some text, where the movie is formatted like "'Title' (xxxx)":
64    ///
65    /// ```rust
66    /// # use onig::Regex;
67    /// # fn main() {
68    /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)")
69    ///                .unwrap();
70    /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
71    /// for caps in re.captures_iter(text) {
72    ///     println!("Movie: {:?}, Released: {:?}", caps.at(1), caps.at(2));
73    /// }
74    /// // Output:
75    /// // Movie: Citizen Kane, Released: 1941
76    /// // Movie: The Wizard of Oz, Released: 1939
77    /// // Movie: M, Released: 1931
78    /// # }
79    /// ```
80    pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> FindCaptures<'r, 't> {
81        FindCaptures {
82            regex: self,
83            text,
84            last_end: 0,
85            last_match_end: None,
86        }
87    }
88
89    /// Returns an iterator of substrings of `text` delimited by a match
90    /// of the regular expression.
91    /// Namely, each element of the iterator corresponds to text that *isn't*
92    /// matched by the regular expression.
93    ///
94    /// This method will *not* copy the text given.
95    ///
96    /// # Example
97    ///
98    /// To split a string delimited by arbitrary amounts of spaces or tabs:
99    ///
100    /// ```rust
101    /// # use onig::Regex;
102    /// # fn main() {
103    /// let re = Regex::new(r"[ \t]+").unwrap();
104    /// let fields: Vec<&str> = re.split("a b \t  c\td    e").collect();
105    /// assert_eq!(fields, vec!("a", "b", "c", "d", "e"));
106    /// # }
107    /// ```
108    pub fn split<'r, 't>(&'r self, text: &'t str) -> RegexSplits<'r, 't> {
109        RegexSplits {
110            finder: self.find_iter(text),
111            last: 0,
112        }
113    }
114
115    /// Returns an iterator of at most `limit` substrings of `text` delimited
116    /// by a match of the regular expression. (A `limit` of `0` will return no
117    /// substrings.)
118    /// Namely, each element of the iterator corresponds to text that *isn't*
119    /// matched by the regular expression.
120    /// The remainder of the string that is not split will be the last element
121    /// in the iterator.
122    ///
123    /// This method will *not* copy the text given.
124    ///
125    /// # Example
126    ///
127    /// Get the first two words in some text:
128    ///
129    /// ```rust
130    /// # use onig::Regex;
131    /// # fn main() {
132    /// let re = Regex::new(r"\W+").unwrap();
133    /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect();
134    /// assert_eq!(fields, vec!("Hey", "How", "are you?"));
135    /// # }
136    /// ```
137    pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: usize) -> RegexSplitsN<'r, 't> {
138        RegexSplitsN {
139            splits: self.split(text),
140            n: limit,
141        }
142    }
143
    /// Scan the given slice, capturing into the given region and
    /// executing a callback for each match.
    ///
    /// The whole of `to_search` is handed to `onig_scan` together with
    /// `region` and the bits of `options`. For every callback invocation made
    /// by `onig_scan`, `callback` receives the two integers supplied by the
    /// library and a `Region` cloned from the raw region pointer supplied
    /// alongside them.
    /// NOTE(review): the two integers are presumably the match index and the
    /// match position (`scan` uses the second one as the capture offset) —
    /// confirm against the `onig_scan` documentation.
    ///
    /// `callback` returning `true` is reported back to the library as `0`,
    /// `false` as `-1`.
    /// NOTE(review): a non-zero value presumably stops the scan — confirm.
    ///
    /// Returns the value produced by `onig_scan` unchanged.
    pub fn scan_with_region<F>(
        &self,
        to_search: &str,
        region: &mut Region,
        options: SearchOptions,
        mut callback: F,
    ) -> i32
    where
        F: Fn(i32, i32, &Region) -> bool,
    {
        use onig_sys::{onig_scan, OnigRegion};
        use std::os::raw::{c_int, c_void};

        // Find the bounds of the string we're searching: a pointer to the
        // first byte and a pointer just past the last byte (taken from the
        // empty slice at the end of the string).
        let start = to_search.as_ptr();
        let end = to_search[to_search.len()..].as_ptr();

        // Trampoline with a C ABI that `onig_scan` can call. It is generic
        // over `F` so the user-data pointer can be cast back to the concrete
        // closure type.
        unsafe extern "C" fn scan_cb<F>(
            i: c_int,
            j: c_int,
            r: *mut OnigRegion,
            ud: *mut c_void,
        ) -> c_int
        where
            F: Fn(i32, i32, &Region) -> bool,
        {
            // The callback gets its own copy of the region rather than a view
            // of the raw one.
            // NOTE(review): assumes `r` is valid for `Region::clone_from_raw`
            // whenever the library invokes this function — confirm.
            let region = Region::clone_from_raw(r);
            // `ud` is the pointer to `callback` passed to `onig_scan` below;
            // only a shared reference is formed, which is all `Fn` needs.
            let callback = &*(ud as *mut F);
            // Translate the boolean verdict into the integer status returned
            // to the C caller.
            if callback(i, j, &region) {
                0
            } else {
                -1
            }
        }

        // `callback` lives on this stack frame for the whole `onig_scan`
        // call, so the user-data pointer handed to the library stays valid
        // while `scan_cb` may run.
        // NOTE(review): relies on `onig_scan` not retaining the callback or
        // user-data pointer after it returns — confirm.
        unsafe {
            onig_scan(
                self.raw,
                start,
                end,
                (&mut region.raw) as *mut ::onig_sys::OnigRegion,
                options.bits(),
                Some(scan_cb::<F>),
                &mut callback as *mut F as *mut c_void,
            )
        }
    }
193
194    /// Scan a Pattern and Observe Captures
195    ///
196    /// The scan function takes a haystack `to_search` and invokes the
197    /// given `callback` for each capture of this expression.
198    pub fn scan<'t, CB>(&self, to_search: &'t str, callback: CB)
199    where
200        CB: Fn(i32, Captures<'t>) -> bool,
201    {
202        let mut region = Region::new();
203        self.scan_with_region(
204            to_search,
205            &mut region,
206            SearchOptions::SEARCH_OPTION_NONE,
207            |n, s, region| {
208                let captures = Captures {
209                    text: to_search,
210                    region: region.clone(),
211                    offset: s as usize,
212                };
213                callback(n, captures)
214            },
215        );
216    }
217}
218
219/// Captures represents a group of captured strings for a single match.
220///
221/// The 0th capture always corresponds to the entire match. Each subsequent
222/// index corresponds to the next capture group in the regex. Positions
223/// returned from a capture group are always byte indices.
224///
225/// `'t` is the lifetime of the matched text.
226#[derive(Debug)]
227pub struct Captures<'t> {
228    text: &'t str,
229    region: Region,
230    offset: usize,
231}
232
233impl<'t> Captures<'t> {
234    /// This constructor is primarily intended for internal use by other parts
235    /// of the onig crate, such as RegSet.
236    pub(crate) fn new(text: &'t str, region: Region, offset: usize) -> Self {
237        Captures {
238            text,
239            region,
240            offset,
241        }
242    }
243
244    /// Returns the start and end positions of the Nth capture group. Returns
245    /// `None` if i is not a valid capture group or if the capture group did
246    /// not match anything. The positions returned are always byte indices with
247    /// respect to the original string matched.
248    pub fn pos(&self, pos: usize) -> Option<(usize, usize)> {
249        self.region.pos(pos)
250    }
251
252    /// Returns the matched string for the capture group `i`. If `i` isn't
253    /// a valid capture group or didn't match anything, then `None` is returned.
254    pub fn at(&self, pos: usize) -> Option<&'t str> {
255        self.pos(pos).map(|(beg, end)| &self.text[beg..end])
256    }
257
258    /// Returns the number of captured groups.
259    pub fn len(&self) -> usize {
260        self.region.len()
261    }
262
263    /// Returns true if and only if there are no captured groups.
264    pub fn is_empty(&self) -> bool {
265        self.len() == 0
266    }
267
268    /// Creates an iterator of all the capture groups in order of appearance in
269    /// the regular expression.
270    pub fn iter(&'t self) -> SubCaptures<'t> {
271        SubCaptures { idx: 0, caps: self }
272    }
273
274    /// Creates an iterator of all the capture group positions in order of
275    /// appearance in the regular expression. Positions are byte indices in
276    /// terms of the original string matched.
277    pub fn iter_pos(&'t self) -> SubCapturesPos<'t> {
278        SubCapturesPos { idx: 0, caps: self }
279    }
280
281    /// Offset of the captures within the given string slice.
282    pub fn offset(&self) -> usize {
283        self.offset
284    }
285}
286
287/// An iterator over capture groups for a particular match of a regular
288/// expression.
289///
290/// `'t` is the lifetime of the matched text.
291pub struct SubCaptures<'t> {
292    idx: usize,
293    caps: &'t Captures<'t>,
294}
295
296impl<'t> Iterator for SubCaptures<'t> {
297    type Item = Option<&'t str>;
298
299    fn next(&mut self) -> Option<Option<&'t str>> {
300        if self.idx < self.caps.len() {
301            self.idx += 1;
302            Some(self.caps.at(self.idx - 1))
303        } else {
304            None
305        }
306    }
307
308    fn size_hint(&self) -> (usize, Option<usize>) {
309        let size = self.caps.len();
310        (size, Some(size))
311    }
312
313    fn count(self) -> usize {
314        self.caps.len()
315    }
316}
317
318impl<'t> FusedIterator for SubCaptures<'t> {}
319
320impl<'t> ExactSizeIterator for SubCaptures<'t> {}
321
322/// An iterator over capture group positions for a particular match of
323/// a regular expression.
324///
325/// Positions are byte indices in terms of the original
326/// string matched. `'t` is the lifetime of the matched text.
327pub struct SubCapturesPos<'t> {
328    idx: usize,
329    caps: &'t Captures<'t>,
330}
331
332impl<'t> Iterator for SubCapturesPos<'t> {
333    type Item = Option<(usize, usize)>;
334
335    fn next(&mut self) -> Option<Option<(usize, usize)>> {
336        if self.idx < self.caps.len() {
337            self.idx += 1;
338            Some(self.caps.pos(self.idx - 1))
339        } else {
340            None
341        }
342    }
343
344    fn size_hint(&self) -> (usize, Option<usize>) {
345        let size = self.caps.len();
346        (size, Some(size))
347    }
348
349    fn count(self) -> usize {
350        self.caps.len()
351    }
352}
353
354impl<'t> FusedIterator for SubCapturesPos<'t> {}
355
356impl<'t> ExactSizeIterator for SubCapturesPos<'t> {}
357
358/// An iterator over all non-overlapping matches for a particular string.
359///
360/// The iterator yields a tuple of integers corresponding to the start and end
361/// of the match. The indices are byte offsets. The iterator stops when no more
362/// matches can be found.
363///
364/// `'r` is the lifetime of the `Regex` struct and `'t` is the lifetime
365/// of the matched string.
366pub struct FindMatches<'r, 't> {
367    regex: &'r Regex,
368    region: Region,
369    text: &'t str,
370    last_end: usize,
371    last_match_end: Option<usize>,
372}
373
374impl<'r, 't> Iterator for FindMatches<'r, 't> {
375    type Item = (usize, usize);
376
377    fn next(&mut self) -> Option<(usize, usize)> {
378        if self.last_end > self.text.len() {
379            return None;
380        }
381        self.region.clear();
382        self.regex.search_with_options(
383            self.text,
384            self.last_end,
385            self.text.len(),
386            SearchOptions::SEARCH_OPTION_NONE,
387            Some(&mut self.region),
388        )?;
389        let (s, e) = self.region.pos(0).unwrap();
390
391        // Don't accept empty matches immediately following the last match.
392        // i.e., no infinite loops please.
393        if e == s && self.last_match_end.map_or(false, |l| l == e) {
394            self.last_end += self.text[self.last_end..]
395                .chars()
396                .next()
397                .map(|c| c.len_utf8())
398                .unwrap_or(1);
399            return self.next();
400        } else {
401            self.last_end = e;
402            self.last_match_end = Some(e);
403        }
404
405        Some((s, e))
406    }
407}
408
409impl<'r, 't> FusedIterator for FindMatches<'r, 't> {}
410
411/// An iterator that yields all non-overlapping capture groups matching a
412/// particular regular expression.
413///
414/// The iterator stops when no more matches can be found.
415///
416/// `'r` is the lifetime of the `Regex` struct and `'t` is the lifetime
417/// of the matched string.
418pub struct FindCaptures<'r, 't> {
419    regex: &'r Regex,
420    text: &'t str,
421    last_end: usize,
422    last_match_end: Option<usize>,
423}
424
425impl<'r, 't> Iterator for FindCaptures<'r, 't> {
426    type Item = Captures<'t>;
427
428    fn next(&mut self) -> Option<Captures<'t>> {
429        if self.last_end > self.text.len() {
430            return None;
431        }
432
433        let mut region = Region::new();
434        let r = self.regex.search_with_options(
435            self.text,
436            self.last_end,
437            self.text.len(),
438            SearchOptions::SEARCH_OPTION_NONE,
439            Some(&mut region),
440        )?;
441        let (s, e) = region.pos(0).unwrap();
442
443        // Don't accept empty matches immediately following the last match.
444        // i.e., no infinite loops please.
445        if e == s && self.last_match_end.map_or(false, |l| l == e) {
446            self.last_end += self.text[self.last_end..]
447                .chars()
448                .next()
449                .map(|c| c.len_utf8())
450                .unwrap_or(1);
451            return self.next();
452        } else {
453            self.last_end = e;
454            self.last_match_end = Some(e);
455        }
456        Some(Captures {
457            text: self.text,
458            region,
459            offset: r,
460        })
461    }
462}
463
464impl<'r, 't> FusedIterator for FindCaptures<'r, 't> {}
465
466/// Yields all substrings delimited by a regular expression match.
467///
468/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime
469/// of the string being split.
470pub struct RegexSplits<'r, 't> {
471    finder: FindMatches<'r, 't>,
472    last: usize,
473}
474
475impl<'r, 't> Iterator for RegexSplits<'r, 't> {
476    type Item = &'t str;
477
478    fn next(&mut self) -> Option<&'t str> {
479        let text = self.finder.text;
480        match self.finder.next() {
481            None => {
482                if self.last >= text.len() {
483                    None
484                } else {
485                    let s = &text[self.last..];
486                    self.last = text.len();
487                    Some(s)
488                }
489            }
490            Some((s, e)) => {
491                let matched = &text[self.last..s];
492                self.last = e;
493                Some(matched)
494            }
495        }
496    }
497}
498
499impl<'r, 't> FusedIterator for RegexSplits<'r, 't> {}
500
501/// Yields at most `N` substrings delimited by a regular expression match.
502///
503/// The last substring will be whatever remains after splitting.
504///
505/// `'r` is the lifetime of the compiled expression and `'t` is the lifetime
506/// of the string being split.
507pub struct RegexSplitsN<'r, 't> {
508    splits: RegexSplits<'r, 't>,
509    n: usize,
510}
511
512impl<'r, 't> Iterator for RegexSplitsN<'r, 't> {
513    type Item = &'t str;
514
515    fn next(&mut self) -> Option<&'t str> {
516        if self.n == 0 {
517            return None;
518        }
519        self.n -= 1;
520        if self.n == 0 {
521            let text = self.splits.finder.text;
522            Some(&text[self.splits.last..])
523        } else {
524            self.splits.next()
525        }
526    }
527
528    fn size_hint(&self) -> (usize, Option<usize>) {
529        (0, Some(self.n))
530    }
531}
532
533impl<'r, 't> FusedIterator for RegexSplitsN<'r, 't> {}
534
#[cfg(test)]
mod tests {
    use super::super::*;

    // Only the first alternative matches, so group 2 exists but is unset.
    #[test]
    fn test_regex_captures() {
        let re = Regex::new("e(l+)|(r+)").unwrap();
        let caps = re.captures("hello").unwrap();

        assert_eq!(caps.len(), 3);
        assert!(!caps.is_empty());

        assert_eq!(caps.pos(0), Some((1, 4)));
        assert_eq!(caps.pos(1), Some((2, 4)));
        assert_eq!(caps.pos(2), None);

        assert_eq!(caps.at(0), Some("ell"));
        assert_eq!(caps.at(1), Some("ll"));
        assert_eq!(caps.at(2), None);
    }

    // Iterating the captures yields the whole match followed by group 1.
    #[test]
    fn test_regex_subcaptures() {
        let re = Regex::new("e(l+)").unwrap();
        let caps = re.captures("hello").unwrap();
        let groups: Vec<_> = caps.iter().collect();
        assert_eq!(groups.len(), 2);
        assert_eq!(groups[0], Some("ell"));
        assert_eq!(groups[1], Some("ll"));
    }

    // Same as above, but for the byte positions of the groups.
    #[test]
    fn test_regex_subcapturespos() {
        let re = Regex::new("e(l+)").unwrap();
        let caps = re.captures("hello").unwrap();
        let spans: Vec<_> = caps.iter_pos().collect();
        assert_eq!(spans.len(), 2);
        assert_eq!(spans[0], Some((1, 4)));
        assert_eq!(spans[1], Some((2, 4)));
    }

    #[test]
    fn test_find_iter() {
        let digits = Regex::new(r"\d+").unwrap();
        let found: Vec<_> = digits.find_iter("a12b2").collect();
        assert_eq!(found, vec![(1, 3), (4, 5)]);
    }

    // A pattern that can match the empty string: one empty match up front,
    // then the digit runs.
    #[test]
    fn test_find_iter_one_zero_length() {
        let maybe_digits = Regex::new(r"\d*").unwrap();
        let found: Vec<_> = maybe_digits.find_iter("a1b2").collect();
        assert_eq!(found, vec![(0, 0), (1, 2), (3, 4)]);
    }

    // Several consecutive non-digits produce several empty matches.
    #[test]
    fn test_find_iter_many_zero_length() {
        let maybe_digits = Regex::new(r"\d*").unwrap();
        let found: Vec<_> = maybe_digits.find_iter("a1bbb2").collect();
        assert_eq!(found, vec![(0, 0), (1, 2), (3, 3), (4, 4), (5, 6)]);
    }

    // An empty lookahead match is still found after a non-empty match.
    #[test]
    fn test_find_iter_empty_after_match() {
        let re = Regex::new(r"b|(?=,)").unwrap();
        let found: Vec<_> = re.find_iter("ba,").collect();
        assert_eq!(found, vec![(0, 1), (2, 2)]);
    }

    // Word boundaries are all empty matches; each must be reported once.
    #[test]
    fn test_zero_length_matches_jumps_past_match_location() {
        let boundary = Regex::new(r"\b").unwrap();
        let found: Vec<_> = boundary.find_iter("test string").collect();
        assert_eq!(found, [(0, 0), (4, 4), (5, 5), (11, 11)]);
    }

    #[test]
    fn test_captures_iter() {
        let digits = Regex::new(r"\d+").unwrap();
        let all: Vec<_> = digits.captures_iter("a12b2").collect();
        assert_eq!(all[0].pos(0), Some((1, 3)));
        assert_eq!(all[1].pos(0), Some((4, 5)));
    }

    // The offset of a `Captures` is where its match starts, both for a
    // single search and for every item of `captures_iter`.
    #[test]
    fn test_captures_stores_match_offset() {
        let decimal = Regex::new(r"\d+\.(\d+)").unwrap();

        let first = decimal.captures("100 - 3.1415 / 2.0").unwrap();
        assert_eq!(6, first.offset());

        let offsets: Vec<_> = decimal
            .captures_iter("1 - 3234.3 * 123.2 - 100")
            .map(|caps| caps.offset())
            .collect();
        assert_eq!(vec![4, 13], offsets);
    }
}