onig_regset/
regset.rs

1use crate::{Captures, EncodedChars, Error, Regex, RegexOptions, Region, SearchOptions};
2
3use std::os::raw::c_int;
4use std::ptr::null_mut;
5
/// Defines the search priority when multiple regexes could match at the same position.
///
/// Each variant is translated to the corresponding raw `onig_sys::OnigRegSetLead`
/// constant before a search is started.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RegSetLead {
    /// Return the match that occurs first in the text (position priority)
    Position,
    /// Regex-led search.
    ///
    /// Thought to produce the same results as `Position`, only slower
    /// (unverified — confirm against the Oniguruma documentation).
    Regex,
    /// Return the first regex in your regset that matches, regardless of position
    PriorityToRegexOrder,
}
16
17impl RegSetLead {
18    fn to_onig_lead(self) -> onig_sys::OnigRegSetLead {
19        match self {
20            RegSetLead::Position => onig_sys::OnigRegSetLead_ONIG_REGSET_POSITION_LEAD,
21            RegSetLead::Regex => onig_sys::OnigRegSetLead_ONIG_REGSET_REGEX_LEAD,
22            RegSetLead::PriorityToRegexOrder => {
23                onig_sys::OnigRegSetLead_ONIG_REGSET_PRIORITY_TO_REGEX_ORDER
24            }
25        }
26    }
27}
28
/// A `RegSet` allows you to compile multiple regular expressions and search
/// for any of them in a single pass through the text. This is more efficient
/// than searching with each regex individually, but the `RegSet` has to own
/// the compiled regexes.
#[derive(Debug)]
pub struct RegSet {
    /// Raw Oniguruma regset handle; passed to `onig_regset_free` when the `RegSet` is dropped.
    raw: *mut onig_sys::OnigRegSet,
    /// Options used to compile every pattern that is added to, or replaced in, this set.
    options: RegexOptions,
}
37
// The raw regset pointer is held only by this struct and is freed only in `Drop`,
// so transferring the whole `RegSet` to another thread is presumed to be fine.
unsafe impl Send for RegSet {}
// NOTE(review): `captures_with_options` reads the match region back out of the regset
// (via `onig_regset_get_region`) after a search that only takes `&self`, which suggests
// that searching writes into state stored inside the regset. If so, concurrent searches
// on a shared `&RegSet` could race — confirm that `Sync` is actually sound here.
unsafe impl Sync for RegSet {}
40
41impl RegSet {
42    /// Create a new RegSet from a slice of pattern strings
43    ///
44    /// All patterns will be compiled with default Regex options.
45    ///
46    /// # Examples
47    ///
48    /// ```rust
49    /// use onig::RegSet;
50    ///
51    /// let set = RegSet::new(&[r"\d+", r"[a-z]+", r"[A-Z]+"]).unwrap();
52    /// ```
53    pub fn new(patterns: &[&str]) -> Result<RegSet, Error> {
54        Self::with_options(patterns, RegexOptions::REGEX_OPTION_NONE)
55    }
56
57    /// Create a new RegSet from a slice of pattern strings with specified options
58    ///
59    /// All patterns will be compiled with the specified Regex options.
60    ///
61    /// # Examples
62    ///
63    /// ```rust
64    /// use onig::{RegSet, RegexOptions};
65    ///
66    /// let set = RegSet::with_options(&[r"\d+", r"[a-z]+"], RegexOptions::REGEX_OPTION_CAPTURE_GROUP).unwrap();
67    /// ```
68    pub fn with_options(patterns: &[&str], options: RegexOptions) -> Result<RegSet, Error> {
69        let mut regset = Self::empty_with_options(options)?;
70
71        for pat in patterns {
72            regset.add_pattern(pat)?;
73        }
74
75        Ok(regset)
76    }
77
78    /// Create an empty RegSet
79    ///
80    /// Creates a new empty RegSet that contains no regular expressions.
81    /// Patterns can be added later using the `add_pattern` method.
82    ///
83    /// # Examples
84    ///
85    /// ```rust
86    /// use onig::RegSet;
87    ///
88    /// let empty_set = RegSet::empty().unwrap();
89    /// assert_eq!(empty_set.len(), 0);
90    /// assert!(empty_set.is_empty());
91    /// ```
92    pub fn empty() -> Result<RegSet, Error> {
93        Self::empty_with_options(RegexOptions::REGEX_OPTION_NONE)
94    }
95
96    /// Create an empty RegSet with specified options
97    ///
98    /// Creates a new empty RegSet that contains no regular expressions.
99    /// Patterns added later will use the specified options.
100    ///
101    /// # Examples
102    ///
103    /// ```rust
104    /// use onig::{RegSet, RegexOptions};
105    ///
106    /// let empty_set = RegSet::empty_with_options(RegexOptions::REGEX_OPTION_CAPTURE_GROUP).unwrap();
107    /// assert_eq!(empty_set.len(), 0);
108    /// assert!(empty_set.is_empty());
109    /// ```
110    pub fn empty_with_options(options: RegexOptions) -> Result<RegSet, Error> {
111        let mut raw_set: *mut onig_sys::OnigRegSet = null_mut();
112        let raw_set_ptr = &mut raw_set as *mut *mut onig_sys::OnigRegSet;
113
114        let err = unsafe { onig_sys::onig_regset_new(raw_set_ptr, 0, null_mut()) };
115
116        if err != onig_sys::ONIG_NORMAL as i32 {
117            return Err(Error::from_code(err));
118        }
119
120        if raw_set.is_null() {
121            return Err(Error::custom("Failed to create RegSet"));
122        }
123
124        Ok(RegSet {
125            raw: raw_set,
126            options,
127        })
128    }
129
130    /// Adds a new compiled regex pattern to the end of the RegSet.
131    ///
132    /// # Examples
133    ///
134    /// ```rust
135    /// use onig::RegSet;
136    ///
137    /// let mut set = RegSet::empty().unwrap();
138    /// let idx = set.add_pattern(r"\d+").unwrap();
139    /// assert_eq!(idx, 0);
140    /// assert_eq!(set.len(), 1);
141    ///
142    /// let idx2 = set.add_pattern(r"[a-z]+").unwrap();
143    /// assert_eq!(idx2, 1);
144    /// assert_eq!(set.len(), 2);
145    /// ```
146    pub fn add_pattern(&mut self, pattern: &str) -> Result<usize, Error> {
147        // Compile the new regex using stored options
148        let new_regex = Regex::with_options(pattern, self.options, crate::Syntax::default())?;
149
150        // Get the current length (this will be the index of the new pattern)
151        let new_index = self.len();
152
153        // Add the regex to the regset
154        let err = unsafe { onig_sys::onig_regset_add(self.raw, new_regex.as_raw()) };
155
156        if err != onig_sys::ONIG_NORMAL as i32 {
157            return Err(Error::from_code(err));
158        }
159
160        // Transfer ownership of the regex to the regset
161        std::mem::forget(new_regex);
162
163        Ok(new_index)
164    }
165
166    /// Replace a regex pattern at the specified index
167    ///
168    /// # Examples
169    ///
170    /// ```rust
171    /// use onig::RegSet;
172    ///
173    /// let mut set = RegSet::new(&[r"\d+", r"[a-z]+"]).unwrap();
174    /// set.replace_pattern(0, r"[A-Z]+").unwrap();
175    ///
176    /// assert!(set.find("123").is_none());
177    /// assert!(set.find("ABC").is_some());
178    /// ```
179    pub fn replace_pattern(&mut self, index: usize, pattern: &str) -> Result<(), Error> {
180        let regset_len = self.len();
181        if index >= regset_len {
182            return Err(Error::custom(format!(
183                "Index {} is out of bounds for RegSet with {} regexes",
184                index, regset_len
185            )));
186        }
187
188        let new_regex = Regex::with_options(pattern, self.options, crate::Syntax::default())?;
189
190        // Replace the regex in the regset
191        let err =
192            unsafe { onig_sys::onig_regset_replace(self.raw, index as c_int, new_regex.as_raw()) };
193
194        if err != onig_sys::ONIG_NORMAL as i32 {
195            return Err(Error::from_code(err));
196        }
197
198        // Transfer ownership of the regex to the regset
199        std::mem::forget(new_regex);
200
201        Ok(())
202    }
203
204    /// Returns the number of regexes in the set
205    pub fn len(&self) -> usize {
206        unsafe { onig_sys::onig_regset_number_of_regex(self.raw) as usize }
207    }
208
209    /// Returns true if the RegSet contains no regexes
210    pub fn is_empty(&self) -> bool {
211        self.len() == 0
212    }
213
214    /// Find the first match of any regex in the set
215    ///
216    /// Returns a tuple of `(regex_index, match_position)` if a match is found,
217    /// or `None` if no match is found.
218    /// # Examples
219    ///
220    /// ```rust
221    /// use onig::RegSet;
222    ///
223    /// let set = RegSet::new(&[r"\d+", r"[a-z]+"]).unwrap();
224    /// if let Some((regex_index, pos)) = set.find("hello123") {
225    ///     println!("Regex {} matched at position {}", regex_index, pos);
226    /// }
227    /// ```
228    pub fn find(&self, text: &str) -> Option<(usize, usize)> {
229        self.find_with_options(
230            text,
231            RegSetLead::Position,
232            SearchOptions::SEARCH_OPTION_NONE,
233        )
234    }
235
236    /// Find the first match of any regex in the set with custom options
237    ///
238    /// # Examples
239    ///
240    /// ```rust
241    /// use onig::{RegSet, RegSetLead, SearchOptions};
242    ///
243    /// let set = RegSet::new(&[r"\d+", r"[a-z]+"]).unwrap();
244    /// if let Some((regex_index, pos)) = set.find_with_options(
245    ///     "hello123",
246    ///     RegSetLead::Regex,
247    ///     SearchOptions::SEARCH_OPTION_NONE
248    /// ) {
249    ///     println!("Regex {} matched at position {}", regex_index, pos);
250    /// }
251    /// ```
252    pub fn find_with_options(
253        &self,
254        text: &str,
255        lead: RegSetLead,
256        options: SearchOptions,
257    ) -> Option<(usize, usize)> {
258        self.search_with_encoding(text, 0, text.len(), lead, options)
259    }
260
261    /// Find the first match of any regex in the set with full capture group information
262    ///
263    /// Returns a tuple of `(regex_index, captures)` if a match is found,
264    /// or `None` if no match is found.
265    ///
266    /// # Examples
267    ///
268    /// ```rust
269    /// use onig::RegSet;
270    ///
271    /// let set = RegSet::new(&[r"(\d+)", r"([a-z]+)"]).unwrap();
272    /// if let Some((regex_index, captures)) = set.captures("hello123") {
273    ///     println!("Regex {} matched", regex_index);
274    ///     println!("Full match: {:?}", captures.at(0));
275    ///     println!("First capture group: {:?}", captures.at(1));
276    /// }
277    /// ```
278    pub fn captures<'t>(&self, text: &'t str) -> Option<(usize, Captures<'t>)> {
279        self.captures_with_options(
280            text,
281            0,
282            text.len(),
283            RegSetLead::Position,
284            SearchOptions::SEARCH_OPTION_NONE,
285        )
286    }
287
288    /// Find the first match with full capture group information and encoding support
289    ///
290    /// Returns a tuple of `(regex_index, captures)` if a match is found,
291    /// or `None` if no match is found.
292    ///
293    /// # Examples
294    ///
295    /// ```rust
296    /// use onig::{RegSet, RegSetLead, SearchOptions, EncodedBytes};
297    ///
298    /// let set = RegSet::new(&[r"(\d+)", r"([a-z]+)"]).unwrap();
299    /// if let Some((regex_index, captures)) = set.captures_with_options(
300    ///     "hello123",
301    ///     0,
302    ///     8,
303    ///     RegSetLead::Position,
304    ///     SearchOptions::SEARCH_OPTION_NONE
305    /// ) {
306    ///     println!("Regex {} matched", regex_index);
307    ///     println!("Full match: {:?}", captures.at(0));
308    ///     println!("First capture group: {:?}", captures.at(1));
309    /// }
310    /// ```
311    pub fn captures_with_options<'t>(
312        &self,
313        text: &'t str,
314        from: usize,
315        to: usize,
316        lead: RegSetLead,
317        options: SearchOptions,
318    ) -> Option<(usize, Captures<'t>)> {
319        if let Some((regex_index, match_pos)) =
320            self.do_search_with_encoding(&text, from, to, lead, options)
321        {
322            let region_ptr =
323                unsafe { onig_sys::onig_regset_get_region(self.raw, regex_index as c_int) };
324
325            if !region_ptr.is_null() {
326                // Pre-allocate region with reasonable capacity
327                // Most regexes have < 10 capture groups and it's not worth adding an option for that
328                // for RegSet
329                let mut region = Region::with_capacity(10);
330                unsafe {
331                    onig_sys::onig_region_copy(&mut region.raw, region_ptr);
332                }
333
334                let captures = Captures::new(text, region, match_pos);
335                return Some((regex_index, captures));
336            }
337        }
338        None
339    }
340
341    fn do_search_with_encoding<T>(
342        &self,
343        chars: &T,
344        from: usize,
345        to: usize,
346        lead: RegSetLead,
347        options: SearchOptions,
348    ) -> Option<(usize, usize)>
349    where
350        T: EncodedChars,
351    {
352        if from > chars.len() || to > chars.len() || from > to {
353            return None;
354        }
355
356        let mut rmatch_pos: c_int = 0;
357        let rmatch_pos_ptr = &mut rmatch_pos as *mut c_int;
358
359        let (beg, end) = (chars.start_ptr(), chars.limit_ptr());
360
361        let result = unsafe {
362            let start = beg.add(from);
363            let range = beg.add(to);
364
365            onig_sys::onig_regset_search(
366                self.raw,
367                beg,
368                end,
369                start,
370                range,
371                lead.to_onig_lead(),
372                options.bits(),
373                rmatch_pos_ptr,
374            )
375        };
376
377        if result >= 0 {
378            Some((result as usize, rmatch_pos as usize))
379        } else {
380            None
381        }
382    }
383
384    fn search_with_encoding<T>(
385        &self,
386        chars: T,
387        from: usize,
388        to: usize,
389        lead: RegSetLead,
390        options: SearchOptions,
391    ) -> Option<(usize, usize)>
392    where
393        T: EncodedChars,
394    {
395        self.do_search_with_encoding(&chars, from, to, lead, options)
396    }
397}
398
399impl Drop for RegSet {
400    fn drop(&mut self) {
401        unsafe {
402            onig_sys::onig_regset_free(self.raw);
403        }
404    }
405}
406
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty pattern slice produces an empty set.
    #[test]
    fn test_regset_empty_patterns() {
        let empty = RegSet::new(&[]).unwrap();
        assert!(empty.is_empty());
        assert_eq!(empty.len(), 0);
    }

    /// A single pattern produces a set of length one.
    #[test]
    fn test_regset_new() {
        let single = RegSet::new(&[r"\d+"]).unwrap();
        assert!(!single.is_empty());
        assert_eq!(single.len(), 1);
    }

    /// Both the position and regex lead modes find a match; unmatched text yields `None`.
    #[test]
    fn test_regset_find_with_options() {
        let set = RegSet::new(&[r"\d+", r"[a-z]+"]).unwrap();
        let no_options = SearchOptions::SEARCH_OPTION_NONE;

        let by_position = set.find_with_options("hello123", RegSetLead::Position, no_options);
        assert!(by_position.is_some());

        let by_regex = set.find_with_options("hello123", RegSetLead::Regex, no_options);
        assert!(by_regex.is_some());

        let unmatched = set.find_with_options("!@#$%", RegSetLead::Regex, no_options);
        assert!(unmatched.is_none());
    }

    /// Captures report the matching regex index together with its groups.
    #[test]
    fn test_regset_captures() {
        let set = RegSet::new(&[r"(\d+)-(\d+)", r"([a-z]+)"]).unwrap();

        let (word_index, word_caps) = set
            .captures("hello123")
            .expect("Expected to find a match");
        assert_eq!(word_index, 1); // "[a-z]+" matches first by position
        assert_eq!(word_caps.at(0), Some("hello"));
        assert_eq!(word_caps.pos(0), Some((0, 5)));

        let (range_index, range_caps) = set
            .captures("123-456")
            .expect("Expected to find a match");
        assert_eq!(range_index, 0); // First pattern with groups
        assert_eq!(range_caps.len(), 3); // Full match + 2 groups
        assert_eq!(range_caps.at(0), Some("123-456"));
        assert_eq!(range_caps.at(1), Some("123"));
        assert_eq!(range_caps.at(2), Some("456"));

        assert!(set.captures("!@#$%").is_none());
    }

    /// Replacing a pattern changes what matches, keeps the length, and rejects bad indices.
    #[test]
    fn test_regset_replace_pattern() {
        let mut set = RegSet::new(&[r"\d+", r"[a-z]+"]).unwrap();
        assert!(set.find("123").is_some());

        set.replace_pattern(0, r"[A-Z]+").unwrap();
        let out_of_bounds = set.replace_pattern(100, r"[A-Z]+");
        assert!(out_of_bounds.is_err());

        assert!(set.find("123").is_none());
        assert!(set.find("ABC").is_some());
        assert!(set.find("hello").is_some());
        assert_eq!(set.len(), 2);
    }

    /// Added patterns receive consecutive indices and take part in later searches.
    #[test]
    fn test_regset_add_pattern() {
        let mut set = RegSet::empty().unwrap();

        let digits_index = set.add_pattern(r"\d+").unwrap();
        assert_eq!(digits_index, 0);
        assert_eq!(set.len(), 1);
        assert_eq!(set.find("hello123"), Some((0, 5)));

        let letters_index = set.add_pattern(r"[a-z]+").unwrap();
        assert_eq!(letters_index, 1);
        assert_eq!(set.len(), 2);
        assert_eq!(set.find("hello123"), Some((1, 0)));
    }

    /// Capture groups work for a pattern added after construction.
    #[test]
    fn test_regset_add_pattern_captures() {
        let mut set = RegSet::empty().unwrap();
        set.add_pattern(r"(\d{4})-(\d{2})-(\d{2})").unwrap();

        let (index, date) = set.captures("2023-12-25").unwrap();
        assert_eq!(index, 0);
        assert_eq!(date.at(1), Some("2023"));
        assert_eq!(date.at(2), Some("12"));
        assert_eq!(date.at(3), Some("25"));
    }

    /// Invalid patterns and out-of-range replacements fail without corrupting the set.
    #[test]
    fn test_regset_add_pattern_errors() {
        let mut set = RegSet::empty().unwrap();

        let bad_pattern = set.add_pattern(r"[");
        assert!(bad_pattern.is_err());
        assert_eq!(set.len(), 0);

        let replace_in_empty = set.replace_pattern(0, r"\d+");
        assert!(replace_in_empty.is_err());

        set.add_pattern(r"\d+").unwrap();
        assert_eq!(set.len(), 1);
    }

    /// Explicit bounds and options behave like the convenience `captures` call.
    #[test]
    fn test_regset_captures_with_options() {
        let set = RegSet::new(&[r"(\d+)", r"([a-z]+)"]).unwrap();

        let (regex_index, captures) = set
            .captures_with_options(
                "hello123",
                0,
                8,
                RegSetLead::Position,
                SearchOptions::SEARCH_OPTION_NONE,
            )
            .expect("Expected to find a match");

        assert_eq!(regex_index, 1); // "[a-z]+" matches first by position
        assert_eq!(captures.at(0), Some("hello"));
        assert_eq!(captures.at(1), Some("hello"));
    }
}
548}