tokenizers/utils/
onig.rs

1use crate::tokenizer::pattern::Pattern;
2use crate::{Offsets, Result};
3use onig::Regex;
4use std::error::Error;
5
/// Wrapper that owns a compiled [`onig::Regex`].
///
/// Instances are created with `SysRegex::new` and searched with
/// `SysRegex::find_iter`.
#[derive(Debug)]
pub struct SysRegex {
    // The compiled pattern; kept private so callers go through the wrapper's methods.
    regex: Regex,
}
10
11impl SysRegex {
12    pub fn find_iter<'r, 't>(&'r self, inside: &'t str) -> onig::FindMatches<'r, 't> {
13        self.regex.find_iter(inside)
14    }
15
16    pub fn new(
17        regex_str: &str,
18    ) -> std::result::Result<Self, Box<dyn Error + Send + Sync + 'static>> {
19        Ok(Self {
20            regex: Regex::new(regex_str)?,
21        })
22    }
23}
24
25impl Pattern for &Regex {
26    fn find_matches(&self, inside: &str) -> Result<Vec<(Offsets, bool)>> {
27        if inside.is_empty() {
28            return Ok(vec![((0, 0), false)]);
29        }
30
31        let mut prev = 0;
32        let mut splits = Vec::with_capacity(inside.len());
33        for (start, end) in self.find_iter(inside) {
34            if prev != start {
35                splits.push(((prev, start), false));
36            }
37            splits.push(((start, end), true));
38            prev = end;
39        }
40        if prev != inside.len() {
41            splits.push(((prev, inside.len()), false))
42        }
43        Ok(splits)
44    }
45}