// gliner/text/splitter.rs

1use crate::util::result::Result;
2use super::token::Token;
3use regex::Regex;
4
5
/// Word-level tokenization.
///
/// Implementors break an input string into a sequence of [`Token`]s,
/// optionally stopping once a maximum number of tokens has been produced.
pub trait Splitter {
    /// Splits `input` into tokens.
    ///
    /// When `limit` is `Some(n)`, at most `n` tokens are expected in the
    /// result (exact cut-off semantics are implementation-specific).
    ///
    /// # Errors
    /// Returns an error if tokenization fails (implementation-specific).
    fn split(&self, input: &str, limit: Option<usize>) -> Result<Vec<Token>>;
}
10
11
/// Word-level tokenization implemented using regular expressions
pub struct RegexSplitter {
    // Compiled pattern; each non-overlapping match becomes one token.
    regex: Regex,
}
16
17
18impl RegexSplitter {
19
20    pub fn new(regex: &str) -> Result<Self> {
21        Ok(Self {
22            regex: Regex::new(regex)?
23        })
24    }
25
26}
27
28impl Default for RegexSplitter {
29    fn default() -> Self {
30        const DEFAULT_REGEX: &str = "\\w+(?:[-_]\\w+)*|\\S";
31        Self::new(DEFAULT_REGEX).unwrap() // safe unwrap (as regex is const and correct)
32    }
33}
34
35
36impl Splitter for RegexSplitter {
37
38    fn split(&self, input: &str, limit: Option<usize>) -> Result<Vec<Token>> {
39        let mut result = Vec::new();
40        for m in self.regex.find_iter(input) {
41            result.push(Token::new(m.start(), m.end(), m.as_str()));
42            if let Some(limit) = limit {
43                if result.len() >= limit {
44                    break
45                }
46            }
47        }
48        Ok(result)
49    }
50
51}
52
53
54
#[cfg(test)]
mod tests {
    #![allow(clippy::unwrap_used)]
    use super::*;

    /// The default pattern keeps hyphen-joined words as a single token
    /// and reports correct byte offsets.
    #[test]
    fn test_default_regex_splitter() -> Result<()> {
        let tokens = RegexSplitter::default().split("This is an oh-yeah test", None)?;
        assert_eq!(tokens.len(), 5);
        let fourth = tokens.get(3).unwrap();
        assert_eq!(fourth.start(), 11);
        assert_eq!(fourth.end(), 18);
        assert_eq!(fourth.text(), "oh-yeah");
        Ok(())
    }

    /// Accented characters count as word characters under `\w`.
    #[test]
    fn test_unicode() -> Result<()> {
        let splitter = RegexSplitter::default();
        let tokens = splitter.split("Word with accents: éàèèçîù foo bar", None)?;
        assert_eq!(tokens.len(), 7);
        Ok(())
    }

    /// A limit caps the number of tokens returned.
    #[test]
    fn test_limit() -> Result<()> {
        let splitter = RegexSplitter::default();
        let tokens = splitter.split("w1 w2 w3 w4 w5 w6 w7 w8 w9 w10", Some(5))?;
        assert_eq!(tokens.len(), 5);
        assert_eq!(tokens.get(4).unwrap().text(), "w5");
        Ok(())
    }
}