1use crate::util::result::Result;
2use super::token::Token;
3use regex::Regex;
4
5
6pub trait Splitter {
8 fn split(&self, input: &str, limit: Option<usize>) -> Result<Vec<Token>>;
9}
10
11
12pub struct RegexSplitter {
14 regex: Regex,
15}
16
17
18impl RegexSplitter {
19
20 pub fn new(regex: &str) -> Result<Self> {
21 Ok(Self {
22 regex: Regex::new(regex)?
23 })
24 }
25
26}
27
28impl Default for RegexSplitter {
29 fn default() -> Self {
30 const DEFAULT_REGEX: &str = "\\w+(?:[-_]\\w+)*|\\S";
31 Self::new(DEFAULT_REGEX).unwrap() }
33}
34
35
36impl Splitter for RegexSplitter {
37
38 fn split(&self, input: &str, limit: Option<usize>) -> Result<Vec<Token>> {
39 let mut result = Vec::new();
40 for m in self.regex.find_iter(input) {
41 result.push(Token::new(m.start(), m.end(), m.as_str()));
42 if let Some(limit) = limit {
43 if result.len() >= limit {
44 break
45 }
46 }
47 }
48 Ok(result)
49 }
50
51}
52
53
54
#[cfg(test)]
mod tests {
    #![allow(clippy::unwrap_used)]
    use super::*;

    /// A hyphenated word is kept as one token with the expected offsets.
    #[test]
    fn test_default_regex_splitter() -> Result<()> {
        let tokens = RegexSplitter::default().split("This is an oh-yeah test", None)?;
        assert_eq!(tokens.len(), 5);

        let hyphenated = tokens.get(3).unwrap();
        assert_eq!(hyphenated.start(), 11);
        assert_eq!(hyphenated.end(), 18);
        assert_eq!(hyphenated.text(), "oh-yeah");
        Ok(())
    }

    /// Accented characters count as word characters and stay in one token.
    #[test]
    fn test_unicode() -> Result<()> {
        let text = "Word with accents: éàèèçîù foo bar";
        let tokens = RegexSplitter::default().split(text, None)?;
        assert_eq!(tokens.len(), 7);
        Ok(())
    }

    /// Splitting stops once the requested number of tokens is reached.
    #[test]
    fn test_limit() -> Result<()> {
        let text = "w1 w2 w3 w4 w5 w6 w7 w8 w9 w10";
        let tokens = RegexSplitter::default().split(text, Some(5))?;
        assert_eq!(tokens.len(), 5);

        let last = tokens.get(4).unwrap();
        assert_eq!(last.text(), "w5");
        Ok(())
    }
}