tokenizers/pre_tokenizers/whitespace.rs

1use regex::Regex;
2
3use crate::tokenizer::{
4    pattern::Invert, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior,
5};
6use crate::utils::macro_rules_attribute;
7
8#[derive(Clone, Debug, PartialEq, Eq)]
9#[macro_rules_attribute(impl_serde_type!)]
10pub struct Whitespace;
11
12impl Default for Whitespace {
13    fn default() -> Self {
14        Self
15    }
16}
17
18impl PreTokenizer for Whitespace {
19    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
20        lazy_static! {
21            static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
22        }
23        let re_ref: &Regex = &RE;
24
25        pretokenized.split(|_, normalized| {
26            normalized.split(Invert(re_ref), SplitDelimiterBehavior::Removed)
27        })
28    }
29}
30
31#[derive(Copy, Clone, Debug, PartialEq, Eq)]
32#[macro_rules_attribute(impl_serde_type!)]
33pub struct WhitespaceSplit;
34
35impl PreTokenizer for WhitespaceSplit {
36    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
37        pretokenized.split(|_, normalized| {
38            normalized.split(char::is_whitespace, SplitDelimiterBehavior::Removed)
39        })
40    }
41}
42
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{OffsetReferential, OffsetType, PreTokenizer};

    /// Runs `pretok` over `input` and collects the resulting
    /// (substring, byte-offset) pairs, owning the substrings so they can
    /// outlive the local `PreTokenizedString`.
    fn run<P: PreTokenizer>(pretok: &P, input: &str) -> Vec<(String, (usize, usize))> {
        let mut pretokenized = PreTokenizedString::from(input);
        pretok.pre_tokenize(&mut pretokenized).unwrap();
        pretokenized
            .get_splits(OffsetReferential::Original, OffsetType::Byte)
            .into_iter()
            .map(|(s, o, _)| (s.to_owned(), o))
            .collect()
    }

    /// Converts a borrowed expectation table into the owned form `run` returns.
    fn owned(expected: &[(&str, (usize, usize))]) -> Vec<(String, (usize, usize))> {
        expected.iter().map(|&(s, o)| (s.to_owned(), o)).collect()
    }

    #[test]
    fn basic() {
        let pretok = Whitespace {};
        assert_eq!(
            run(&pretok, "Hey man!"),
            owned(&[("Hey", (0, 3)), ("man", (4, 7)), ("!", (7, 8))])
        );
        assert_eq!(
            run(&pretok, "How are you doing?"),
            owned(&[
                ("How", (0, 3)),
                ("are", (4, 7)),
                ("you", (8, 11)),
                ("doing", (12, 17)),
                ("?", (17, 18)),
            ])
        );
        // Pure whitespace produces no splits at all.
        assert_eq!(run(&pretok, "\n"), owned(&[]));
    }

    #[test]
    fn whitespace_split() {
        let pretok = WhitespaceSplit;
        assert_eq!(
            run(&pretok, "Hey man!"),
            owned(&[("Hey", (0, 3)), ("man!", (4, 8))])
        );
        // Punctuation stays glued to the preceding word.
        assert_eq!(
            run(&pretok, "Hey, man, Good?"),
            owned(&[("Hey,", (0, 4)), ("man,", (5, 9)), ("Good?", (10, 15))])
        );
    }
}