tokenizers/pre_tokenizers/
whitespace.rs1use regex::Regex;
2
3use crate::tokenizer::{
4 pattern::Invert, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior,
5};
6use crate::utils::macro_rules_attribute;
7
8#[derive(Clone, Debug, PartialEq, Eq)]
9#[macro_rules_attribute(impl_serde_type!)]
10pub struct Whitespace;
11
12impl Default for Whitespace {
13 fn default() -> Self {
14 Self
15 }
16}
17
18impl PreTokenizer for Whitespace {
19 fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
20 lazy_static! {
21 static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
22 }
23 let re_ref: &Regex = &RE;
24
25 pretokenized.split(|_, normalized| {
26 normalized.split(Invert(re_ref), SplitDelimiterBehavior::Removed)
27 })
28 }
29}
30
31#[derive(Copy, Clone, Debug, PartialEq, Eq)]
32#[macro_rules_attribute(impl_serde_type!)]
33pub struct WhitespaceSplit;
34
35impl PreTokenizer for WhitespaceSplit {
36 fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
37 pretokenized.split(|_, normalized| {
38 normalized.split(char::is_whitespace, SplitDelimiterBehavior::Removed)
39 })
40 }
41}
42
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{OffsetReferential, OffsetType, PreTokenizer};

    /// Runs `pretok` over `input` and checks that the resulting splits,
    /// expressed as `(text, byte offsets in the original string)`, are
    /// exactly `expected`.
    fn assert_splits<P: PreTokenizer>(
        pretok: &P,
        input: &str,
        expected: &[(&str, (usize, usize))],
    ) {
        let mut pretokenized = PreTokenizedString::from(input);
        pretok.pre_tokenize(&mut pretokenized).unwrap();

        // Drop the third tuple element; only text and offsets are compared.
        let actual: Vec<_> = pretokenized
            .get_splits(OffsetReferential::Original, OffsetType::Byte)
            .into_iter()
            .map(|(text, offsets, _)| (text, offsets))
            .collect();

        assert_eq!(actual.as_slice(), expected);
    }

    #[test]
    fn basic() {
        let pretok = Whitespace;

        // Punctuation is separated from the adjacent word.
        assert_splits(
            &pretok,
            "Hey man!",
            &[("Hey", (0, 3)), ("man", (4, 7)), ("!", (7, 8))],
        );
        assert_splits(
            &pretok,
            "How are you doing?",
            &[
                ("How", (0, 3)),
                ("are", (4, 7)),
                ("you", (8, 11)),
                ("doing", (12, 17)),
                ("?", (17, 18)),
            ],
        );
        // Whitespace-only input yields no splits at all.
        assert_splits(&pretok, "\n", &[]);
    }

    #[test]
    fn whitespace_split() {
        let pretok = WhitespaceSplit;

        // Punctuation stays glued to the neighbouring word.
        assert_splits(&pretok, "Hey man!", &[("Hey", (0, 3)), ("man!", (4, 8))]);
        assert_splits(
            &pretok,
            "Hey, man, Good?",
            &[("Hey,", (0, 4)), ("man,", (5, 9)), ("Good?", (10, 15))],
        );
    }
}