tokenizers/pre_tokenizers/
sequence.rs

1use crate::pre_tokenizers::PreTokenizerWrapper;
2use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result};
3use crate::utils::macro_rules_attribute;
4use serde::{Deserialize, Serialize};
5
6#[derive(Clone, Debug, PartialEq)]
7#[macro_rules_attribute(impl_serde_type!)]
8pub struct Sequence {
9    pretokenizers: Vec<PreTokenizerWrapper>,
10}
11
12impl Sequence {
13    pub fn new(pretokenizers: Vec<PreTokenizerWrapper>) -> Self {
14        Self { pretokenizers }
15    }
16
17    pub fn get_pre_tokenizers(&self) -> &[PreTokenizerWrapper] {
18        &self.pretokenizers
19    }
20
21    pub fn get_pre_tokenizers_mut(&mut self) -> &mut [PreTokenizerWrapper] {
22        &mut self.pretokenizers
23    }
24}
25
26impl PreTokenizer for Sequence {
27    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
28        for pretokenizer in &self.pretokenizers {
29            pretokenizer.pre_tokenize(pretokenized)?;
30        }
31        Ok(())
32    }
33}
34
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pre_tokenizers::{punctuation::Punctuation, whitespace::WhitespaceSplit};
    use crate::{OffsetReferential, OffsetType};

    /// Chains a whitespace split with a punctuation split and checks the
    /// resulting pieces together with their byte offsets in the original text.
    #[test]
    fn sequence_basic() {
        let sequence = Sequence::new(vec![
            PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit),
            PreTokenizerWrapper::Punctuation(Punctuation::default()),
        ]);

        let mut pretokenized: PreTokenizedString = "Hey friend!     How are you?!?".into();
        sequence.pre_tokenize(&mut pretokenized).unwrap();

        // Keep only the text and offsets of each split; the third tuple
        // element is not part of this check.
        let splits: Vec<_> = pretokenized
            .get_splits(OffsetReferential::Original, OffsetType::Byte)
            .into_iter()
            .map(|(text, offsets, _)| (text, offsets))
            .collect();

        let expected = vec![
            ("Hey", (0, 3)),
            ("friend", (4, 10)),
            ("!", (10, 11)),
            ("How", (16, 19)),
            ("are", (20, 23)),
            ("you", (24, 27)),
            ("?", (27, 28)),
            ("!", (28, 29)),
            ("?", (29, 30)),
        ];

        assert_eq!(splits, expected);
    }
}