tokenizers/normalizers/
prepend.rs

1use crate::tokenizer::{NormalizedString, Normalizer, Result};
2use serde::{Deserialize, Serialize};
3
4#[derive(Clone, Debug, Deserialize, Serialize)]
5#[serde(tag = "type")]
6pub struct Prepend {
7    pub prepend: String,
8}
9
10impl Prepend {
11    pub fn new(prepend: String) -> Self {
12        Self { prepend }
13    }
14}
15
16impl Normalizer for Prepend {
17    /// Strip the normalized string inplace
18    fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
19        if !normalized.is_empty() {
20            normalized.prepend(&self.prepend);
21        }
22        Ok(())
23    }
24}
25
#[cfg(test)]
mod tests {
    use super::*;

    /// Prepending "▁" to "Hello" must update both the text and the alignments.
    #[test]
    fn test_prepend() {
        let original = "Hello";
        let normalized = "▁Hello";
        // Sanity check: the normalizer is expected to actually change the text.
        assert_ne!(original, normalized);

        let normalizer = Prepend::new(String::from("▁"));
        let mut n = NormalizedString::from(original);
        normalizer.normalize(&mut n).unwrap();
        assert_eq!(n.get(), normalized);

        // Eight entries for the eight bytes of "▁Hello": "▁" is three bytes in
        // UTF-8, and those three plus the byte for 'H' all point at the first
        // original character.
        let alignments = vec![
            (0, 1),
            (0, 1),
            (0, 1),
            (0, 1),
            (1, 2),
            (2, 3),
            (3, 4),
            (4, 5),
        ];
        let expected =
            NormalizedString::new(original.to_owned(), normalized.to_owned(), alignments, 0);
        assert_eq!(n, expected);

        // Reverse mapping: 'H' covers the inserted prefix plus itself (bytes 0..4).
        let expected_original = vec![(0, 4), (4, 5), (5, 6), (6, 7), (7, 8)];
        assert_eq!(n.alignments_original(), expected_original);
    }
}