bpe_openai/
normalizer.rs

1use std::borrow::Cow;
2
3use unicode_normalization::UnicodeNormalization;
4
/// A string that has already been passed through normalization.
///
/// Wrapping the text in a dedicated type avoids normalizing the same input several times and
/// makes it impossible to forget the normalization step where a `NormalizedString` is required.
///
/// The inner buffer is a [`Cow`], so the wrapper can either borrow the caller's text or own a
/// freshly built string.
///
/// TODO: Annotate the type with the normalization type, once there is more than one.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NormalizedString<'a>(Cow<'a, str>);
10
11impl<'a> NormalizedString<'a> {
12    /// Returns the normalized inner str buffer.
13    pub fn as_str(&self) -> &str {
14        &self.0
15    }
16
17    /// This function is unsafe, since the caller must ensure that the correct normalization
18    /// was used. The normalization may vary by tokenizer. This mostly a backdoor which might
19    /// be handy for certain optimizations or for testing.
20    ///
21    /// # Safety
22    /// This is safe if `s` is in fact correctly normalized already. The caller is
23    /// responsible for ensuring that.
24    pub unsafe fn from_str(s: &'a str) -> NormalizedString<'a> {
25        NormalizedString(Cow::Borrowed(s))
26    }
27}
28
29/// Helper trait which converts string types into NormalizedString.
30/// Calling normalize on a NormalizedString is a no-op.
31pub trait Normalizable<'a> {
32    fn normalize(self, nfc: bool) -> NormalizedString<'a>;
33}
34
35impl<'a> Normalizable<'a> for &'a str {
36    fn normalize(self, nfc: bool) -> NormalizedString<'a> {
37        if nfc {
38            NormalizedString(self.nfc().collect())
39        } else {
40            NormalizedString(Cow::Borrowed(self))
41        }
42    }
43}
44
45impl<'a, T> Normalizable<'a> for &'a T
46where
47    T: AsRef<str>,
48{
49    fn normalize(self, nfc: bool) -> NormalizedString<'a> {
50        self.as_ref().normalize(nfc)
51    }
52}
53
54impl<'a> Normalizable<'a> for NormalizedString<'a> {
55    fn normalize(self, _: bool) -> NormalizedString<'a> {
56        self
57    }
58}