bpe_openai/normalizer.rs
1use std::borrow::Cow;
2
3use unicode_normalization::UnicodeNormalization;
4
/// A string that is known to have already been normalized.
///
/// Carrying normalized text in a dedicated type prevents both normalizing the same input
/// more than once and forgetting to normalize it at all. The text is held in a [`Cow`],
/// so an input that needs no rewriting can be wrapped without copying it.
///
/// The common traits are derived so values can be debug-printed, cloned and compared by
/// their normalized text.
///
/// TODO: Annotate the type with the normalization type, once there is more than one.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NormalizedString<'a>(Cow<'a, str>);
10
11impl<'a> NormalizedString<'a> {
12 /// Returns the normalized inner str buffer.
13 pub fn as_str(&self) -> &str {
14 &self.0
15 }
16
17 /// This function is unsafe, since the caller must ensure that the correct normalization
18 /// was used. The normalization may vary by tokenizer. This mostly a backdoor which might
19 /// be handy for certain optimizations or for testing.
20 ///
21 /// # Safety
22 /// This is safe if `s` is in fact correctly normalized already. The caller is
23 /// responsible for ensuring that.
24 pub unsafe fn from_str(s: &'a str) -> NormalizedString<'a> {
25 NormalizedString(Cow::Borrowed(s))
26 }
27}
28
29/// Helper trait which converts string types into NormalizedString.
30/// Calling normalize on a NormalizedString is a no-op.
31pub trait Normalizable<'a> {
32 fn normalize(self, nfc: bool) -> NormalizedString<'a>;
33}
34
35impl<'a> Normalizable<'a> for &'a str {
36 fn normalize(self, nfc: bool) -> NormalizedString<'a> {
37 if nfc {
38 NormalizedString(self.nfc().collect())
39 } else {
40 NormalizedString(Cow::Borrowed(self))
41 }
42 }
43}
44
45impl<'a, T> Normalizable<'a> for &'a T
46where
47 T: AsRef<str>,
48{
49 fn normalize(self, nfc: bool) -> NormalizedString<'a> {
50 self.as_ref().normalize(nfc)
51 }
52}
53
54impl<'a> Normalizable<'a> for NormalizedString<'a> {
55 fn normalize(self, _: bool) -> NormalizedString<'a> {
56 self
57 }
58}