unicode_casefold/
lib.rs

//! Iterators for case folding text. Provides "simple" and "full" algorithms,
//! with Turkic language options on both.
//!
//! See [this W3C article][1] for how case folding differs from `.to_lowercase()`.
//!
//! [1]: https://www.w3.org/International/wiki/Case_folding
7
8use std::iter::{self, Once};
9use std::str::Chars;
10
11mod tables;
12use tables::{Buffer, COMMON_TABLE, FULL_TABLE, SIMPLE_TABLE};
13
14pub use tables::UNICODE_VERSION;
15
/// Selects the language-specific treatment of the letter `I` while folding.
///
/// The default is `Locale::NonTurkic`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Locale {
    /// Language-neutral folding: `I` and `İ` (U+0130) get no special handling.
    NonTurkic,
    /// Turkic folding: `I` folds to dotless `ı` (U+0131) and `İ` (U+0130)
    /// folds to `i`.
    Turkic,
}

impl Default for Locale {
    /// Returns `Locale::NonTurkic`.
    fn default() -> Self {
        Locale::NonTurkic
    }
}
25
/// Selects which case folding algorithm is applied.
///
/// The default is `Variant::Full`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Variant {
    /// Full folding: one character may fold to several characters
    /// (for example `ß` folds to `ss`).
    Full,
    /// Simple folding: every character folds to exactly one character.
    Simple,
}

impl Default for Variant {
    /// Returns `Variant::Full`.
    fn default() -> Self {
        Variant::Full
    }
}
35
36/// An iterator over case folded characters.
37#[derive(Copy, Clone, Debug)]
38pub struct CaseFold<I> {
39    inner: I,
40    buffer: Buffer,
41    variant: Variant,
42    locale: Locale,
43}
44
45impl<I: Iterator<Item=char>> CaseFold<I> {
46    fn run(&mut self, c: char) -> char {
47        if self.locale == Locale::Turkic && c == 'I' {
48            '\u{131}'
49        } else if self.locale == Locale::Turkic && c == '\u{130}' {
50            'i'
51        } else {
52            if let Ok(i) = COMMON_TABLE.binary_search_by_key(&c, |x| x.0) {
53                COMMON_TABLE[i].1
54            } else if self.variant == Variant::Full {
55                if let Ok(i) = FULL_TABLE.binary_search_by_key(&c, |x| x.0) {
56                    let (r, b) = FULL_TABLE[i].1;
57                    self.buffer = b;
58                    r
59                } else { c }
60            } else {
61                if let Ok(i) = SIMPLE_TABLE.binary_search_by_key(&c, |x| x.0) {
62                    SIMPLE_TABLE[i].1
63                } else { c }
64            }
65        }
66    }
67}
68
69impl<I: Iterator<Item=char>> Iterator for CaseFold<I> {
70    type Item = char;
71    fn next(&mut self) -> Option<char> {
72        match self.buffer {
73            Buffer::Zero => {
74                if let Some(c) = self.inner.next() {
75                    Some(self.run(c))
76                } else {
77                    None
78                }
79            },
80            Buffer::One(a) => {
81                self.buffer = Buffer::Zero;
82                Some(a)
83            },
84            Buffer::Two(a, b) => {
85                self.buffer = Buffer::One(b);
86                Some(a)
87            },
88        }
89    }
90
91    fn size_hint(&self) -> (usize, Option<usize>) {
92        let extra = match self.buffer {
93            Buffer::Zero => 0,
94            Buffer::One(..) => 1,
95            Buffer::Two(..) => 2,
96        };
97        let (lo, hi) = self.inner.size_hint();
98        let lo = lo.saturating_add(extra);
99        let hi = hi.and_then(|hi| match self.variant {
100            Variant::Full => hi.checked_mul(3),
101            Variant::Simple => Some(hi),
102        }).and_then(|hi| hi.checked_add(extra));
103        (lo, hi)
104    }
105}
106
107/// Methods for case folding text.
108pub trait UnicodeCaseFold<I: Iterator<Item=char>>: Sized {
109    /// Returns an iterator over the case folded characters of `self`.
110    ///
111    /// This is a convenient shorthand for
112    /// `.case_fold(Variant::Full, Locale::NonTurkic)`.
113    ///
114    /// # Examples
115    ///
116    /// ```rust
117    /// # use unicode_casefold::{Locale, Variant, UnicodeCaseFold};
118    /// let s = "Alan Turing".case_fold().collect::<String>();
119    /// assert_eq!(s, "alan turing");
120    /// ```
121    fn case_fold(self) -> CaseFold<I> {
122        self.case_fold_with(Default::default(), Default::default())
123    }
124
125    /// Returns an iterator over the case folded characters of `self`.
126    ///
127    /// # Parameters
128    ///
129    /// The `Variant` can be either:
130    ///
131    /// * `Variant::Full` (recommended), which may expand to a longer string.
132    ///   For example, the full case folded version of `ß` (one character) is
133    ///   `ss` (two characters).
134    ///
135    /// * `Variant::Simple`, a simpler variant which always expands to a string
136    ///   with the same number of characters. This is more efficient, but less
137    ///   complete.
138    ///
139    /// The `Locale` can be either:
140    ///
141    /// * `Locale::NonTurkic` (default), which maps `I` to `i`.
142    ///
143    /// * `Locale::Turkic`, which maps `I` to `ı` (dotless i), as is the case
144    ///   in Turkic languages.
145    ///
146    /// # Examples
147    ///
148    /// ```rust
149    /// # use unicode_casefold::{Locale, Variant, UnicodeCaseFold};
150    /// let name = "Inigo Montoya";
151    /// let turkic = name.case_fold_with(Variant::Full, Locale::Turkic).collect::<String>();
152    /// let non_turkic = name.case_fold_with(Variant::Full, Locale::NonTurkic).collect::<String>();
153    /// assert_eq!(turkic, "ınigo montoya");  // note the dotless i
154    /// assert_eq!(non_turkic, "inigo montoya");
155    /// ```
156    fn case_fold_with(self, Variant, Locale) -> CaseFold<I>;
157}
158
159impl<I: Iterator<Item=char>> UnicodeCaseFold<I> for I {
160    fn case_fold_with(self, variant: Variant, locale: Locale) -> CaseFold<I> {
161        CaseFold {
162            inner: self,
163            buffer: Buffer::Zero,
164            variant: variant,
165            locale: locale,
166        }
167    }
168}
169
170impl<'a> UnicodeCaseFold<Chars<'a>> for &'a str {
171    fn case_fold_with(self, variant: Variant, locale: Locale) -> CaseFold<Chars<'a>> {
172        CaseFold {
173            inner: self.chars(),
174            buffer: Buffer::Zero,
175            variant: variant,
176            locale: locale,
177        }
178    }
179}
180
181impl UnicodeCaseFold<Once<char>> for char {
182    fn case_fold_with(self, variant: Variant, locale: Locale) -> CaseFold<Once<char>> {
183        CaseFold {
184            inner: iter::once(self),
185            buffer: Buffer::Zero,
186            variant: variant,
187            locale: locale,
188        }
189    }
190}
191
#[cfg(test)]
mod test {
    // `super::` resolves in every edition, whereas the bare
    // `use {Locale, ...}` form is only accepted by the 2015 edition.
    use super::{Locale, Variant, UnicodeCaseFold};

    /// Case folds `input` with the given options and collects the result.
    fn fold(input: &str, variant: Variant, locale: Locale) -> String {
        input.case_fold_with(variant, locale).collect()
    }

    #[test]
    fn simple() {
        assert_eq!("".case_fold().collect::<String>(), "");
        assert_eq!("AaBbCcDdEe".case_fold().collect::<String>(), "aabbccddee");
    }

    #[test]
    fn turkic() {
        // Capital I, dotless i, dotted capital I, small i.
        let input = "I\u{131}\u{130}i";
        assert_eq!(fold(input, Variant::Full, Locale::NonTurkic), "i\u{131}i\u{307}i");
        assert_eq!(fold(input, Variant::Simple, Locale::NonTurkic), "i\u{131}\u{130}i");
        assert_eq!(fold(input, Variant::Full, Locale::Turkic), "\u{131}\u{131}ii");
        assert_eq!(fold(input, Variant::Simple, Locale::Turkic), "\u{131}\u{131}ii");
    }

    #[test]
    fn no_case() {
        for &s in &["西遊記", "((!))", "サーナイト"] {
            assert_eq!(s.case_fold().collect::<String>(), s);
        }
    }

    #[test]
    fn char_input() {
        // Exercises the `char` implementation of the trait.
        assert_eq!('A'.case_fold().collect::<String>(), "a");
        assert_eq!('ß'.case_fold().collect::<String>(), "ss");
    }

    #[test]
    fn size_hint() {
        let mut ss = vec!['ß'].into_iter().case_fold();
        assert_eq!(ss.size_hint(), (1, Some(3)));
        ss.next();
        // One buffered character remains and the inner iterator is exhausted.
        assert_eq!(ss.size_hint(), (1, Some(1)));
    }
}