unicode-normalization 0.1.6

This crate provides functions for normalization of Unicode strings, including Canonical and Compatible Decomposition and Recomposition, as described in Unicode Standard Annex #15.
Documentation
use normalize::{
    hangul_decomposition_length,
    is_hangul,
};
use tables;

const MAX_NONSTARTERS: usize = 30;
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';

/// UAX15-D4: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
pub struct StreamSafe<I> {
    iter: I,
    nonstarter_count: usize,
    buffer: Option<char>,
}

impl<I> StreamSafe<I> {
    pub(crate) fn new(iter: I) -> Self {
        Self { iter, nonstarter_count: 0, buffer: None }
    }
}

impl<I: Iterator<Item=char>> Iterator for StreamSafe<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        if let Some(ch) = self.buffer.take() {
            return Some(ch);
        }
        let next_ch = match self.iter.next() {
            None => return None,
            Some(c) => c,
        };
        let d = classify_nonstarters(next_ch);
        if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
            self.buffer = Some(next_ch);
            self.nonstarter_count = 0;
            return Some(COMBINING_GRAPHEME_JOINER);
        }

        // No starters in the decomposition, so keep accumulating
        if d.leading_nonstarters == d.decomposition_len {
            self.nonstarter_count += d.decomposition_len;
        }
        // Otherwise, restart the nonstarter counter.
        else {
            self.nonstarter_count = d.trailing_nonstarters;
        }
        Some(next_ch)
    }
}

#[derive(Debug)]
struct Decomposition {
    leading_nonstarters: usize,
    trailing_nonstarters: usize,
    decomposition_len: usize,
}

#[inline]
fn classify_nonstarters(c: char) -> Decomposition {
    // As usual, fast path for ASCII (which is always a starter)
    if c <= '\x7f' {
        return Decomposition {
            leading_nonstarters: 0,
            trailing_nonstarters: 0,
            decomposition_len: 1,
        }
    }
    // Next, special case Hangul, since it's not handled by our tables.
    if is_hangul(c) {
        return Decomposition {
            leading_nonstarters: 0,
            trailing_nonstarters: 0,
            decomposition_len: hangul_decomposition_length(c),
        };
    }
    let decomp = tables::compatibility_fully_decomposed(c)
        .or_else(|| tables::canonical_fully_decomposed(c));
    match decomp {
        Some(decomp) => {
            Decomposition {
                leading_nonstarters: tables::stream_safe_leading_nonstarters(c),
                trailing_nonstarters: tables::stream_safe_trailing_nonstarters(c),
                decomposition_len: decomp.len(),
            }
        },
        None => {
            let is_nonstarter = tables::canonical_combining_class(c) != 0;
            let nonstarter = if is_nonstarter { 1 } else { 0 };
            Decomposition {
                leading_nonstarters: nonstarter,
                trailing_nonstarters: nonstarter,
                decomposition_len: 1,
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::{
        StreamSafe,
        classify_nonstarters,
    };
    use std::char;
    use normalization_tests::NORMALIZATION_TESTS;
    use normalize::decompose_compatible;
    use tables;

    fn stream_safe(s: &str) -> String {
        StreamSafe::new(s.chars()).collect()
    }

    #[test]
    fn test_normalization_tests_unaffected() {
        for test in NORMALIZATION_TESTS {
            for &s in &[test.source, test.nfc, test.nfd, test.nfkc, test.nfkd] {
                assert_eq!(stream_safe(s), s);
            }
        }
    }

    #[test]
    fn test_simple() {
        let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
        assert_eq!(stream_safe(technically_okay), technically_okay);

        let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        assert_ne!(stream_safe(too_much), too_much);
    }

    #[test]
    fn test_classify_nonstarters() {
        // Highest character in the `compat_fully_decomp` table is 2FA1D
        for ch in 0..0x2FA1E {
            let ch = match char::from_u32(ch) {
                Some(c) => c,
                None => continue,
            };
            let c = classify_nonstarters(ch);
            let mut s = vec![];
            decompose_compatible(ch, |c| s.push(c));

            assert_eq!(s.len(), c.decomposition_len);

            let num_leading = s
                .iter()
                .take_while(|&c| tables::canonical_combining_class(*c) != 0)
                .count();
            let num_trailing = s
                .iter()
                .rev()
                .take_while(|&c| tables::canonical_combining_class(*c) != 0)
                .count();

            assert_eq!(num_leading, c.leading_nonstarters);
            assert_eq!(num_trailing, c.trailing_nonstarters);
        }
    }
}