//! pinyin-parser 0.1.7
//!
//! Parses a string of pinyin syllables. Covers marginal cases such as `ẑ`, `ŋ` and `ê`.
#![warn(clippy::pedantic, clippy::nursery)]
#![allow(clippy::non_ascii_literal)]

#[cfg(test)]
mod tests;
use unicode_segmentation::UnicodeSegmentation;

/// Configuration for parsing a pinyin string; build one with [`PinyinParser::new`] and the
/// chained setter methods, then call [`PinyinParser::parse`].
///
/// The value is `Copy`, so every setter takes `self` by value and returns the updated copy.
#[derive(Copy, Clone, PartialEq, Eq, Debug, Hash)]
// The four flags are independent on/off switches, each with its own setter method.
#[allow(clippy::struct_excessive_bools)]
pub struct PinyinParser {
    // Strict mode: forwarded to the tokenizer and to the syllable printer, and enables the
    // "apostrophe must precede a/e/o" check in the iterator.
    p_strict: bool,
    // When true, punctuation tokens are yielded by the iterator instead of being skipped.
    p_preserve_punctuations: bool,
    // When true, space tokens are yielded by the iterator instead of being skipped.
    p_preserve_spaces: bool,
    // When true, tokens classified as "others" are yielded instead of being skipped.
    p_preserve_miscellaneous: bool,
}

/// The default configuration is the one produced by [`PinyinParser::new`].
impl Default for PinyinParser {
    /// Delegates to [`PinyinParser::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl PinyinParser {
    /// Creates a configuration with every option switched off: loose (non-strict) mode, and
    /// spaces, punctuation and miscellaneous characters are all dropped from the output.
    #[must_use]
    pub const fn new() -> Self {
        Self {
            p_strict: false,
            p_preserve_punctuations: false,
            p_preserve_spaces: false,
            p_preserve_miscellaneous: false,
        }
    }

    /// Returns this configuration with strict mode set to `b`.
    ///
    /// See [`PinyinParser::strict`] for what strict mode forbids.
    #[must_use]
    pub const fn is_strict(mut self, b: bool) -> Self {
        self.p_strict = b;
        self
    }

    /// Returns this configuration with space preservation set to `b`.
    ///
    /// When enabled, whitespace between syllables is yielded by the iterator instead of
    /// being skipped.
    #[must_use]
    pub const fn preserve_spaces(mut self, b: bool) -> Self {
        self.p_preserve_spaces = b;
        self
    }

    /// Returns this configuration with punctuation preservation set to `b`.
    ///
    /// When enabled, punctuation is yielded by the iterator instead of being skipped.
    #[must_use]
    pub const fn preserve_punctuations(mut self, b: bool) -> Self {
        self.p_preserve_punctuations = b;
        self
    }

    /// Returns this configuration with preservation of miscellaneous characters set to `b`.
    ///
    /// When enabled, characters that the tokenizer classifies as neither pinyin letters,
    /// spaces nor punctuation are yielded by the iterator instead of being skipped.
    ///
    /// # Examples
    ///
    /// ```
    /// use pinyin_parser::PinyinParser;
    /// let parser = PinyinParser::new()
    ///     .is_strict(true)
    ///     .preserve_miscellaneous(true);
    /// assert_eq!(
    ///     parser
    ///         .parse("你Nǐ 好hǎo")
    ///         .into_iter()
    ///         .collect::<Vec<_>>(),
    ///     vec!["你", "nǐ", "好", "hǎo"]
    /// )
    /// ```
    #[must_use]
    pub const fn preserve_miscellaneous(mut self, b: bool) -> Self {
        self.p_preserve_miscellaneous = b;
        self
    }

    /// Tokenizes `s` grapheme by grapheme and returns an iterator over the parsed pieces.
    ///
    /// Tokenizing happens eagerly here; splitting the tokens into syllables happens lazily
    /// as the returned iterator is advanced.
    ///
    /// # Panics
    ///
    /// The returned iterator panics when it meets input it cannot parse, for example an
    /// apostrophe at the beginning of a word or a string that ends right after an initial.
    ///
    /// # Examples
    ///
    /// ```
    /// use pinyin_parser::PinyinParser;
    /// let parser = PinyinParser::new()
    ///     .is_strict(true)
    ///     .preserve_punctuations(true)
    ///     .preserve_spaces(true);
    /// assert_eq!(
    ///     parser
    ///         .parse("Nǐ zuò shénme?")
    ///         .into_iter()
    ///         .collect::<Vec<_>>(),
    ///     vec!["nǐ", " ", "zuò", " ", "shén", "me", "?"]
    /// )
    /// ```
    #[must_use]
    pub fn parse(self, s: &str) -> PinyinParserIter {
        let strict = self.p_strict;
        let tokens: Vec<_> = UnicodeSegmentation::graphemes(s, true)
            .map(|grapheme| pinyin_token::to_token(grapheme, strict))
            .collect();

        PinyinParserIter {
            configs: self,
            it: VecAndIndex {
                vec: tokens,
                next_pos: 0,
            },
            state: ParserState::BeforeWordInitial,
        }
    }

    /// Parses `s` in strict mode; shorthand for `PinyinParser::new().is_strict(true).parse(s)`.
    ///
    /// Strict mode:
    /// * forbids the use of breve instead of hacek to represent the third tone
    /// * forbids the use of IPA `ɡ` (U+0261) instead of `g`, and other such lookalike characters
    /// * allows apostrophes only before an `a`, an `e` or an `o`
    ///
    /// # Examples
    ///
    /// ```
    /// use pinyin_parser::PinyinParser;
    /// assert_eq!(
    ///     PinyinParser::strict("jīntiān")
    ///         .into_iter()
    ///         .collect::<Vec<_>>(),
    ///     vec!["jīn", "tiān"]
    /// );
    /// ```
    ///
    /// A lookalike letter is rejected:
    ///
    /// ```should_panic
    /// use pinyin_parser::PinyinParser;
    /// assert_eq!(
    ///     PinyinParser::strict("zǒnɡshì") // this `ɡ` is not the `g` from ASCII
    ///         .into_iter()
    ///         .collect::<Vec<_>>(),
    ///     vec!["zǒng", "shì"]
    /// );
    /// ```
    ///
    /// So is an apostrophe in front of a consonant:
    ///
    /// ```should_panic
    /// use pinyin_parser::PinyinParser;
    /// assert_eq!(
    ///     // An apostrophe can come only before an `a`, an `e` or an `o` in strict mode
    ///     PinyinParser::strict("Yīng'guó")
    ///         .into_iter()
    ///         .collect::<Vec<_>>(),
    ///     vec!["yīng", "guó"]
    /// );
    /// ```
    ///
    /// This parser supports the use of `ẑ`, `ĉ`, `ŝ` and `ŋ`, though I have never seen anyone use it.
    ///
    /// ```
    /// use pinyin_parser::PinyinParser;
    /// assert_eq!(
    ///     PinyinParser::strict("Ẑāŋ").into_iter().collect::<Vec<_>>(),
    ///     vec!["zhāng"]
    /// )
    /// ```
    #[must_use]
    pub fn strict(s: &str) -> PinyinParserIter {
        let config = Self::new().is_strict(true);
        config.parse(s)
    }

    /// Parses `s` in loose (non-strict) mode; shorthand for `PinyinParser::new().parse(s)`.
    ///
    /// # Examples
    ///
    /// ```
    /// use pinyin_parser::PinyinParser;
    /// assert_eq!(
    ///     // 'ă' is LATIN SMALL LETTER A WITH BREVE and is not accepted in strict mode.
    ///     // The correct alphabet to use is 'ǎ'.
    ///     PinyinParser::loose("mián'ăo")
    ///         .into_iter()
    ///         .collect::<Vec<_>>(),
    ///     vec!["mián", "ǎo"]
    /// );
    /// ```
    ///
    /// ```
    /// use pinyin_parser::PinyinParser;
    /// assert_eq!(
    ///     // An apostrophe can come only before an `a`, an `e` or an `o` in strict mode,
    ///     // but allowed here because it's loose
    ///     PinyinParser::loose("Yīng'guó")
    ///         .into_iter()
    ///         .collect::<Vec<_>>(),
    ///     vec!["yīng", "guó"]
    /// );
    /// ```
    #[must_use]
    pub fn loose(s: &str) -> PinyinParserIter {
        let config = Self::new();
        config.parse(s)
    }
}

mod pinyin_token;

/// A vector paired with a read cursor, so the parser can look ahead and step backwards.
struct VecAndIndex<T> {
    /// The items being traversed.
    vec: Vec<T>,
    /// Index of the item the next read will return; it may run past the end of `vec`.
    next_pos: usize,
}

/// Iterator returned by [`PinyinParser::parse`]; yields the parsed pieces as `String`s.
pub struct PinyinParserIter {
    // The options the parse was started with.
    configs: PinyinParser,
    // The tokenized input together with the read cursor.
    it: VecAndIndex<pinyin_token::PinyinToken>,
    // Where in a syllable the parser currently is.
    state: ParserState,
}

/// Position of the parser relative to the syllable it is currently reading.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum ParserState {
    /// Nothing of the current syllable has been read yet. This is the starting state, and
    /// the state entered after a preserved space, punctuation or miscellaneous token is yielded.
    BeforeWordInitial,
    /// The initial has been recognised; the final is expected next.
    InitialParsed(SpellingInitial),
    /// A bare `z`, `c` or `s` has been read; the next letter decides whether it is
    /// `zh`/`ch`/`sh` or the plain initial.
    ZCSParsed(ZCS),
    /// A syllable has just been yielded. An apostrophe directly following its final has
    /// already been consumed along with it.
    AfterSyllablePossiblyConsumingApostrophe,
}

impl<T> VecAndIndex<T> {
    /// Returns the item under the cursor (or `None` past the end) and moves the cursor
    /// forward by one. The cursor moves even when `None` is returned.
    fn next(&mut self) -> Option<&T> {
        let current = self.next_pos;
        self.next_pos = current + 1;
        self.vec.get(current)
    }

    /// Returns the item `n` positions ahead of the cursor without moving it;
    /// `peek(0)` is the item the following `next()` would return.
    fn peek(&self, n: usize) -> Option<&T> {
        let target = self.next_pos + n;
        self.vec.get(target)
    }

    /// Moves the cursor back by `n` items.
    ///
    /// Panics with "too much rewind" if that would move it before the start.
    fn rewind(&mut self, n: usize) {
        assert!(self.next_pos >= n, "too much rewind");
        self.next_pos -= n;
    }

    /// Moves the cursor forward by `n` items, with no check against the length.
    fn advance(&mut self, n: usize) {
        self.next_pos += n;
    }
}

/// Splits the token stream into output pieces, one per call to `next`.
///
/// The implementation is a small state machine over `ParserState`: it first recognises a
/// syllable's initial, then tries candidate finals until one fits what follows.
impl Iterator for PinyinParserIter {
    /// One syllable (initial followed by final with tone, optionally with a trailing `r`),
    /// a preserved space / punctuation / miscellaneous token, or a lone letter with diacritics.
    type Item = String;

    /// Consumes tokens until the next output piece is complete and returns it, or returns
    /// `None` once the input is exhausted between syllables.
    ///
    /// # Panics
    ///
    /// Panics when the token stream cannot be parsed: a non-letter or the end of input
    /// right after an initial, an apostrophe at the beginning of a word, a word starting
    /// with `i`, `u` or `ŋ`, no fitting final after an initial, or (strict mode only) an
    /// apostrophe that is not followed by `a`, `e` or `o`.
    #[allow(clippy::too_many_lines)]
    #[allow(clippy::cognitive_complexity)]
    fn next(&mut self) -> Option<Self::Item> {
        use pinyin_token::Alphabet;
        use pinyin_token::PinyinToken::{
            Alph, Apostrophe, LightToneMarker, Others, Punctuation, Space,
        };
        use ParserState::{
            AfterSyllablePossiblyConsumingApostrophe, BeforeWordInitial, InitialParsed, ZCSParsed,
        };
        // Each pass reads one token; arms that do not `return` or panic fall through to
        // the next pass with an updated state.
        loop {
            match (self.it.next(), self.state) {
                // Once an initial has been read, only a letter may follow.
                (
                    b @ Some(LightToneMarker | Punctuation(_) | Apostrophe | Space(_) | Others(_)),
                    a @ (InitialParsed(_) | ZCSParsed(_)),
                ) => panic!("unexpected {:?} found after parsing initial {:?}", b, a),
                (
                    Some(LightToneMarker),
                    AfterSyllablePossiblyConsumingApostrophe | BeforeWordInitial,
                ) => continue, // just ignore it

                // An apostrophe that directly follows a final is consumed together with
                // that syllable further below, so one that shows up here is misplaced.
                (
                    Some(Apostrophe),
                    AfterSyllablePossiblyConsumingApostrophe | BeforeWordInitial,
                ) => panic!("unexpected apostrophe found at the beginning of a word"),
                // End of input between syllables: the iteration is finished.
                (None, AfterSyllablePossiblyConsumingApostrophe | BeforeWordInitial) => {
                    return None
                }
                // End of input in the middle of a syllable.
                (None, InitialParsed(initial)) => {
                    panic!("unexpected end of string found after {:?}", initial)
                }
                (None, ZCSParsed(zcs)) => panic!("unexpected end of string found after {:?}", zcs),
                // Punctuation, spaces and miscellaneous tokens between syllables are
                // yielded only when the matching `preserve_*` option is on; otherwise skipped.
                (
                    Some(Punctuation(s)),
                    BeforeWordInitial | AfterSyllablePossiblyConsumingApostrophe,
                ) => {
                    if self.configs.p_preserve_punctuations {
                        self.state = BeforeWordInitial;
                        return Some((*s).clone());
                    }
                    continue;
                }
                (Some(Space(s)), BeforeWordInitial | AfterSyllablePossiblyConsumingApostrophe) => {
                    if self.configs.p_preserve_spaces {
                        self.state = BeforeWordInitial;
                        return Some((*s).clone());
                    }
                    continue;
                }

                (Some(Others(s)), BeforeWordInitial | AfterSyllablePossiblyConsumingApostrophe) => {
                    if self.configs.p_preserve_miscellaneous {
                        self.state = BeforeWordInitial;
                        return Some((*s).clone());
                    }
                    continue;
                }

                // First letter of a syllable: record which initial it is.
                (
                    Some(Alph(alph)),
                    BeforeWordInitial | AfterSyllablePossiblyConsumingApostrophe,
                ) => match alph.alphabet {
                    Alphabet::B => self.state = InitialParsed(SpellingInitial::B),
                    Alphabet::P => self.state = InitialParsed(SpellingInitial::P),
                    Alphabet::M => {
                        if alph.diacritics.is_empty() {
                            self.state = InitialParsed(SpellingInitial::M);
                        } else {
                            // An `m` carrying diacritics is yielded on its own, state unchanged
                            // (presumably a syllabic nasal with a tone mark; confirm in `pinyin_token`).
                            return Some(alph.to_str(self.configs.p_strict));
                        }
                    }
                    Alphabet::F => self.state = InitialParsed(SpellingInitial::F),
                    Alphabet::D => self.state = InitialParsed(SpellingInitial::D),
                    Alphabet::T => self.state = InitialParsed(SpellingInitial::T),
                    Alphabet::N => {
                        if alph.diacritics.is_empty() {
                            self.state = InitialParsed(SpellingInitial::N)
                        } else {
                            // Same treatment as a diacritic-bearing `m` above.
                            return Some(alph.to_str(self.configs.p_strict));
                        }
                    }
                    Alphabet::L => self.state = InitialParsed(SpellingInitial::L),
                    Alphabet::G => self.state = InitialParsed(SpellingInitial::G),
                    Alphabet::K => self.state = InitialParsed(SpellingInitial::K),
                    Alphabet::H => self.state = InitialParsed(SpellingInitial::H),
                    Alphabet::J => self.state = InitialParsed(SpellingInitial::J),
                    Alphabet::Q => self.state = InitialParsed(SpellingInitial::Q),
                    Alphabet::X => self.state = InitialParsed(SpellingInitial::X),
                    Alphabet::R => self.state = InitialParsed(SpellingInitial::R),
                    Alphabet::Y => self.state = InitialParsed(SpellingInitial::Y),
                    Alphabet::W => self.state = InitialParsed(SpellingInitial::W),
                    // `z`, `c`, `s`: a bare letter waits for a possible `h`; the letter with
                    // exactly one circumflex (`ẑ`, `ĉ`, `ŝ`) is `zh`/`ch`/`sh` right away;
                    // any other diacritic combination is yielded on its own.
                    Alphabet::Z => {
                        if alph.diacritics.is_empty() {
                            self.state = ZCSParsed(ZCS::Z)
                        } else if matches!(
                            &alph.diacritics[..],
                            &[pinyin_token::Diacritic::Circumflex]
                        ) {
                            self.state = InitialParsed(SpellingInitial::ZH)
                        } else {
                            return Some(alph.to_str(self.configs.p_strict));
                        }
                    }
                    Alphabet::C => {
                        if alph.diacritics.is_empty() {
                            self.state = ZCSParsed(ZCS::C)
                        } else if matches!(
                            &alph.diacritics[..],
                            &[pinyin_token::Diacritic::Circumflex]
                        ) {
                            self.state = InitialParsed(SpellingInitial::CH)
                        } else {
                            return Some(alph.to_str(self.configs.p_strict));
                        }
                    }
                    Alphabet::S => {
                        if alph.diacritics.is_empty() {
                            self.state = ZCSParsed(ZCS::S)
                        } else if matches!(
                            &alph.diacritics[..],
                            &[pinyin_token::Diacritic::Circumflex]
                        ) {
                            self.state = InitialParsed(SpellingInitial::SH)
                        } else {
                            return Some(alph.to_str(self.configs.p_strict));
                        }
                    }
                    // A syllable starting with `a`, `e` or `o` has an empty initial; put the
                    // vowel back so it is read again as the start of the final.
                    Alphabet::A | Alphabet::E | Alphabet::O => {
                        self.it.rewind(1);
                        self.state = InitialParsed(SpellingInitial::ZeroAEO);
                    }

                    Alphabet::I | Alphabet::U | Alphabet::Ŋ => panic!(
                        "unexpected alphabet {:?} found at the beginning of a word",
                        alph.alphabet,
                    ),
                },

                // Resolve a pending `z`/`c`/`s`: an `h` completes `zh`/`ch`/`sh`; any other
                // letter is put back and the plain initial is used.
                (Some(Alph(alph)), ZCSParsed(zcs)) => {
                    if alph.alphabet == Alphabet::H {
                        self.state = match zcs {
                            ZCS::Z => InitialParsed(SpellingInitial::ZH),
                            ZCS::C => InitialParsed(SpellingInitial::CH),
                            ZCS::S => InitialParsed(SpellingInitial::SH),
                        }
                    } else {
                        self.it.rewind(1);
                        self.state = match zcs {
                            ZCS::Z => InitialParsed(SpellingInitial::Z),
                            ZCS::C => InitialParsed(SpellingInitial::C),
                            ZCS::S => InitialParsed(SpellingInitial::S),
                        }
                    }
                }

                // The initial is known; now find the final.
                (Some(Alph(_)), InitialParsed(initial)) => {
                    use finals::Candidate;
                    // Put the letter back: candidate lookup reads from the first letter of the final.
                    self.it.rewind(1);
                    let candidates = self.it.get_candidates_without_rhotic(self.configs.p_strict);

                    if candidates.is_empty() {
                        panic!(
                            "no adequate candidate for finals (-an, -ian, ...) is found, after the initial {:?}",
                            initial
                        );
                    }

                    // Try the candidates in the order given; the first one that is consistent
                    // with the token following it wins. `continue` below moves on to the next
                    // candidate of this `for`, not to the next pass of the outer `loop`.
                    for Candidate { ŋ, fin, tone } in candidates.clone() {
                        let fin_len = fin.len() - if ŋ { 1 } else { 0 }; // ŋ accounts for ng, hence the len is shorter by 1
                        self.it.advance(fin_len);

                        // ITERATOR IS TEMPORARILY ADVANCED HERE
                        // Every arm below either returns (keeping the advance) or rewinds by
                        // `fin_len` before trying the next candidate.
                        match self.it.peek(0) {
                            // The final reaches the end of the input.
                            None => {
                                // NOTE(review): this moves the cursor one past the end; the
                                // following `next()` still reads `None`, so it looks harmless.
                                self.it.advance(1);
                                self.state = AfterSyllablePossiblyConsumingApostrophe;
                                return Some(format!(
                                    "{}{}",
                                    initial,
                                    finals::FinalWithTone { fin, tone }
                                ));
                            }

                            // An apostrophe ends the syllable and is consumed with it.
                            Some(Apostrophe) => {
                                self.it.advance(1);

                                // In the strict mode, `a`, `e` or `o` must follow the apostrophe
                                if self.configs.p_strict {
                                    let a_e_o = match self.it.peek(0) {
                                        Some(Alph(a)) => matches!(
                                            a.alphabet,
                                            Alphabet::A | Alphabet::E | Alphabet::O
                                        ),
                                        _ => false,
                                    };

                                    if !a_e_o {
                                        panic!("In strict mode, an apostrophe must be followed by either 'a', 'e' or 'o'")
                                    }
                                }

                                self.state = AfterSyllablePossiblyConsumingApostrophe;
                                return Some(format!(
                                    "{}{}",
                                    initial,
                                    finals::FinalWithTone { fin, tone }
                                ));
                            }

                            // Any other non-letter ends the syllable; it was only peeked, so
                            // it is handled by the next call.
                            Some(Punctuation(_) | LightToneMarker | Space(_) | Others(_)) => {
                                self.state = AfterSyllablePossiblyConsumingApostrophe;
                                return Some(format!(
                                    "{}{}",
                                    initial,
                                    finals::FinalWithTone { fin, tone }
                                ));
                            }

                            // A letter follows: decide whether it can start the next syllable.
                            Some(Alph(alph)) => match alph.alphabet {
                                Alphabet::A
                                | Alphabet::E
                                | Alphabet::I
                                | Alphabet::O
                                | Alphabet::U
                                | Alphabet::Ŋ => {
                                    /* we have read too much or too little; this candidate is not good; ignore. */
                                    self.it.rewind(fin_len);
                                    continue;
                                }

                                Alphabet::R =>
                                /* possibly rhotic */
                                {
                                    let vowel_follows = match self.it.peek(1) {
                                        Some(Alph(a)) => matches!(
                                            a.alphabet,
                                            Alphabet::A
                                                | Alphabet::E
                                                | Alphabet::I
                                                | Alphabet::O
                                                | Alphabet::U
                                        ),
                                        _ => false,
                                    };
                                    if vowel_follows {
                                        // cannot be rhotic
                                        // peeking `r` was not needed
                                        // hence simply return
                                        // (the `r` stays unconsumed and starts the next syllable)
                                        self.state = AfterSyllablePossiblyConsumingApostrophe;
                                        return Some(format!(
                                            "{}{}",
                                            initial,
                                            finals::FinalWithTone { fin, tone }
                                        ));
                                    }
                                    // this is rhotic
                                    // consume the `r` and append it to the syllable
                                    self.it.advance(1);
                                    self.state = AfterSyllablePossiblyConsumingApostrophe;
                                    return Some(format!(
                                        "{}{}r",
                                        initial,
                                        finals::FinalWithTone { fin, tone }
                                    ));
                                }

                                Alphabet::G =>
                                /* possibly g */
                                {
                                    let vowel_follows = match self.it.peek(1) {
                                        Some(Alph(a)) => matches!(
                                            a.alphabet,
                                            Alphabet::A
                                                | Alphabet::E
                                                | Alphabet::I
                                                | Alphabet::O
                                                | Alphabet::U
                                        ),
                                        _ => false,
                                    };
                                    if vowel_follows {
                                        // cannot be an additional g
                                        // peeking `g` was not needed
                                        // hence simply return
                                        self.state = AfterSyllablePossiblyConsumingApostrophe;
                                        return Some(format!(
                                            "{}{}",
                                            initial,
                                            finals::FinalWithTone { fin, tone }
                                        ));
                                    }
                                    // this candidate is wrong
                                    // (a `g` with no vowel after it cannot start a syllable)
                                    self.it.rewind(fin_len);
                                    continue;
                                }

                                Alphabet::N => {
                                    let vowel_follows = match self.it.peek(1) {
                                        Some(Alph(a)) => matches!(
                                            a.alphabet,
                                            Alphabet::A
                                                | Alphabet::E
                                                | Alphabet::I
                                                | Alphabet::O
                                                | Alphabet::U
                                        ),
                                        _ => false,
                                    };
                                    if vowel_follows {
                                        // peeking `n` was not needed
                                        // hence simply return
                                        self.state = AfterSyllablePossiblyConsumingApostrophe;
                                        return Some(format!(
                                            "{}{}",
                                            initial,
                                            finals::FinalWithTone { fin, tone }
                                        ));
                                    }
                                    // this candidate is not good
                                    self.it.rewind(fin_len);
                                    continue;
                                }

                                // Any other consonant starts the next syllable.
                                _ => {
                                    self.state = AfterSyllablePossiblyConsumingApostrophe;
                                    return Some(format!(
                                        "{}{}",
                                        initial,
                                        finals::FinalWithTone { fin, tone }
                                    ));
                                }
                            },
                        }
                    }
                    // Every candidate was rejected by what follows it.
                    panic!(
                        "no adequate candidate for finals (-an, -ian, ...) found, among possible candidates {:?}",
                        candidates
                    )
                }
            }
        }
    }
}

mod finals;

/// A bare `z`, `c` or `s` read at the start of a syllable and not yet resolved: a following
/// `h` turns it into `zh`/`ch`/`sh`, any other letter leaves it as the plain initial.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
enum ZCS {
    Z,
    C,
    S,
}

/// The initial of a syllable as it is spelled, including the written-only initials `y` and
/// `w` and the empty initial of syllables that begin with `a`, `e` or `o`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
enum SpellingInitial {
    B,
    P,
    M,
    F,
    D,
    T,
    N,
    L,
    G,
    K,
    H,
    J,
    Q,
    X,
    ZH,
    CH,
    SH,
    R,
    Z,
    C,
    S,
    Y,
    W,
    /// No written initial: the syllable starts directly with `a`, `e` or `o`.
    ZeroAEO,
}

/// Writes the initial in lowercase ASCII; `ZeroAEO` writes nothing.
impl std::fmt::Display for SpellingInitial {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let spelling = match *self {
            Self::B => "b",
            Self::P => "p",
            Self::M => "m",
            Self::F => "f",
            Self::D => "d",
            Self::T => "t",
            Self::N => "n",
            Self::L => "l",
            Self::G => "g",
            Self::K => "k",
            Self::H => "h",
            Self::J => "j",
            Self::Q => "q",
            Self::X => "x",
            Self::ZH => "zh",
            Self::CH => "ch",
            Self::SH => "sh",
            Self::R => "r",
            Self::Z => "z",
            Self::C => "c",
            Self::S => "s",
            Self::Y => "y",
            Self::W => "w",
            Self::ZeroAEO => "",
        };
        f.write_str(spelling)
    }
}