pinyin_parser/
lib.rs

1#![warn(clippy::pedantic, clippy::nursery)]
2#![allow(clippy::non_ascii_literal)]
3
4#[cfg(test)]
5mod tests;
6use unicode_segmentation::UnicodeSegmentation;
7
/// How tolerant the parser is of non-standard spellings.
///
/// Both strict variants make [`Strictness::is_strict`] return `true`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Strictness {
    /// Strict parsing. Among other things, an apostrophe must then be followed
    /// by `a`, `e` or `o`.
    Strict,
    /// Strict parsing as well. Judging by the name it additionally keeps the
    /// apostrophe and the curly quote apart during tokenisation; the tokeniser
    /// is not visible from this file, so treat that as an assumption.
    StrictAndSeparateApostropheFromCurlyQuote,
    /// Lenient parsing; this is what [`PinyinParser::new`] selects.
    Loose,
}

impl Strictness {
    /// Returns `true` for every variant except [`Strictness::Loose`].
    #[must_use]
    pub fn is_strict(self) -> bool {
        match self {
            Self::Strict | Self::StrictAndSeparateApostropheFromCurlyQuote => true,
            Self::Loose => false,
        }
    }
}
21
22#[derive(Copy, Clone, PartialEq, Eq, Debug, Hash)]
23#[allow(clippy::struct_excessive_bools)]
24pub struct PinyinParser {
25    p_strict: Strictness,
26    p_preserve_punctuations: bool,
27    p_preserve_spaces: bool,
28    p_preserve_miscellaneous: bool,
29}
30
31impl Default for PinyinParser {
32    fn default() -> Self {
33        Self::new()
34    }
35}
36
37impl PinyinParser {
38    #[must_use]
39    pub const fn new() -> Self {
40        Self {
41            p_strict: Strictness::Loose,
42            p_preserve_spaces: false,
43            p_preserve_punctuations: false,
44            p_preserve_miscellaneous: false,
45        }
46    }
47
48    #[must_use]
49    #[deprecated = "Use `with_strictness(Strictness::Strict)` or `with_strictness(Strictness::Loose)`"]
50    pub const fn is_strict(self, b: bool) -> Self {
51        Self {
52            p_strict: if b {
53                Strictness::Strict
54            } else {
55                Strictness::Loose
56            },
57            ..self
58        }
59    }
60
61    #[must_use]
62    pub const fn with_strictness(self, strictness: Strictness) -> Self {
63        Self {
64            p_strict: strictness,
65            ..self
66        }
67    }
68
69    #[must_use]
70    pub const fn preserve_spaces(self, b: bool) -> Self {
71        Self {
72            p_preserve_spaces: b,
73            ..self
74        }
75    }
76
77    #[must_use]
78    pub const fn preserve_punctuations(self, b: bool) -> Self {
79        Self {
80            p_preserve_punctuations: b,
81            ..self
82        }
83    }
84
85    /// ```
86    /// use pinyin_parser::PinyinParser;
87    /// let parser = PinyinParser::new()
88    ///     .is_strict(true)
89    ///     .preserve_miscellaneous(true);
90    /// assert_eq!(
91    ///     parser
92    ///         .parse("你Nǐ 好hǎo")
93    ///         .into_iter()
94    ///         .collect::<Vec<_>>(),
95    ///     vec!["你", "nǐ", "好", "hǎo"]
96    /// )
97    /// ```
98    #[must_use]
99    pub const fn preserve_miscellaneous(self, b: bool) -> Self {
100        Self {
101            p_preserve_miscellaneous: b,
102            ..self
103        }
104    }
105
106    /// ```
107    /// use pinyin_parser::PinyinParser;
108    /// let parser = PinyinParser::new()
109    ///     .is_strict(true)
110    ///     .preserve_punctuations(true)
111    ///     .preserve_spaces(true);
112    /// assert_eq!(
113    ///     parser
114    ///         .parse("Nǐ zuò shénme?")
115    ///         .into_iter()
116    ///         .collect::<Vec<_>>(),
117    ///     vec!["nǐ", " ", "zuò", " ", "shén", "me", "?"]
118    /// )
119    /// ```
120    #[must_use]
121    pub fn parse(self, s: &str) -> PinyinParserIter {
122        PinyinParserIter {
123            configs: self,
124            it: VecAndIndex {
125                vec: UnicodeSegmentation::graphemes(s, true)
126                    .map(|c| pinyin_token::to_token(c, self.p_strict))
127                    .collect::<Vec<_>>(),
128                next_pos: 0,
129            },
130            state: ParserState::BeforeWordInitial,
131        }
132    }
133
134    /// Strict mode:
135    /// * forbids the use of breve instead of hacek to represent the third tone
136    /// * forbids the use of IPA `ɡ` (U+0261) instead of `g`, and other such lookalike characters
137    /// * allows apostrophes only before an `a`, an `e` or an `o`
138    /// ```
139    /// use pinyin_parser::PinyinParser;
140    /// assert_eq!(
141    ///     PinyinParser::strict("jīntiān")
142    ///         .into_iter()
143    ///         .collect::<Vec<_>>(),
144    ///     vec!["jīn", "tiān"]
145    /// );
146    /// ```
147
148    /// ```should_panic
149    /// use pinyin_parser::PinyinParser;
150    /// assert_eq!(
151    ///     PinyinParser::strict("zǒnɡshì") // this `ɡ` is not the `g` from ASCII
152    ///         .into_iter()
153    ///         .collect::<Vec<_>>(),
154    ///     vec!["zǒng", "shì"]
155    /// );
156    /// ```
157
158    /// ```should_panic
159    /// use pinyin_parser::PinyinParser;
160    /// assert_eq!(
161    ///     // An apostrophe can come only before an `a`, an `e` or an `o` in strict mode    
162    ///     PinyinParser::strict("Yīng'guó")
163    ///         .into_iter()
164    ///         .collect::<Vec<_>>(),
165    ///     vec!["yīng", "guó"]
166    /// );
167    /// ```
168
169    /// This parser supports the use of `ẑ`, `ĉ`, `ŝ` and `ŋ`, though I have never seen anyone use it.
170    /// ```
171    /// use pinyin_parser::PinyinParser;
172    /// assert_eq!(
173    ///     PinyinParser::strict("Ẑāŋ").into_iter().collect::<Vec<_>>(),
174    ///     vec!["zhāng"]
175    /// )
176    /// ```
177
178    #[must_use]
179    pub fn strict(s: &str) -> PinyinParserIter {
180        Self::new().with_strictness(Strictness::Strict).parse(s)
181    }
182
183    /// ```
184    /// use pinyin_parser::PinyinParser;
185    /// assert_eq!(
186    ///     // 'ă' is LATIN SMALL LETTER A WITH BREVE and is not accepted in strict mode.  
187    ///     // The correct alphabet to use is 'ǎ'.  
188    ///     PinyinParser::loose("mián'ăo")
189    ///         .into_iter()
190    ///         .collect::<Vec<_>>(),
191    ///     vec!["mián", "ǎo"]
192    /// );
193    /// ```
194
195    /// ```
196    /// use pinyin_parser::PinyinParser;
197    /// assert_eq!(
198    ///     // An apostrophe can come only before an `a`, an `e` or an `o` in strict mode,
199    ///     // but allowed here because it's loose    
200    ///     PinyinParser::loose("Yīng'guó")
201    ///         .into_iter()
202    ///         .collect::<Vec<_>>(),
203    ///     vec!["yīng", "guó"]
204    /// );
205    /// ```
206    #[must_use]
207    pub fn loose(s: &str) -> PinyinParserIter {
208        Self::new().parse(s)
209    }
210}
211
212mod pinyin_token;
213
/// A vector paired with a read cursor, giving a rewindable token stream.
struct VecAndIndex<T> {
    /// The elements being walked over.
    vec: Vec<T>,
    /// Index of the element the next read will return.
    next_pos: usize,
}
218
/// Iterator returned by [`PinyinParser::parse`]; every item is one output
/// token as a `String`.
///
/// The input is tokenised up front by `parse`; iterating walks that token list
/// with a small state machine.
pub struct PinyinParserIter {
    /// Parser configuration in effect for this run.
    configs: PinyinParser,
    /// The token list together with the read cursor.
    it: VecAndIndex<pinyin_token::PinyinToken>,
    /// Where the state machine currently stands relative to a syllable.
    state: ParserState,
}
224
/// State of the syllable state machine driven by `PinyinParserIter::next`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum ParserState {
    /// Nothing of a syllable has been read yet. This is the starting state and
    /// the state entered after a preserved punctuation, space or other token
    /// has been yielded.
    BeforeWordInitial,
    /// The initial of a syllable is known; a final must come next.
    InitialParsed(SpellingInitial),
    /// A bare `z`, `c` or `s` was read; the next letter decides between
    /// `z`/`c`/`s` and `zh`/`ch`/`sh`.
    ZCSParsed(ZCS),
    /// A complete syllable has just been yielded; if an apostrophe followed
    /// it, that apostrophe has already been consumed.
    AfterSyllablePossiblyConsumingApostrophe,
}
232
233impl<T> VecAndIndex<T> {
234    fn next(&mut self) -> Option<&T> {
235        let ans = self.vec.get(self.next_pos);
236        self.next_pos += 1;
237        ans
238    }
239
240    fn peek(&self, n: usize) -> Option<&T> {
241        self.vec.get(self.next_pos + n)
242    }
243
244    fn rewind(&mut self, n: usize) {
245        assert!(self.next_pos >= n, "too much rewind");
246        self.next_pos -= n;
247    }
248
249    fn advance(&mut self, n: usize) {
250        self.next_pos += n;
251    }
252}
253
254pub struct PinyinParserIterWithSplitR {
255    iter: PinyinParserIter,
256    next_is_r: bool,
257}
258
259impl Iterator for PinyinParserIterWithSplitR {
260    type Item = String;
261
262    fn next(&mut self) -> Option<Self::Item> {
263        if self.next_is_r {
264            self.next_is_r = false;
265            return Some("r".to_owned());
266        }
267
268        let ans = self.iter.next()?;
269
270        // r should be split off from ans, unless they are "er", "ēr", "ér", "ěr", or "èr"
271        if matches!(&ans[..], "er" | "ēr" | "ér" | "ěr" | "èr") {
272            return Some(ans);
273        }
274
275        if let Some(rest) = ans.strip_suffix('r') {
276            self.next_is_r = true;
277            return Some(rest.to_owned());
278        }
279
280        Some(ans)
281    }
282}
283
284impl PinyinParserIter {
285    #[must_use]
286    pub const fn split_erhua(self) -> PinyinParserIterWithSplitR {
287        PinyinParserIterWithSplitR {
288            iter: self,
289            next_is_r: false,
290        }
291    }
292}
293
294impl Iterator for PinyinParserIter {
295    type Item = String;
296
297    #[allow(clippy::too_many_lines)]
298    #[allow(clippy::cognitive_complexity)]
299    fn next(&mut self) -> Option<Self::Item> {
300        use pinyin_token::Alphabet;
301        use pinyin_token::PinyinToken::{
302            Alph, Apostrophe, LightToneMarker, Others, Punctuation, Space,
303        };
304        use ParserState::{
305            AfterSyllablePossiblyConsumingApostrophe, BeforeWordInitial, InitialParsed, ZCSParsed,
306        };
307        loop {
308            match (self.it.next(), self.state) {
309                (
310                    b @ Some(LightToneMarker | Punctuation(_) | Apostrophe | Space(_) | Others(_)),
311                    a @ (InitialParsed(_) | ZCSParsed(_)),
312                ) => panic!("unexpected {b:?} found after parsing initial {a:?}"),
313                (
314                    Some(LightToneMarker),
315                    AfterSyllablePossiblyConsumingApostrophe | BeforeWordInitial,
316                ) => continue, // just ignore it
317
318                (
319                    Some(Apostrophe),
320                    AfterSyllablePossiblyConsumingApostrophe | BeforeWordInitial,
321                ) => panic!("unexpected apostrophe found at the beginning of a word"),
322                (None, AfterSyllablePossiblyConsumingApostrophe | BeforeWordInitial) => {
323                    return None
324                }
325                (None, InitialParsed(initial)) => {
326                    panic!("unexpected end of string found after {initial:?}");
327                }
328                (None, ZCSParsed(zcs)) => panic!("unexpected end of string found after {zcs:?}"),
329                (
330                    Some(Punctuation(s)),
331                    BeforeWordInitial | AfterSyllablePossiblyConsumingApostrophe,
332                ) => {
333                    if self.configs.p_preserve_punctuations {
334                        self.state = BeforeWordInitial;
335                        return Some((*s).clone());
336                    }
337                    continue;
338                }
339                (Some(Space(s)), BeforeWordInitial | AfterSyllablePossiblyConsumingApostrophe) => {
340                    if self.configs.p_preserve_spaces {
341                        self.state = BeforeWordInitial;
342                        return Some((*s).clone());
343                    }
344                    continue;
345                }
346
347                (Some(Others(s)), BeforeWordInitial | AfterSyllablePossiblyConsumingApostrophe) => {
348                    if self.configs.p_preserve_miscellaneous {
349                        self.state = BeforeWordInitial;
350                        return Some((*s).clone());
351                    }
352                    continue;
353                }
354
355                (
356                    Some(Alph(alph)),
357                    BeforeWordInitial | AfterSyllablePossiblyConsumingApostrophe,
358                ) => match alph.alphabet {
359                    Alphabet::B => self.state = InitialParsed(SpellingInitial::B),
360                    Alphabet::P => self.state = InitialParsed(SpellingInitial::P),
361                    Alphabet::M => {
362                        if alph.diacritics.is_empty() {
363                            self.state = InitialParsed(SpellingInitial::M);
364                        } else {
365                            return Some(alph.to_str(self.configs.p_strict));
366                        }
367                    }
368                    Alphabet::F => self.state = InitialParsed(SpellingInitial::F),
369                    Alphabet::D => self.state = InitialParsed(SpellingInitial::D),
370                    Alphabet::T => self.state = InitialParsed(SpellingInitial::T),
371                    Alphabet::N => {
372                        if alph.diacritics.is_empty() {
373                            self.state = InitialParsed(SpellingInitial::N);
374                        } else {
375                            return Some(alph.to_str(self.configs.p_strict));
376                        }
377                    }
378                    Alphabet::L => self.state = InitialParsed(SpellingInitial::L),
379                    Alphabet::G => self.state = InitialParsed(SpellingInitial::G),
380                    Alphabet::K => self.state = InitialParsed(SpellingInitial::K),
381                    Alphabet::H => self.state = InitialParsed(SpellingInitial::H),
382                    Alphabet::J => self.state = InitialParsed(SpellingInitial::J),
383                    Alphabet::Q => self.state = InitialParsed(SpellingInitial::Q),
384                    Alphabet::X => self.state = InitialParsed(SpellingInitial::X),
385                    Alphabet::R => self.state = InitialParsed(SpellingInitial::R),
386                    Alphabet::Y => self.state = InitialParsed(SpellingInitial::Y),
387                    Alphabet::W => self.state = InitialParsed(SpellingInitial::W),
388                    Alphabet::Z => {
389                        if alph.diacritics.is_empty() {
390                            self.state = ZCSParsed(ZCS::Z);
391                        } else if matches!(
392                            &alph.diacritics[..],
393                            &[pinyin_token::Diacritic::Circumflex]
394                        ) {
395                            self.state = InitialParsed(SpellingInitial::ZH);
396                        } else {
397                            return Some(alph.to_str(self.configs.p_strict));
398                        }
399                    }
400                    Alphabet::C => {
401                        if alph.diacritics.is_empty() {
402                            self.state = ZCSParsed(ZCS::C);
403                        } else if matches!(
404                            &alph.diacritics[..],
405                            &[pinyin_token::Diacritic::Circumflex]
406                        ) {
407                            self.state = InitialParsed(SpellingInitial::CH);
408                        } else {
409                            return Some(alph.to_str(self.configs.p_strict));
410                        }
411                    }
412                    Alphabet::S => {
413                        if alph.diacritics.is_empty() {
414                            self.state = ZCSParsed(ZCS::S);
415                        } else if matches!(
416                            &alph.diacritics[..],
417                            &[pinyin_token::Diacritic::Circumflex]
418                        ) {
419                            self.state = InitialParsed(SpellingInitial::SH);
420                        } else {
421                            return Some(alph.to_str(self.configs.p_strict));
422                        }
423                    }
424                    Alphabet::A | Alphabet::E | Alphabet::O => {
425                        self.it.rewind(1);
426                        self.state = InitialParsed(SpellingInitial::ZeroAEO);
427                    }
428
429                    Alphabet::I | Alphabet::U | Alphabet::Ŋ => panic!(
430                        "unexpected alphabet {:?} found at the beginning of a word",
431                        alph.alphabet,
432                    ),
433                },
434
435                (Some(Alph(alph)), ZCSParsed(zcs)) => {
436                    if alph.alphabet == Alphabet::H {
437                        self.state = match zcs {
438                            ZCS::Z => InitialParsed(SpellingInitial::ZH),
439                            ZCS::C => InitialParsed(SpellingInitial::CH),
440                            ZCS::S => InitialParsed(SpellingInitial::SH),
441                        }
442                    } else {
443                        self.it.rewind(1);
444                        self.state = match zcs {
445                            ZCS::Z => InitialParsed(SpellingInitial::Z),
446                            ZCS::C => InitialParsed(SpellingInitial::C),
447                            ZCS::S => InitialParsed(SpellingInitial::S),
448                        }
449                    }
450                }
451
452                (Some(Alph(_)), InitialParsed(initial)) => {
453                    use finals::Candidate;
454                    self.it.rewind(1);
455                    let candidates = self.it.get_candidates_without_rhotic(self.configs.p_strict);
456
457                    assert!(!candidates.is_empty(),
458                            "no adequate candidate for finals (-an, -ian, ...) is found, after the initial {initial:?}"
459                        );
460
461                    for Candidate { ŋ, fin, tone } in candidates.clone() {
462                        let fin_len = fin.len() - usize::from(ŋ); // ŋ accounts for ng, hence the len is shorter by 1
463                        self.it.advance(fin_len);
464
465                        // ITERATOR IS TEMPORARILY ADVANCED HERE
466                        match self.it.peek(0) {
467                            None => {
468                                self.it.advance(1);
469                                self.state = AfterSyllablePossiblyConsumingApostrophe;
470                                return Some(format!(
471                                    "{}{}",
472                                    initial,
473                                    finals::FinalWithTone { fin, tone }
474                                ));
475                            }
476
477                            Some(Apostrophe) => {
478                                self.it.advance(1);
479
480                                // In the strict mode, `a`, `e` or `o` must follow the apostrophe
481                                if self.configs.p_strict.is_strict() {
482                                    let a_e_o = match self.it.peek(0) {
483                                        Some(Alph(a)) => matches!(
484                                            a.alphabet,
485                                            Alphabet::A | Alphabet::E | Alphabet::O
486                                        ),
487                                        _ => false,
488                                    };
489
490                                    assert!(a_e_o, "In strict mode, an apostrophe must be followed by either 'a', 'e' or 'o'");
491                                }
492
493                                self.state = AfterSyllablePossiblyConsumingApostrophe;
494                                return Some(format!(
495                                    "{}{}",
496                                    initial,
497                                    finals::FinalWithTone { fin, tone }
498                                ));
499                            }
500
501                            Some(Punctuation(_) | LightToneMarker | Space(_) | Others(_)) => {
502                                self.state = AfterSyllablePossiblyConsumingApostrophe;
503                                return Some(format!(
504                                    "{}{}",
505                                    initial,
506                                    finals::FinalWithTone { fin, tone }
507                                ));
508                            }
509
510                            Some(Alph(alph)) => match alph.alphabet {
511                                Alphabet::A
512                                | Alphabet::E
513                                | Alphabet::I
514                                | Alphabet::O
515                                | Alphabet::U
516                                | Alphabet::Ŋ => {
517                                    /* we have read too much or too little; this candidate is not good; ignore. */
518                                    self.it.rewind(fin_len);
519                                    continue;
520                                }
521
522                                Alphabet::R =>
523                                /* possibly rhotic */
524                                {
525                                    let vowel_follows = match self.it.peek(1) {
526                                        Some(Alph(a)) => matches!(
527                                            a.alphabet,
528                                            Alphabet::A
529                                                | Alphabet::E
530                                                | Alphabet::I
531                                                | Alphabet::O
532                                                | Alphabet::U
533                                        ),
534                                        _ => false,
535                                    };
536                                    if vowel_follows {
537                                        // cannot be rhotic
538                                        // peeking `r` was not needed
539                                        // hence simply return
540                                        self.state = AfterSyllablePossiblyConsumingApostrophe;
541                                        return Some(format!(
542                                            "{}{}",
543                                            initial,
544                                            finals::FinalWithTone { fin, tone }
545                                        ));
546                                    }
547                                    // this is rhotic
548                                    self.it.advance(1);
549                                    self.state = AfterSyllablePossiblyConsumingApostrophe;
550                                    return Some(format!(
551                                        "{}{}r",
552                                        initial,
553                                        finals::FinalWithTone { fin, tone }
554                                    ));
555                                }
556
557                                Alphabet::G =>
558                                /* possibly g */
559                                {
560                                    let vowel_follows = match self.it.peek(1) {
561                                        Some(Alph(a)) => matches!(
562                                            a.alphabet,
563                                            Alphabet::A
564                                                | Alphabet::E
565                                                | Alphabet::I
566                                                | Alphabet::O
567                                                | Alphabet::U
568                                        ),
569                                        _ => false,
570                                    };
571                                    if vowel_follows {
572                                        // cannot be an additiona g
573                                        // peeking `g` was not needed
574                                        // hence simply return
575                                        self.state = AfterSyllablePossiblyConsumingApostrophe;
576                                        return Some(format!(
577                                            "{}{}",
578                                            initial,
579                                            finals::FinalWithTone { fin, tone }
580                                        ));
581                                    }
582                                    // this candidate is wrong
583                                    self.it.rewind(fin_len);
584                                    continue;
585                                }
586
587                                Alphabet::N => {
588                                    let vowel_follows = match self.it.peek(1) {
589                                        Some(Alph(a)) => matches!(
590                                            a.alphabet,
591                                            Alphabet::A
592                                                | Alphabet::E
593                                                | Alphabet::I
594                                                | Alphabet::O
595                                                | Alphabet::U
596                                        ),
597                                        _ => false,
598                                    };
599                                    if vowel_follows {
600                                        // peeking `n` was not needed
601                                        // hence simply return
602                                        self.state = AfterSyllablePossiblyConsumingApostrophe;
603                                        return Some(format!(
604                                            "{}{}",
605                                            initial,
606                                            finals::FinalWithTone { fin, tone }
607                                        ));
608                                    }
609                                    // this candidate is not good
610                                    self.it.rewind(fin_len);
611                                    continue;
612                                }
613
614                                _ => {
615                                    self.state = AfterSyllablePossiblyConsumingApostrophe;
616                                    return Some(format!(
617                                        "{}{}",
618                                        initial,
619                                        finals::FinalWithTone { fin, tone }
620                                    ));
621                                }
622                            },
623                        }
624                    }
625                    panic!(
626                        "no adequate candidate for finals (-an, -ian, ...) found, among possible candidates {candidates:?}"
627                    );
628                }
629            }
630        }
631    }
632}
633
634mod finals;
635
/// A bare `z`, `c` or `s` whose reading is not settled yet: it may be that
/// initial by itself or the first letter of `zh`, `ch` or `sh`.
// The name is the three letters Z, C and S rather than an acronym.
#[allow(clippy::upper_case_acronyms)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum ZCS {
    /// `z`, possibly the start of `zh`.
    Z,
    /// `c`, possibly the start of `ch`.
    C,
    /// `s`, possibly the start of `sh`.
    S,
}
643
/// The initial of a syllable as it is spelled.
///
/// Its `Display` form is the lowercase spelling.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum SpellingInitial {
    B,
    P,
    M,
    F,
    D,
    T,
    N,
    L,
    G,
    K,
    H,
    J,
    Q,
    X,
    ZH,
    CH,
    SH,
    R,
    Z,
    C,
    S,
    Y,
    W,
    /// No written initial: the syllable starts directly with `a`, `e` or `o`.
    /// Displays as the empty string.
    ZeroAEO,
}

impl std::fmt::Display for SpellingInitial {
    /// Writes the lowercase spelling of the initial (nothing for `ZeroAEO`).
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let spelling = match *self {
            Self::B => "b",
            Self::P => "p",
            Self::M => "m",
            Self::F => "f",
            Self::D => "d",
            Self::T => "t",
            Self::N => "n",
            Self::L => "l",
            Self::G => "g",
            Self::K => "k",
            Self::H => "h",
            Self::J => "j",
            Self::Q => "q",
            Self::X => "x",
            Self::ZH => "zh",
            Self::CH => "ch",
            Self::SH => "sh",
            Self::R => "r",
            Self::Z => "z",
            Self::C => "c",
            Self::S => "s",
            Self::Y => "y",
            Self::W => "w",
            Self::ZeroAEO => "",
        };
        f.write_str(spelling)
    }
}