1#![warn(clippy::pedantic, clippy::nursery)]
2#![allow(clippy::non_ascii_literal)]
3
4#[cfg(test)]
5mod tests;
6use unicode_segmentation::UnicodeSegmentation;
7
/// How rigorously a pinyin string is interpreted.
///
/// The value is handed to the tokenizer and the syllable formatter, and the
/// parser consults [`Strictness::is_strict`] to decide whether the apostrophe
/// rule (an apostrophe must be followed by `a`, `e` or `o`) is enforced.
#[derive(Copy, Clone, PartialEq, Eq, Debug, Hash)]
pub enum Strictness {
    /// Strict interpretation; [`Strictness::is_strict`] returns `true`.
    Strict,
    /// A second strict mode; [`Strictness::is_strict`] also returns `true`.
    ///
    /// NOTE(review): judging by the name, this mode additionally tells an
    /// apostrophe apart from a curly quote; confirm against the tokenizer.
    StrictAndSeparateApostropheFromCurlyQuote,
    /// Lenient interpretation; [`Strictness::is_strict`] returns `false`.
    Loose,
}

impl Strictness {
    /// Returns `true` for either strict variant and `false` for [`Strictness::Loose`].
    ///
    /// Declared `const` so it can be evaluated in constant contexts.
    #[must_use]
    pub const fn is_strict(self) -> bool {
        matches!(
            self,
            Self::Strict | Self::StrictAndSeparateApostropheFromCurlyQuote
        )
    }
}
21
/// Configuration for splitting a pinyin string into syllables.
///
/// Start from [`PinyinParser::new`], adjust it with the chainable setters, and
/// call [`PinyinParser::parse`] to obtain the iterator.
#[derive(Copy, Clone, PartialEq, Eq, Debug, Hash)]
// Each flag is an independent on/off switch with its own setter.
#[allow(clippy::struct_excessive_bools)]
pub struct PinyinParser {
    /// Strictness passed to the tokenizer and consulted while parsing.
    p_strict: Strictness,
    /// When `true`, punctuation tokens are yielded instead of being skipped.
    p_preserve_punctuations: bool,
    /// When `true`, space tokens are yielded instead of being skipped.
    p_preserve_spaces: bool,
    /// When `true`, tokens classified as "others" are yielded instead of being skipped.
    p_preserve_miscellaneous: bool,
}
30
impl Default for PinyinParser {
    /// Returns the same configuration as [`PinyinParser::new`].
    fn default() -> Self {
        Self::new()
    }
}
36
37impl PinyinParser {
38 #[must_use]
39 pub const fn new() -> Self {
40 Self {
41 p_strict: Strictness::Loose,
42 p_preserve_spaces: false,
43 p_preserve_punctuations: false,
44 p_preserve_miscellaneous: false,
45 }
46 }
47
48 #[must_use]
49 #[deprecated = "Use `with_strictness(Strictness::Strict)` or `with_strictness(Strictness::Loose)`"]
50 pub const fn is_strict(self, b: bool) -> Self {
51 Self {
52 p_strict: if b {
53 Strictness::Strict
54 } else {
55 Strictness::Loose
56 },
57 ..self
58 }
59 }
60
61 #[must_use]
62 pub const fn with_strictness(self, strictness: Strictness) -> Self {
63 Self {
64 p_strict: strictness,
65 ..self
66 }
67 }
68
69 #[must_use]
70 pub const fn preserve_spaces(self, b: bool) -> Self {
71 Self {
72 p_preserve_spaces: b,
73 ..self
74 }
75 }
76
77 #[must_use]
78 pub const fn preserve_punctuations(self, b: bool) -> Self {
79 Self {
80 p_preserve_punctuations: b,
81 ..self
82 }
83 }
84
85 #[must_use]
99 pub const fn preserve_miscellaneous(self, b: bool) -> Self {
100 Self {
101 p_preserve_miscellaneous: b,
102 ..self
103 }
104 }
105
106 #[must_use]
121 pub fn parse(self, s: &str) -> PinyinParserIter {
122 PinyinParserIter {
123 configs: self,
124 it: VecAndIndex {
125 vec: UnicodeSegmentation::graphemes(s, true)
126 .map(|c| pinyin_token::to_token(c, self.p_strict))
127 .collect::<Vec<_>>(),
128 next_pos: 0,
129 },
130 state: ParserState::BeforeWordInitial,
131 }
132 }
133
134 #[must_use]
179 pub fn strict(s: &str) -> PinyinParserIter {
180 Self::new().with_strictness(Strictness::Strict).parse(s)
181 }
182
183 #[must_use]
207 pub fn loose(s: &str) -> PinyinParserIter {
208 Self::new().parse(s)
209 }
210}
211
212mod pinyin_token;
213
/// A vector paired with a read cursor, giving an iterator-like view that can
/// also look ahead, skip forward and step back.
struct VecAndIndex<T> {
    /// The backing elements.
    vec: std::vec::Vec<T>,
    /// Index of the element the next read returns; may lie past the end of `vec`.
    next_pos: usize,
}
218
/// Iterator over the pieces of a pinyin string, created by [`PinyinParser::parse`].
///
/// Each item is a `String`: either one syllable or, when the corresponding
/// `preserve_*` setting is on, a space / punctuation / miscellaneous token.
pub struct PinyinParserIter {
    /// The settings this iterator was created with.
    configs: PinyinParser,
    /// The token stream together with the read cursor.
    it: VecAndIndex<pinyin_token::PinyinToken>,
    /// Current position of the state machine within a syllable.
    state: ParserState,
}
224
/// State of the syllable state machine driven by `PinyinParserIter::next`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum ParserState {
    /// Nothing of the current word has been read yet. This is the starting
    /// state, and it is re-entered after a preserved space, punctuation or
    /// miscellaneous token has been yielded.
    BeforeWordInitial,
    /// The initial of a syllable has been recognised; a final must follow.
    InitialParsed(SpellingInitial),
    /// A bare `z`, `c` or `s` has been read; the next letter decides whether
    /// the initial is `zh` / `ch` / `sh` or the plain letter.
    ZCSParsed(ZCS),
    /// A complete syllable has just been yielded; an apostrophe directly
    /// after it, if there was one, has already been consumed.
    AfterSyllablePossiblyConsumingApostrophe,
}
232
233impl<T> VecAndIndex<T> {
234 fn next(&mut self) -> Option<&T> {
235 let ans = self.vec.get(self.next_pos);
236 self.next_pos += 1;
237 ans
238 }
239
240 fn peek(&self, n: usize) -> Option<&T> {
241 self.vec.get(self.next_pos + n)
242 }
243
244 fn rewind(&mut self, n: usize) {
245 assert!(self.next_pos >= n, "too much rewind");
246 self.next_pos -= n;
247 }
248
249 fn advance(&mut self, n: usize) {
250 self.next_pos += n;
251 }
252}
253
254pub struct PinyinParserIterWithSplitR {
255 iter: PinyinParserIter,
256 next_is_r: bool,
257}
258
259impl Iterator for PinyinParserIterWithSplitR {
260 type Item = String;
261
262 fn next(&mut self) -> Option<Self::Item> {
263 if self.next_is_r {
264 self.next_is_r = false;
265 return Some("r".to_owned());
266 }
267
268 let ans = self.iter.next()?;
269
270 if matches!(&ans[..], "er" | "ēr" | "ér" | "ěr" | "èr") {
272 return Some(ans);
273 }
274
275 if let Some(rest) = ans.strip_suffix('r') {
276 self.next_is_r = true;
277 return Some(rest.to_owned());
278 }
279
280 Some(ans)
281 }
282}
283
284impl PinyinParserIter {
285 #[must_use]
286 pub const fn split_erhua(self) -> PinyinParserIterWithSplitR {
287 PinyinParserIterWithSplitR {
288 iter: self,
289 next_is_r: false,
290 }
291 }
292}
293
impl Iterator for PinyinParserIter {
    type Item = String;

    /// Yields the next syllable, or the next preserved space / punctuation /
    /// miscellaneous token when the matching `preserve_*` setting is enabled.
    ///
    /// Tokens are consumed through a small state machine: first an initial is
    /// recognised (possibly an empty one before `a` / `e` / `o`), then candidate
    /// finals are tried one after another until one is followed by something
    /// that can legitimately come after a syllable. Returns `None` once the
    /// tokens run out at a syllable boundary.
    ///
    /// # Panics
    ///
    /// Panics when the input is not well-formed:
    /// - a non-letter token, or the end of the input, directly after an initial;
    /// - an apostrophe where a word should begin;
    /// - `i`, `u` or `ŋ` where a word should begin;
    /// - no candidate final fits after an initial;
    /// - in a strict mode, an apostrophe that is not followed by `a`, `e` or `o`.
    // The whole state machine is a single `match`, so it is long by nature.
    #[allow(clippy::too_many_lines)]
    #[allow(clippy::cognitive_complexity)]
    fn next(&mut self) -> Option<Self::Item> {
        use pinyin_token::Alphabet;
        use pinyin_token::PinyinToken::{
            Alph, Apostrophe, LightToneMarker, Others, Punctuation, Space,
        };
        use ParserState::{
            AfterSyllablePossiblyConsumingApostrophe, BeforeWordInitial, InitialParsed, ZCSParsed,
        };
        loop {
            // Every iteration consumes one token and dispatches on (token, state).
            match (self.it.next(), self.state) {
                // An initial must be followed by a letter; anything else is an error.
                (
                    b @ Some(LightToneMarker | Punctuation(_) | Apostrophe | Space(_) | Others(_)),
                    a @ (InitialParsed(_) | ZCSParsed(_)),
                ) => panic!("unexpected {b:?} found after parsing initial {a:?}"),
                // A light-tone marker between syllables is skipped.
                (
                    Some(LightToneMarker),
                    AfterSyllablePossiblyConsumingApostrophe | BeforeWordInitial,
                ) => continue,
                // An apostrophe directly after a syllable is consumed together with
                // that syllable (see below), so one that shows up here is misplaced.
                (
                    Some(Apostrophe),
                    AfterSyllablePossiblyConsumingApostrophe | BeforeWordInitial,
                ) => panic!("unexpected apostrophe found at the beginning of a word"),
                // Clean end of input.
                (None, AfterSyllablePossiblyConsumingApostrophe | BeforeWordInitial) => {
                    return None
                }
                // The input stopped in the middle of a syllable.
                (None, InitialParsed(initial)) => {
                    panic!("unexpected end of string found after {initial:?}");
                }
                (None, ZCSParsed(zcs)) => panic!("unexpected end of string found after {zcs:?}"),
                // Punctuation, spaces and other tokens between syllables are either
                // yielded verbatim (resetting the state) or dropped, per the settings.
                (
                    Some(Punctuation(s)),
                    BeforeWordInitial | AfterSyllablePossiblyConsumingApostrophe,
                ) => {
                    if self.configs.p_preserve_punctuations {
                        self.state = BeforeWordInitial;
                        return Some((*s).clone());
                    }
                    continue;
                }
                (Some(Space(s)), BeforeWordInitial | AfterSyllablePossiblyConsumingApostrophe) => {
                    if self.configs.p_preserve_spaces {
                        self.state = BeforeWordInitial;
                        return Some((*s).clone());
                    }
                    continue;
                }

                (Some(Others(s)), BeforeWordInitial | AfterSyllablePossiblyConsumingApostrophe) => {
                    if self.configs.p_preserve_miscellaneous {
                        self.state = BeforeWordInitial;
                        return Some((*s).clone());
                    }
                    continue;
                }

                // A letter at the start of a syllable: classify it as an initial.
                (
                    Some(Alph(alph)),
                    BeforeWordInitial | AfterSyllablePossiblyConsumingApostrophe,
                ) => match alph.alphabet {
                    Alphabet::B => self.state = InitialParsed(SpellingInitial::B),
                    Alphabet::P => self.state = InitialParsed(SpellingInitial::P),
                    // `m` / `n` carrying a diacritic is yielded on its own as a
                    // complete item; without one it is an ordinary initial.
                    Alphabet::M => {
                        if alph.diacritics.is_empty() {
                            self.state = InitialParsed(SpellingInitial::M);
                        } else {
                            return Some(alph.to_str(self.configs.p_strict));
                        }
                    }
                    Alphabet::F => self.state = InitialParsed(SpellingInitial::F),
                    Alphabet::D => self.state = InitialParsed(SpellingInitial::D),
                    Alphabet::T => self.state = InitialParsed(SpellingInitial::T),
                    Alphabet::N => {
                        if alph.diacritics.is_empty() {
                            self.state = InitialParsed(SpellingInitial::N);
                        } else {
                            return Some(alph.to_str(self.configs.p_strict));
                        }
                    }
                    Alphabet::L => self.state = InitialParsed(SpellingInitial::L),
                    Alphabet::G => self.state = InitialParsed(SpellingInitial::G),
                    Alphabet::K => self.state = InitialParsed(SpellingInitial::K),
                    Alphabet::H => self.state = InitialParsed(SpellingInitial::H),
                    Alphabet::J => self.state = InitialParsed(SpellingInitial::J),
                    Alphabet::Q => self.state = InitialParsed(SpellingInitial::Q),
                    Alphabet::X => self.state = InitialParsed(SpellingInitial::X),
                    Alphabet::R => self.state = InitialParsed(SpellingInitial::R),
                    Alphabet::Y => self.state = InitialParsed(SpellingInitial::Y),
                    Alphabet::W => self.state = InitialParsed(SpellingInitial::W),
                    // Plain `z` / `c` / `s` may still turn into `zh` / `ch` / `sh`, so
                    // the decision is deferred. A single circumflex on the letter is
                    // taken as `zh` / `ch` / `sh` right away; with any other
                    // diacritics the letter is yielded on its own.
                    Alphabet::Z => {
                        if alph.diacritics.is_empty() {
                            self.state = ZCSParsed(ZCS::Z);
                        } else if matches!(
                            &alph.diacritics[..],
                            &[pinyin_token::Diacritic::Circumflex]
                        ) {
                            self.state = InitialParsed(SpellingInitial::ZH);
                        } else {
                            return Some(alph.to_str(self.configs.p_strict));
                        }
                    }
                    Alphabet::C => {
                        if alph.diacritics.is_empty() {
                            self.state = ZCSParsed(ZCS::C);
                        } else if matches!(
                            &alph.diacritics[..],
                            &[pinyin_token::Diacritic::Circumflex]
                        ) {
                            self.state = InitialParsed(SpellingInitial::CH);
                        } else {
                            return Some(alph.to_str(self.configs.p_strict));
                        }
                    }
                    Alphabet::S => {
                        if alph.diacritics.is_empty() {
                            self.state = ZCSParsed(ZCS::S);
                        } else if matches!(
                            &alph.diacritics[..],
                            &[pinyin_token::Diacritic::Circumflex]
                        ) {
                            self.state = InitialParsed(SpellingInitial::SH);
                        } else {
                            return Some(alph.to_str(self.configs.p_strict));
                        }
                    }
                    // The vowel belongs to the final, so it is put back and the
                    // syllable gets the empty initial.
                    Alphabet::A | Alphabet::E | Alphabet::O => {
                        self.it.rewind(1);
                        self.state = InitialParsed(SpellingInitial::ZeroAEO);
                    }

                    Alphabet::I | Alphabet::U | Alphabet::Ŋ => panic!(
                        "unexpected alphabet {:?} found at the beginning of a word",
                        alph.alphabet,
                    ),
                },

                // Resolve a pending `z` / `c` / `s`: an `h` completes the digraph;
                // any other letter is put back to be read as part of the final.
                (Some(Alph(alph)), ZCSParsed(zcs)) => {
                    if alph.alphabet == Alphabet::H {
                        self.state = match zcs {
                            ZCS::Z => InitialParsed(SpellingInitial::ZH),
                            ZCS::C => InitialParsed(SpellingInitial::CH),
                            ZCS::S => InitialParsed(SpellingInitial::SH),
                        }
                    } else {
                        self.it.rewind(1);
                        self.state = match zcs {
                            ZCS::Z => InitialParsed(SpellingInitial::Z),
                            ZCS::C => InitialParsed(SpellingInitial::C),
                            ZCS::S => InitialParsed(SpellingInitial::S),
                        }
                    }
                }

                // The initial is known; now find the final.
                (Some(Alph(_)), InitialParsed(initial)) => {
                    use finals::Candidate;
                    // Put the letter back so that the candidate search sees the
                    // whole final.
                    self.it.rewind(1);
                    let candidates = self.it.get_candidates_without_rhotic(self.configs.p_strict);

                    assert!(
                        !candidates.is_empty(),
                        "no adequate candidate for finals (-an, -ian, ...) is found, after the initial {initial:?}"
                    );

                    // `candidates` is cloned for the loop because it is still needed
                    // for the panic message after the loop.
                    for Candidate { ŋ, fin, tone } in candidates.clone() {
                        // Number of tokens this candidate covers.
                        // NOTE(review): presumably one shorter when `ŋ` is set because
                        // `fin` spells the sound with two letters while the input used
                        // the single letter `ŋ`; confirm in `finals`.
                        let fin_len = fin.len() - usize::from(ŋ);
                        // The cursor is moved past the candidate tentatively; each arm
                        // below either accepts the candidate and returns, or rewinds by
                        // `fin_len` and tries the next candidate.
                        self.it.advance(fin_len);

                        match self.it.peek(0) {
                            // End of input: the syllable is complete. The cursor goes
                            // one further past the end; later reads still return `None`.
                            None => {
                                self.it.advance(1);
                                self.state = AfterSyllablePossiblyConsumingApostrophe;
                                return Some(format!(
                                    "{}{}",
                                    initial,
                                    finals::FinalWithTone { fin, tone }
                                ));
                            }

                            // An apostrophe closes the syllable and is consumed with it.
                            Some(Apostrophe) => {
                                self.it.advance(1);

                                if self.configs.p_strict.is_strict() {
                                    // Look at the token right after the apostrophe.
                                    let a_e_o = match self.it.peek(0) {
                                        Some(Alph(a)) => matches!(
                                            a.alphabet,
                                            Alphabet::A | Alphabet::E | Alphabet::O
                                        ),
                                        _ => false,
                                    };

                                    assert!(a_e_o, "In strict mode, an apostrophe must be followed by either 'a', 'e' or 'o'");
                                }

                                self.state = AfterSyllablePossiblyConsumingApostrophe;
                                return Some(format!(
                                    "{}{}",
                                    initial,
                                    finals::FinalWithTone { fin, tone }
                                ));
                            }

                            // Any other non-letter also closes the syllable; it is left
                            // in the stream to be handled by the next iteration.
                            Some(Punctuation(_) | LightToneMarker | Space(_) | Others(_)) => {
                                self.state = AfterSyllablePossiblyConsumingApostrophe;
                                return Some(format!(
                                    "{}{}",
                                    initial,
                                    finals::FinalWithTone { fin, tone }
                                ));
                            }

                            Some(Alph(alph)) => match alph.alphabet {
                                // A vowel or `ŋ` directly after the candidate means the
                                // candidate does not fit: undo and try the next one.
                                Alphabet::A
                                | Alphabet::E
                                | Alphabet::I
                                | Alphabet::O
                                | Alphabet::U
                                | Alphabet::Ŋ => {
                                    self.it.rewind(fin_len);
                                    continue;
                                }

                                // `r` is either the initial of the next syllable (when a
                                // vowel follows it) or a rhotic suffix of this syllable.
                                Alphabet::R => {
                                    let vowel_follows = match self.it.peek(1) {
                                        Some(Alph(a)) => matches!(
                                            a.alphabet,
                                            Alphabet::A
                                                | Alphabet::E
                                                | Alphabet::I
                                                | Alphabet::O
                                                | Alphabet::U
                                        ),
                                        _ => false,
                                    };
                                    if vowel_follows {
                                        // The `r` starts the next syllable; leave it in place.
                                        self.state = AfterSyllablePossiblyConsumingApostrophe;
                                        return Some(format!(
                                            "{}{}",
                                            initial,
                                            finals::FinalWithTone { fin, tone }
                                        ));
                                    }
                                    // The `r` belongs to this syllable: consume it and
                                    // append it to the output.
                                    self.it.advance(1);
                                    self.state = AfterSyllablePossiblyConsumingApostrophe;
                                    return Some(format!(
                                        "{}{}r",
                                        initial,
                                        finals::FinalWithTone { fin, tone }
                                    ));
                                }

                                // `g` followed by a vowel starts the next syllable;
                                // otherwise this candidate is rejected and the next
                                // candidate is tried.
                                Alphabet::G => {
                                    let vowel_follows = match self.it.peek(1) {
                                        Some(Alph(a)) => matches!(
                                            a.alphabet,
                                            Alphabet::A
                                                | Alphabet::E
                                                | Alphabet::I
                                                | Alphabet::O
                                                | Alphabet::U
                                        ),
                                        _ => false,
                                    };
                                    if vowel_follows {
                                        self.state = AfterSyllablePossiblyConsumingApostrophe;
                                        return Some(format!(
                                            "{}{}",
                                            initial,
                                            finals::FinalWithTone { fin, tone }
                                        ));
                                    }
                                    self.it.rewind(fin_len);
                                    continue;
                                }

                                // Same rule as for `g`.
                                Alphabet::N => {
                                    let vowel_follows = match self.it.peek(1) {
                                        Some(Alph(a)) => matches!(
                                            a.alphabet,
                                            Alphabet::A
                                                | Alphabet::E
                                                | Alphabet::I
                                                | Alphabet::O
                                                | Alphabet::U
                                        ),
                                        _ => false,
                                    };
                                    if vowel_follows {
                                        self.state = AfterSyllablePossiblyConsumingApostrophe;
                                        return Some(format!(
                                            "{}{}",
                                            initial,
                                            finals::FinalWithTone { fin, tone }
                                        ));
                                    }
                                    self.it.rewind(fin_len);
                                    continue;
                                }

                                // Any other letter starts the next syllable.
                                _ => {
                                    self.state = AfterSyllablePossiblyConsumingApostrophe;
                                    return Some(format!(
                                        "{}{}",
                                        initial,
                                        finals::FinalWithTone { fin, tone }
                                    ));
                                }
                            },
                        }
                    }
                    // Every candidate was rejected.
                    panic!(
                        "no adequate candidate for finals (-an, -ian, ...) found, among possible candidates {candidates:?}"
                    );
                }
            }
        }
    }
}
633
634mod finals;
635
/// A bare `z`, `c` or `s` that has been read while it is still undecided
/// whether an `h` follows to form `zh` / `ch` / `sh`.
// The name is simply the three letters it covers, hence the all-caps identifier.
#[allow(clippy::upper_case_acronyms)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
enum ZCS {
    /// The letter `z`.
    Z,
    /// The letter `c`.
    C,
    /// The letter `s`.
    S,
}
643
/// The initial of a syllable as it is spelled, including `y`, `w` and the
/// empty initial used before `a` / `e` / `o`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
enum SpellingInitial {
    B,
    P,
    M,
    F,
    D,
    T,
    N,
    L,
    G,
    K,
    H,
    J,
    Q,
    X,
    ZH,
    CH,
    SH,
    R,
    Z,
    C,
    S,
    Y,
    W,
    /// No written initial; the syllable starts directly with `a`, `e` or `o`.
    ZeroAEO,
}

impl std::fmt::Display for SpellingInitial {
    /// Writes the lowercase spelling of the initial; [`SpellingInitial::ZeroAEO`]
    /// writes nothing. Width and padding options of the formatter are not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let spelling = match self {
            Self::B => "b",
            Self::P => "p",
            Self::M => "m",
            Self::F => "f",
            Self::D => "d",
            Self::T => "t",
            Self::N => "n",
            Self::L => "l",
            Self::G => "g",
            Self::K => "k",
            Self::H => "h",
            Self::J => "j",
            Self::Q => "q",
            Self::X => "x",
            Self::ZH => "zh",
            Self::CH => "ch",
            Self::SH => "sh",
            Self::R => "r",
            Self::Z => "z",
            Self::C => "c",
            Self::S => "s",
            Self::Y => "y",
            Self::W => "w",
            Self::ZeroAEO => "",
        };
        f.write_str(spelling)
    }
}