Skip to main content

pinyin/
lib.rs

1mod error;
2mod loader;
3mod matcher;
4mod pinyin;
5
6use std::fmt;
7use std::ops::Index;
8use std::sync::OnceLock;
9
10pub use error::PinyinError;
11use error::Result;
12use loader::Lexicon;
13use matcher::{Matcher, Segment, group_unmatched_for_sentence};
14use pinyin::{first_pronunciation, format_phrase, initials_token, slug_token, split_phrase};
15
/// Shared dictionary; filled on first use by `lexicon()` via `Lexicon::new`.
static LEXICON: OnceLock<Lexicon> = OnceLock::new();
/// Matcher built from `Lexicon::default_entries`; filled on first use by `default_matcher()`.
static DEFAULT_MATCHER: OnceLock<Matcher> = OnceLock::new();
/// Matcher built from `Lexicon::plain_entries`; filled on first use by `plain_matcher()`.
static PLAIN_MATCHER: OnceLock<Matcher> = OnceLock::new();
/// Matcher built from `Lexicon::surname_entries`; filled on first use by `surname_matcher()`.
static SURNAME_MATCHER: OnceLock<Matcher> = OnceLock::new();

/// Delimiters accepted by `Pinyin::permalink_with`. The empty string is
/// allowed and joins the slug tokens with nothing between them.
const VALID_DELIMITERS: [&str; 4] = ["-", "_", ".", ""];
22
/// Selects how the tone of each syllable is rendered.
///
/// The unit tests in this file show the three renderings for reference:
/// `Mark` gives `nǐ hǎo`, `Number` gives `ni3 hao3`, and `None` drops the
/// tone altogether.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum ToneStyle {
    /// Tone written as a diacritic on the vowel. This is the default.
    #[default]
    Mark,
    /// Tone written as a trailing digit.
    Number,
    /// Tone omitted.
    None,
}
30
/// Selects how the vowel `ü` is spelled in formatted output.
///
/// The unit tests in this file show `Umlaut` as `lǚ`, `V` as `lv` and `Yu`
/// as `lyu`.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum YuStyle {
    /// Keep the `ü` character. This is the default.
    #[default]
    Umlaut,
    /// Spell it `v`.
    V,
    /// Spell it `yu`.
    Yu,
    /// NOTE(review): presumably spelled as a plain `u`; the rendering lives
    /// in `format_phrase` and is not exercised here — confirm.
    U,
}
39
/// A single output token: a piece of text paired with the pinyin stored for it.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct PinyinWord {
    /// The text this token stands for.
    pub text: String,
    /// The pinyin recorded for `text`, before any tone/`ü` formatting.
    pub pinyin: String,
}

impl PinyinWord {
    /// Builds a word from any two values convertible into owned strings.
    pub fn new(text: impl Into<String>, pinyin: impl Into<String>) -> Self {
        let (text, pinyin) = (text.into(), pinyin.into());
        PinyinWord { text, pinyin }
    }
}
54
55#[derive(Debug, Clone, PartialEq, Eq)]
56pub struct PinyinResult {
57    words: Vec<PinyinWord>,
58    tone_style: ToneStyle,
59    yu_style: YuStyle,
60}
61
62impl PinyinResult {
63    pub fn new(words: Vec<PinyinWord>) -> Self {
64        Self {
65            words,
66            tone_style: ToneStyle::Mark,
67            yu_style: YuStyle::Umlaut,
68        }
69    }
70
71    pub fn with_tone_style(mut self, style: ToneStyle) -> Self {
72        self.tone_style = style;
73        self
74    }
75
76    pub fn without_tone(mut self) -> Self {
77        self.tone_style = ToneStyle::None;
78        if self.yu_style == YuStyle::Umlaut {
79            self.yu_style = YuStyle::V;
80        }
81        self
82    }
83
84    pub fn flatten(self) -> Self {
85        self
86    }
87
88    pub fn yu_to_v(mut self) -> Self {
89        self.yu_style = YuStyle::V;
90        self
91    }
92
93    pub fn yu_to_yu(mut self) -> Self {
94        self.yu_style = YuStyle::Yu;
95        self
96    }
97
98    pub fn yu_to_u(mut self) -> Self {
99        self.yu_style = YuStyle::U;
100        self
101    }
102
103    pub fn yu_to_umlaut(mut self) -> Self {
104        self.yu_style = YuStyle::Umlaut;
105        self
106    }
107
108    pub fn len(&self) -> usize {
109        self.words.len()
110    }
111
112    pub fn is_empty(&self) -> bool {
113        self.words.is_empty()
114    }
115
116    pub fn words(&self) -> &[PinyinWord] {
117        &self.words
118    }
119
120    pub fn iter(&self) -> impl Iterator<Item = String> + '_ {
121        self.words
122            .iter()
123            .map(|word| format_phrase(&word.pinyin, self.tone_style, self.yu_style))
124    }
125
126    pub fn to_vec(&self) -> Vec<String> {
127        self.iter().collect()
128    }
129
130    pub fn join(&self, separator: &str) -> String {
131        self.to_string_with(separator)
132    }
133
134    pub fn to_string_with(&self, separator: &str) -> String {
135        self.iter().collect::<Vec<_>>().join(separator)
136    }
137
138    pub fn to_permalink(&self) -> String {
139        self.clone().without_tone().yu_to_v().to_string_with("-")
140    }
141}
142
143impl fmt::Display for PinyinResult {
144    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
145        formatter.write_str(&self.to_string_with(" "))
146    }
147}
148
149impl Index<usize> for PinyinResult {
150    type Output = PinyinWord;
151
152    fn index(&self, index: usize) -> &Self::Output {
153        &self.words[index]
154    }
155}
156
157impl IntoIterator for PinyinResult {
158    type IntoIter = std::vec::IntoIter<PinyinWord>;
159    type Item = PinyinWord;
160
161    fn into_iter(self) -> Self::IntoIter {
162        self.words.into_iter()
163    }
164}
165
/// Options consumed by `convert_with_config`.
#[derive(Clone, Debug)]
pub struct PinyinConfig {
    /// When `false`, every produced token is reduced with
    /// `first_pronunciation`; when `true`, tokens are returned as looked up.
    pub enable_polyphone: bool,
    /// When `true`, input is segmented with the default matcher; when
    /// `false`, each character is looked up on its own.
    pub prefer_long_words: bool,
    /// Upper bound on the input's byte length (`str::len`); must be non-zero.
    pub max_input_length: usize,
}
172
173impl Default for PinyinConfig {
174    fn default() -> Self {
175        Self {
176            enable_polyphone: false,
177            prefer_long_words: true,
178            max_input_length: 10_000,
179        }
180    }
181}
182
183impl PinyinConfig {
184    pub fn new() -> Self {
185        Self::default()
186    }
187
188    pub fn with_polyphone(mut self, enabled: bool) -> Self {
189        self.enable_polyphone = enabled;
190        self
191    }
192
193    pub fn with_long_words(mut self, enabled: bool) -> Self {
194        self.prefer_long_words = enabled;
195        self
196    }
197
198    pub fn with_max_length(mut self, length: usize) -> Self {
199        self.max_input_length = length;
200        self
201    }
202
203    pub fn validate(&self) -> Result<()> {
204        if self.max_input_length == 0 {
205            return Err(PinyinError::InvalidMaxInputLength(self.max_input_length));
206        }
207        Ok(())
208    }
209}
210
211#[derive(Debug, Clone)]
212pub struct Converter {
213    input: String,
214    tone_style: ToneStyle,
215    yu_style: YuStyle,
216    surname_mode: bool,
217    only_hans: bool,
218    keep_punctuation: bool,
219    split_words: bool,
220}
221
222impl Converter {
223    pub fn new(input: &str) -> Self {
224        Self {
225            input: input.to_string(),
226            tone_style: ToneStyle::Mark,
227            yu_style: YuStyle::Umlaut,
228            surname_mode: false,
229            only_hans: false,
230            keep_punctuation: true,
231            split_words: true,
232        }
233    }
234
235    pub fn with_tone_style(mut self, style: ToneStyle) -> Self {
236        self.tone_style = style;
237        self
238    }
239
240    pub fn without_tone(mut self) -> Self {
241        self.tone_style = ToneStyle::None;
242        if self.yu_style == YuStyle::Umlaut {
243            self.yu_style = YuStyle::V;
244        }
245        self
246    }
247
248    pub fn yu_to_v(mut self) -> Self {
249        self.yu_style = YuStyle::V;
250        self
251    }
252
253    pub fn yu_to_yu(mut self) -> Self {
254        self.yu_style = YuStyle::Yu;
255        self
256    }
257
258    pub fn yu_to_u(mut self) -> Self {
259        self.yu_style = YuStyle::U;
260        self
261    }
262
263    pub fn yu_to_umlaut(mut self) -> Self {
264        self.yu_style = YuStyle::Umlaut;
265        self
266    }
267
268    pub fn flatten(self) -> Self {
269        self
270    }
271
272    pub fn as_surnames(mut self) -> Self {
273        self.surname_mode = true;
274        self
275    }
276
277    pub fn only_hans(mut self) -> Self {
278        self.only_hans = true;
279        self
280    }
281
282    pub fn no_punctuation(mut self) -> Self {
283        self.keep_punctuation = false;
284        self
285    }
286
287    pub fn raw_words(mut self) -> Self {
288        self.split_words = false;
289        self
290    }
291
292    pub fn convert(&self) -> PinyinResult {
293        let segments = if self.surname_mode {
294            name_segments(&self.input)
295        } else {
296            default_matcher().segments(&self.input)
297        };
298
299        let words = result_words(
300            group_unmatched_for_sentence(segments),
301            self.only_hans,
302            self.keep_punctuation,
303            self.split_words,
304        );
305
306        PinyinResult::new(words)
307            .with_tone_style(self.tone_style)
308            .with_yu_style(self.yu_style)
309    }
310
311    pub fn to_string_with(&self, separator: &str) -> String {
312        self.convert().to_string_with(separator)
313    }
314
315    pub fn to_permalink(&self) -> String {
316        self.clone()
317            .without_tone()
318            .yu_to_v()
319            .no_punctuation()
320            .convert()
321            .to_permalink()
322    }
323}
324
325impl fmt::Display for Converter {
326    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
327        formatter.write_str(&self.convert().to_string())
328    }
329}
330
331trait WithYuStyle {
332    fn with_yu_style(self, style: YuStyle) -> Self;
333}
334
335impl WithYuStyle for PinyinResult {
336    fn with_yu_style(mut self, style: YuStyle) -> Self {
337        self.yu_style = style;
338        self
339    }
340}
341
342pub struct Pinyin;
343
344impl Pinyin {
345    pub fn sentence(input: &str) -> PinyinResult {
346        Converter::new(input).convert()
347    }
348
349    pub fn phrase(input: &str) -> PinyinResult {
350        Converter::new(input).no_punctuation().convert()
351    }
352
353    pub fn full_sentence(input: &str) -> PinyinResult {
354        Self::sentence(input)
355    }
356
357    pub fn permalink(input: &str) -> String {
358        Self::permalink_with(input, "-").expect("default delimiter is valid")
359    }
360
361    pub fn permalink_with(input: &str, delimiter: &str) -> Result<String> {
362        if !VALID_DELIMITERS.contains(&delimiter) {
363            return Err(PinyinError::invalid_delimiter(delimiter));
364        }
365
366        let tokens = Converter::new(input)
367            .without_tone()
368            .yu_to_v()
369            .no_punctuation()
370            .convert()
371            .iter()
372            .map(|token| slug_token(&token))
373            .filter(|token| !token.is_empty())
374            .collect::<Vec<_>>();
375
376        Ok(tokens.join(delimiter))
377    }
378
379    pub fn abbr(input: &str) -> PinyinResult {
380        let words = Converter::new(input)
381            .without_tone()
382            .yu_to_v()
383            .no_punctuation()
384            .convert()
385            .iter()
386            .filter_map(|token| {
387                let initial = initials_token(&token);
388                (!initial.is_empty()).then(|| PinyinWord::new(token, initial))
389            })
390            .collect();
391
392        PinyinResult::new(words).without_tone().yu_to_v()
393    }
394
395    pub fn name_abbr(input: &str) -> PinyinResult {
396        let words = Self::name(input)
397            .without_tone()
398            .yu_to_v()
399            .iter()
400            .filter_map(|token| {
401                let initial = initials_token(&token);
402                (!initial.is_empty()).then(|| PinyinWord::new(token, initial))
403            })
404            .collect();
405
406        PinyinResult::new(words).without_tone().yu_to_v()
407    }
408
409    pub fn name(input: &str) -> PinyinResult {
410        Converter::new(input).as_surnames().convert()
411    }
412
413    pub fn passport_name(input: &str) -> PinyinResult {
414        Self::name(input).without_tone().yu_to_yu()
415    }
416
417    pub fn chars(input: &str) -> PinyinResult {
418        let words = input
419            .chars()
420            .filter_map(|ch| {
421                lexicon().char_pinyin(ch).map(|pinyin| {
422                    PinyinWord::new(ch.to_string(), first_pronunciation(pinyin).to_string())
423                })
424            })
425            .collect();
426        PinyinResult::new(words)
427    }
428
429    pub fn heteronym(input: &str) -> Vec<(char, Vec<String>)> {
430        input
431            .chars()
432            .filter_map(|ch| {
433                lexicon().heteronyms(ch).map(|items| {
434                    (
435                        ch,
436                        items
437                            .into_iter()
438                            .map(str::to_string)
439                            .collect::<Vec<String>>(),
440                    )
441                })
442            })
443            .collect()
444    }
445}
446
447pub fn match_word_pinyin(input: &str) -> Vec<(String, String)> {
448    default_matcher()
449        .segments(input)
450        .into_iter()
451        .filter(|segment| segment.matched)
452        .map(|segment| (segment.text, segment.pinyin))
453        .collect()
454}
455
456pub fn convert(input: &str) -> Vec<String> {
457    default_matcher()
458        .segments(input)
459        .into_iter()
460        .map(|segment| segment.pinyin)
461        .collect()
462}
463
464pub fn convert_as_surname(input: &str) -> Vec<String> {
465    surname_matcher()
466        .segments(input)
467        .into_iter()
468        .map(|segment| segment.pinyin)
469        .collect()
470}
471
472pub fn convert_safe(input: &str) -> Result<Vec<String>> {
473    convert_with_config(input, &PinyinConfig::default())
474}
475
476pub fn convert_with_config(input: &str, config: &PinyinConfig) -> Result<Vec<String>> {
477    config.validate()?;
478    if input.len() > config.max_input_length {
479        return Err(PinyinError::InputTooLong {
480            actual: input.len(),
481            max: config.max_input_length,
482        });
483    }
484
485    let mut result = if config.prefer_long_words {
486        convert(input)
487    } else {
488        input
489            .chars()
490            .map(|ch| {
491                lexicon()
492                    .char_pinyin(ch)
493                    .map(str::to_string)
494                    .unwrap_or_else(|| ch.to_string())
495            })
496            .collect()
497    };
498
499    if !config.enable_polyphone {
500        for item in &mut result {
501            *item = first_pronunciation(item).to_string();
502        }
503    }
504
505    Ok(result)
506}
507
508fn result_words(
509    segments: Vec<Segment>,
510    only_hans: bool,
511    keep_punctuation: bool,
512    split_words: bool,
513) -> Vec<PinyinWord> {
514    let mut words = Vec::with_capacity(segments.len());
515
516    for segment in segments {
517        if only_hans && !segment.matched {
518            continue;
519        }
520        if !keep_punctuation && is_punctuation_token(&segment.text) {
521            continue;
522        }
523
524        if segment.matched {
525            push_matched_words(&mut words, &segment, split_words);
526        } else if !segment.text.trim().is_empty() {
527            words.push(PinyinWord::new(segment.text, segment.pinyin));
528        }
529    }
530
531    words
532}
533
534fn push_matched_words(words: &mut Vec<PinyinWord>, segment: &Segment, split_words: bool) {
535    let char_count = segment.text.chars().count();
536    let syllables = split_phrase(&segment.pinyin);
537
538    if !split_words {
539        let pinyin = if char_count == 1 {
540            first_pronunciation(&segment.pinyin).to_string()
541        } else {
542            segment.pinyin.clone()
543        };
544        words.push(PinyinWord::new(segment.text.clone(), pinyin));
545        return;
546    }
547
548    if char_count == 1 {
549        words.push(PinyinWord::new(
550            segment.text.clone(),
551            first_pronunciation(&segment.pinyin).to_string(),
552        ));
553        return;
554    }
555
556    let chars = segment.text.chars().collect::<Vec<_>>();
557    if chars.len() == syllables.len() {
558        for (ch, syllable) in chars.into_iter().zip(syllables) {
559            words.push(PinyinWord::new(ch.to_string(), syllable.to_string()));
560        }
561    } else {
562        for syllable in syllables {
563            words.push(PinyinWord::new(segment.text.clone(), syllable.to_string()));
564        }
565    }
566}
567
568fn name_segments(input: &str) -> Vec<Segment> {
569    let Some(prefix) = lexicon().longest_surname_prefix(input) else {
570        return default_matcher().segments(input);
571    };
572
573    let Some(pinyin) = lexicon().surname_pinyin(prefix) else {
574        return default_matcher().segments(input);
575    };
576
577    let mut segments = vec![Segment {
578        text: prefix.to_string(),
579        pinyin: pinyin.to_string(),
580        matched: true,
581    }];
582    segments.extend(plain_matcher().segments(&input[prefix.len()..]));
583    segments
584}
585
/// Returns the shared lexicon, constructing it with `Lexicon::new` on the
/// first call.
fn lexicon() -> &'static Lexicon {
    LEXICON.get_or_init(Lexicon::new)
}
589
/// Returns the shared matcher over the lexicon's default entries, building
/// it on the first call.
fn default_matcher() -> &'static Matcher {
    DEFAULT_MATCHER.get_or_init(|| Matcher::new(lexicon().default_entries()))
}
593
/// Returns the shared matcher over the lexicon's plain entries, building it
/// on the first call. Used for the part of a name that follows the surname.
fn plain_matcher() -> &'static Matcher {
    PLAIN_MATCHER.get_or_init(|| Matcher::new(lexicon().plain_entries()))
}
597
/// Returns the shared matcher over the lexicon's surname entries, building
/// it on the first call.
fn surname_matcher() -> &'static Matcher {
    SURNAME_MATCHER.get_or_init(|| Matcher::new(lexicon().surname_entries()))
}
601
/// Reports whether `token` contains no alphanumeric character at all.
///
/// An empty or whitespace-only token therefore also counts as punctuation.
fn is_punctuation_token(token: &str) -> bool {
    !token.chars().any(char::is_alphanumeric)
}
605
// End-to-end checks of the public API; expected values depend on the
// dictionary data loaded by the lexicon.
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    // `convert` returns one token per dictionary word, with the word's
    // syllables kept together in that token.
    #[test]
    fn converts_with_longest_dictionary_matches() {
        assert_eq!(convert("你好世界"), ["nǐ hǎo", "shì jiè"]);
        assert_eq!(
            convert("中国人喜欢中国吃饭"),
            ["zhōng guó rén", "xǐ huan", "zhōng guó", "chī fàn"]
        );
    }

    // Text with no dictionary entry comes back character by character.
    #[test]
    fn keeps_unmatched_text_as_tokens() {
        assert_eq!(convert("Hi!"), ["H", "i", "!"]);
    }

    // `sentence` keeps punctuation as its own token; `phrase` drops it.
    #[test]
    fn sentence_splits_words_into_syllables() {
        assert_eq!(
            Pinyin::sentence("你好,世界").to_string(),
            "nǐ hǎo , shì jiè"
        );
        assert_eq!(Pinyin::phrase("你好,世界").to_string(), "nǐ hǎo shì jiè");
    }

    // Covers numbered tones, the default marks with `ü`, and the switch of
    // `ü` to `v` that `without_tone` performs.
    #[test]
    fn formats_tone_styles_correctly() {
        assert_eq!(
            Converter::new("你好")
                .with_tone_style(ToneStyle::Number)
                .to_string(),
            "ni3 hao3"
        );
        assert_eq!(Converter::new("旅行").to_string(), "lǚ xíng");
        assert_eq!(Converter::new("旅行").without_tone().to_string(), "lv xing");
    }

    // Surname readings take precedence at the start of a name, including a
    // two-character surname; passport names spell `ü` as `yu`.
    #[test]
    fn handles_names_and_passports() {
        assert_eq!(Pinyin::name("单某某").to_string(), "shàn mǒu mǒu");
        assert_eq!(Pinyin::name("单于单").to_string(), "chán yú dān");
        assert_eq!(Pinyin::passport_name("吕秀才").to_string(), "lyu xiu cai");
    }

    // Slugs accept only whitelisted delimiters; abbreviations keep the
    // initial of every syllable.
    #[test]
    fn builds_permalink_and_abbr() {
        assert_eq!(
            Pinyin::permalink("带着希望去旅行"),
            "dai-zhe-xi-wang-qu-lv-xing"
        );
        assert_eq!(
            Pinyin::permalink_with("带着希望去旅行", "_").unwrap(),
            "dai_zhe_xi_wang_qu_lv_xing"
        );
        assert!(Pinyin::permalink_with("你好", "=").is_err());
        assert_eq!(Pinyin::abbr("北京大学").to_string(), "b j d x");
        assert_eq!(Pinyin::name_abbr("单某某").to_string(), "s m m");
    }

    // Per-character lookup when long-word matching is off, and rejection of
    // input longer than the configured limit.
    #[test]
    fn supports_configured_conversion() {
        let no_words = PinyinConfig::new().with_long_words(false);
        assert_eq!(
            convert_with_config("你好", &no_words).unwrap(),
            ["nǐ", "hǎo"]
        );

        let err = convert_with_config("你好", &PinyinConfig::new().with_max_length(1));
        assert!(matches!(err, Err(PinyinError::InputTooLong { .. })));
    }

    // `chars` uses the first reading of each character; `heteronym` exposes
    // all of them.
    #[test]
    fn exposes_chars_and_heteronyms() {
        assert_eq!(Pinyin::chars("重庆").to_string(), "zhòng qìng");
        assert!(Pinyin::heteronym("重").iter().any(|(_, items)| {
            items.contains(&"zhòng".to_string()) && items.contains(&"chóng".to_string())
        }));
    }
}