1mod error;
2mod loader;
3mod matcher;
4mod pinyin;
5
6use std::fmt;
7use std::ops::Index;
8use std::sync::OnceLock;
9
10pub use error::PinyinError;
11use error::Result;
12use loader::Lexicon;
13use matcher::{Matcher, Segment, group_unmatched_for_sentence};
14use pinyin::{first_pronunciation, format_phrase, initials_token, slug_token, split_phrase};
15
// Process-wide, lazily initialised singletons. Each cell is filled on first use by the
// accessor function further down in this file (`lexicon`, `default_matcher`,
// `plain_matcher`, `surname_matcher`).
static LEXICON: OnceLock<Lexicon> = OnceLock::new();
static DEFAULT_MATCHER: OnceLock<Matcher> = OnceLock::new();
static PLAIN_MATCHER: OnceLock<Matcher> = OnceLock::new();
static SURNAME_MATCHER: OnceLock<Matcher> = OnceLock::new();

/// Delimiters accepted by `Pinyin::permalink_with`; the empty string joins tokens directly.
const VALID_DELIMITERS: [&str; 4] = ["-", "_", ".", ""];
22
/// Selects how tones are rendered when stored pinyin is formatted.
///
/// The value is handed to `format_phrase`, which performs the actual rendering.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ToneStyle {
    /// Default style; this file's tests expect e.g. `nǐ hǎo`.
    #[default]
    Mark,
    /// This file's tests expect e.g. `ni3 hao3`.
    Number,
    /// Style selected by `without_tone`; this file's tests expect e.g. `xing` for `xíng`.
    None,
}
30
/// Selects how the vowel `ü` is rendered when stored pinyin is formatted.
///
/// The value is handed to `format_phrase`, which performs the actual rendering.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum YuStyle {
    /// Default style; this file's tests expect e.g. `lǚ`.
    #[default]
    Umlaut,
    /// This file's tests expect e.g. `lv` (used for toneless output and permalinks).
    V,
    /// This file's tests expect e.g. `lyu` (used by `Pinyin::passport_name`).
    Yu,
    /// NOTE(review): presumably renders `ü` as a plain `u`; not exercised by the tests here.
    U,
}
39
/// A piece of text paired with the pinyin stored for it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PinyinWord {
    /// The text this entry belongs to.
    pub text: String,
    /// The pinyin stored for `text`.
    pub pinyin: String,
}

impl PinyinWord {
    /// Builds a word from any two values convertible into `String`.
    pub fn new(text: impl Into<String>, pinyin: impl Into<String>) -> Self {
        let (text, pinyin) = (text.into(), pinyin.into());
        Self { text, pinyin }
    }
}
54
55#[derive(Debug, Clone, PartialEq, Eq)]
56pub struct PinyinResult {
57 words: Vec<PinyinWord>,
58 tone_style: ToneStyle,
59 yu_style: YuStyle,
60}
61
62impl PinyinResult {
63 pub fn new(words: Vec<PinyinWord>) -> Self {
64 Self {
65 words,
66 tone_style: ToneStyle::Mark,
67 yu_style: YuStyle::Umlaut,
68 }
69 }
70
71 pub fn with_tone_style(mut self, style: ToneStyle) -> Self {
72 self.tone_style = style;
73 self
74 }
75
76 pub fn without_tone(mut self) -> Self {
77 self.tone_style = ToneStyle::None;
78 if self.yu_style == YuStyle::Umlaut {
79 self.yu_style = YuStyle::V;
80 }
81 self
82 }
83
84 pub fn flatten(self) -> Self {
85 self
86 }
87
88 pub fn yu_to_v(mut self) -> Self {
89 self.yu_style = YuStyle::V;
90 self
91 }
92
93 pub fn yu_to_yu(mut self) -> Self {
94 self.yu_style = YuStyle::Yu;
95 self
96 }
97
98 pub fn yu_to_u(mut self) -> Self {
99 self.yu_style = YuStyle::U;
100 self
101 }
102
103 pub fn yu_to_umlaut(mut self) -> Self {
104 self.yu_style = YuStyle::Umlaut;
105 self
106 }
107
108 pub fn len(&self) -> usize {
109 self.words.len()
110 }
111
112 pub fn is_empty(&self) -> bool {
113 self.words.is_empty()
114 }
115
116 pub fn words(&self) -> &[PinyinWord] {
117 &self.words
118 }
119
120 pub fn iter(&self) -> impl Iterator<Item = String> + '_ {
121 self.words
122 .iter()
123 .map(|word| format_phrase(&word.pinyin, self.tone_style, self.yu_style))
124 }
125
126 pub fn to_vec(&self) -> Vec<String> {
127 self.iter().collect()
128 }
129
130 pub fn join(&self, separator: &str) -> String {
131 self.to_string_with(separator)
132 }
133
134 pub fn to_string_with(&self, separator: &str) -> String {
135 self.iter().collect::<Vec<_>>().join(separator)
136 }
137
138 pub fn to_permalink(&self) -> String {
139 self.clone().without_tone().yu_to_v().to_string_with("-")
140 }
141}
142
143impl fmt::Display for PinyinResult {
144 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
145 formatter.write_str(&self.to_string_with(" "))
146 }
147}
148
149impl Index<usize> for PinyinResult {
150 type Output = PinyinWord;
151
152 fn index(&self, index: usize) -> &Self::Output {
153 &self.words[index]
154 }
155}
156
157impl IntoIterator for PinyinResult {
158 type IntoIter = std::vec::IntoIter<PinyinWord>;
159 type Item = PinyinWord;
160
161 fn into_iter(self) -> Self::IntoIter {
162 self.words.into_iter()
163 }
164}
165
/// Options consumed by `convert_with_config`.
#[derive(Debug, Clone)]
pub struct PinyinConfig {
    /// When `false`, every output item is reduced with `first_pronunciation`.
    pub enable_polyphone: bool,
    /// When `true`, conversion goes through the dictionary matcher (`convert`);
    /// otherwise each character is looked up on its own.
    pub prefer_long_words: bool,
    /// Upper bound compared against `input.len()`, i.e. the UTF-8 byte length.
    pub max_input_length: usize,
}

impl Default for PinyinConfig {
    /// Polyphones off, long-word matching on, inputs limited to 10 000 bytes.
    fn default() -> Self {
        const DEFAULT_MAX_INPUT_LENGTH: usize = 10_000;

        Self {
            max_input_length: DEFAULT_MAX_INPUT_LENGTH,
            prefer_long_words: true,
            enable_polyphone: false,
        }
    }
}
182
183impl PinyinConfig {
184 pub fn new() -> Self {
185 Self::default()
186 }
187
188 pub fn with_polyphone(mut self, enabled: bool) -> Self {
189 self.enable_polyphone = enabled;
190 self
191 }
192
193 pub fn with_long_words(mut self, enabled: bool) -> Self {
194 self.prefer_long_words = enabled;
195 self
196 }
197
198 pub fn with_max_length(mut self, length: usize) -> Self {
199 self.max_input_length = length;
200 self
201 }
202
203 pub fn validate(&self) -> Result<()> {
204 if self.max_input_length == 0 {
205 return Err(PinyinError::InvalidMaxInputLength(self.max_input_length));
206 }
207 Ok(())
208 }
209}
210
211#[derive(Debug, Clone)]
212pub struct Converter {
213 input: String,
214 tone_style: ToneStyle,
215 yu_style: YuStyle,
216 surname_mode: bool,
217 only_hans: bool,
218 keep_punctuation: bool,
219 split_words: bool,
220}
221
222impl Converter {
223 pub fn new(input: &str) -> Self {
224 Self {
225 input: input.to_string(),
226 tone_style: ToneStyle::Mark,
227 yu_style: YuStyle::Umlaut,
228 surname_mode: false,
229 only_hans: false,
230 keep_punctuation: true,
231 split_words: true,
232 }
233 }
234
235 pub fn with_tone_style(mut self, style: ToneStyle) -> Self {
236 self.tone_style = style;
237 self
238 }
239
240 pub fn without_tone(mut self) -> Self {
241 self.tone_style = ToneStyle::None;
242 if self.yu_style == YuStyle::Umlaut {
243 self.yu_style = YuStyle::V;
244 }
245 self
246 }
247
248 pub fn yu_to_v(mut self) -> Self {
249 self.yu_style = YuStyle::V;
250 self
251 }
252
253 pub fn yu_to_yu(mut self) -> Self {
254 self.yu_style = YuStyle::Yu;
255 self
256 }
257
258 pub fn yu_to_u(mut self) -> Self {
259 self.yu_style = YuStyle::U;
260 self
261 }
262
263 pub fn yu_to_umlaut(mut self) -> Self {
264 self.yu_style = YuStyle::Umlaut;
265 self
266 }
267
268 pub fn flatten(self) -> Self {
269 self
270 }
271
272 pub fn as_surnames(mut self) -> Self {
273 self.surname_mode = true;
274 self
275 }
276
277 pub fn only_hans(mut self) -> Self {
278 self.only_hans = true;
279 self
280 }
281
282 pub fn no_punctuation(mut self) -> Self {
283 self.keep_punctuation = false;
284 self
285 }
286
287 pub fn raw_words(mut self) -> Self {
288 self.split_words = false;
289 self
290 }
291
292 pub fn convert(&self) -> PinyinResult {
293 let segments = if self.surname_mode {
294 name_segments(&self.input)
295 } else {
296 default_matcher().segments(&self.input)
297 };
298
299 let words = result_words(
300 group_unmatched_for_sentence(segments),
301 self.only_hans,
302 self.keep_punctuation,
303 self.split_words,
304 );
305
306 PinyinResult::new(words)
307 .with_tone_style(self.tone_style)
308 .with_yu_style(self.yu_style)
309 }
310
311 pub fn to_string_with(&self, separator: &str) -> String {
312 self.convert().to_string_with(separator)
313 }
314
315 pub fn to_permalink(&self) -> String {
316 self.clone()
317 .without_tone()
318 .yu_to_v()
319 .no_punctuation()
320 .convert()
321 .to_permalink()
322 }
323}
324
325impl fmt::Display for Converter {
326 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
327 formatter.write_str(&self.convert().to_string())
328 }
329}
330
331trait WithYuStyle {
332 fn with_yu_style(self, style: YuStyle) -> Self;
333}
334
335impl WithYuStyle for PinyinResult {
336 fn with_yu_style(mut self, style: YuStyle) -> Self {
337 self.yu_style = style;
338 self
339 }
340}
341
342pub struct Pinyin;
343
344impl Pinyin {
345 pub fn sentence(input: &str) -> PinyinResult {
346 Converter::new(input).convert()
347 }
348
349 pub fn phrase(input: &str) -> PinyinResult {
350 Converter::new(input).no_punctuation().convert()
351 }
352
353 pub fn full_sentence(input: &str) -> PinyinResult {
354 Self::sentence(input)
355 }
356
357 pub fn permalink(input: &str) -> String {
358 Self::permalink_with(input, "-").expect("default delimiter is valid")
359 }
360
361 pub fn permalink_with(input: &str, delimiter: &str) -> Result<String> {
362 if !VALID_DELIMITERS.contains(&delimiter) {
363 return Err(PinyinError::invalid_delimiter(delimiter));
364 }
365
366 let tokens = Converter::new(input)
367 .without_tone()
368 .yu_to_v()
369 .no_punctuation()
370 .convert()
371 .iter()
372 .map(|token| slug_token(&token))
373 .filter(|token| !token.is_empty())
374 .collect::<Vec<_>>();
375
376 Ok(tokens.join(delimiter))
377 }
378
379 pub fn abbr(input: &str) -> PinyinResult {
380 let words = Converter::new(input)
381 .without_tone()
382 .yu_to_v()
383 .no_punctuation()
384 .convert()
385 .iter()
386 .filter_map(|token| {
387 let initial = initials_token(&token);
388 (!initial.is_empty()).then(|| PinyinWord::new(token, initial))
389 })
390 .collect();
391
392 PinyinResult::new(words).without_tone().yu_to_v()
393 }
394
395 pub fn name_abbr(input: &str) -> PinyinResult {
396 let words = Self::name(input)
397 .without_tone()
398 .yu_to_v()
399 .iter()
400 .filter_map(|token| {
401 let initial = initials_token(&token);
402 (!initial.is_empty()).then(|| PinyinWord::new(token, initial))
403 })
404 .collect();
405
406 PinyinResult::new(words).without_tone().yu_to_v()
407 }
408
409 pub fn name(input: &str) -> PinyinResult {
410 Converter::new(input).as_surnames().convert()
411 }
412
413 pub fn passport_name(input: &str) -> PinyinResult {
414 Self::name(input).without_tone().yu_to_yu()
415 }
416
417 pub fn chars(input: &str) -> PinyinResult {
418 let words = input
419 .chars()
420 .filter_map(|ch| {
421 lexicon().char_pinyin(ch).map(|pinyin| {
422 PinyinWord::new(ch.to_string(), first_pronunciation(pinyin).to_string())
423 })
424 })
425 .collect();
426 PinyinResult::new(words)
427 }
428
429 pub fn heteronym(input: &str) -> Vec<(char, Vec<String>)> {
430 input
431 .chars()
432 .filter_map(|ch| {
433 lexicon().heteronyms(ch).map(|items| {
434 (
435 ch,
436 items
437 .into_iter()
438 .map(str::to_string)
439 .collect::<Vec<String>>(),
440 )
441 })
442 })
443 .collect()
444 }
445}
446
447pub fn match_word_pinyin(input: &str) -> Vec<(String, String)> {
448 default_matcher()
449 .segments(input)
450 .into_iter()
451 .filter(|segment| segment.matched)
452 .map(|segment| (segment.text, segment.pinyin))
453 .collect()
454}
455
456pub fn convert(input: &str) -> Vec<String> {
457 default_matcher()
458 .segments(input)
459 .into_iter()
460 .map(|segment| segment.pinyin)
461 .collect()
462}
463
464pub fn convert_as_surname(input: &str) -> Vec<String> {
465 surname_matcher()
466 .segments(input)
467 .into_iter()
468 .map(|segment| segment.pinyin)
469 .collect()
470}
471
472pub fn convert_safe(input: &str) -> Result<Vec<String>> {
473 convert_with_config(input, &PinyinConfig::default())
474}
475
476pub fn convert_with_config(input: &str, config: &PinyinConfig) -> Result<Vec<String>> {
477 config.validate()?;
478 if input.len() > config.max_input_length {
479 return Err(PinyinError::InputTooLong {
480 actual: input.len(),
481 max: config.max_input_length,
482 });
483 }
484
485 let mut result = if config.prefer_long_words {
486 convert(input)
487 } else {
488 input
489 .chars()
490 .map(|ch| {
491 lexicon()
492 .char_pinyin(ch)
493 .map(str::to_string)
494 .unwrap_or_else(|| ch.to_string())
495 })
496 .collect()
497 };
498
499 if !config.enable_polyphone {
500 for item in &mut result {
501 *item = first_pronunciation(item).to_string();
502 }
503 }
504
505 Ok(result)
506}
507
508fn result_words(
509 segments: Vec<Segment>,
510 only_hans: bool,
511 keep_punctuation: bool,
512 split_words: bool,
513) -> Vec<PinyinWord> {
514 let mut words = Vec::with_capacity(segments.len());
515
516 for segment in segments {
517 if only_hans && !segment.matched {
518 continue;
519 }
520 if !keep_punctuation && is_punctuation_token(&segment.text) {
521 continue;
522 }
523
524 if segment.matched {
525 push_matched_words(&mut words, &segment, split_words);
526 } else if !segment.text.trim().is_empty() {
527 words.push(PinyinWord::new(segment.text, segment.pinyin));
528 }
529 }
530
531 words
532}
533
534fn push_matched_words(words: &mut Vec<PinyinWord>, segment: &Segment, split_words: bool) {
535 let char_count = segment.text.chars().count();
536 let syllables = split_phrase(&segment.pinyin);
537
538 if !split_words {
539 let pinyin = if char_count == 1 {
540 first_pronunciation(&segment.pinyin).to_string()
541 } else {
542 segment.pinyin.clone()
543 };
544 words.push(PinyinWord::new(segment.text.clone(), pinyin));
545 return;
546 }
547
548 if char_count == 1 {
549 words.push(PinyinWord::new(
550 segment.text.clone(),
551 first_pronunciation(&segment.pinyin).to_string(),
552 ));
553 return;
554 }
555
556 let chars = segment.text.chars().collect::<Vec<_>>();
557 if chars.len() == syllables.len() {
558 for (ch, syllable) in chars.into_iter().zip(syllables) {
559 words.push(PinyinWord::new(ch.to_string(), syllable.to_string()));
560 }
561 } else {
562 for syllable in syllables {
563 words.push(PinyinWord::new(segment.text.clone(), syllable.to_string()));
564 }
565 }
566}
567
568fn name_segments(input: &str) -> Vec<Segment> {
569 let Some(prefix) = lexicon().longest_surname_prefix(input) else {
570 return default_matcher().segments(input);
571 };
572
573 let Some(pinyin) = lexicon().surname_pinyin(prefix) else {
574 return default_matcher().segments(input);
575 };
576
577 let mut segments = vec![Segment {
578 text: prefix.to_string(),
579 pinyin: pinyin.to_string(),
580 matched: true,
581 }];
582 segments.extend(plain_matcher().segments(&input[prefix.len()..]));
583 segments
584}
585
/// Returns the shared [`Lexicon`], constructing it with `Lexicon::new` on first access.
fn lexicon() -> &'static Lexicon {
    LEXICON.get_or_init(Lexicon::new)
}
589
590fn default_matcher() -> &'static Matcher {
591 DEFAULT_MATCHER.get_or_init(|| Matcher::new(lexicon().default_entries()))
592}
593
594fn plain_matcher() -> &'static Matcher {
595 PLAIN_MATCHER.get_or_init(|| Matcher::new(lexicon().plain_entries()))
596}
597
598fn surname_matcher() -> &'static Matcher {
599 SURNAME_MATCHER.get_or_init(|| Matcher::new(lexicon().surname_entries()))
600}
601
/// Returns `true` when `token` contains no alphanumeric character at all; this includes
/// the empty string and whitespace-only tokens.
fn is_punctuation_token(token: &str) -> bool {
    !token.chars().any(char::is_alphanumeric)
}
605
// Tests for the public API of this module; they run against the lexicon data loaded by
// `Lexicon::new`.
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    // `convert` returns one item per matched dictionary word, not per character.
    #[test]
    fn converts_with_longest_dictionary_matches() {
        assert_eq!(convert("你好世界"), ["nǐ hǎo", "shì jiè"]);
        assert_eq!(
            convert("中国人喜欢中国吃饭"),
            ["zhōng guó rén", "xǐ huan", "zhōng guó", "chī fàn"]
        );
    }

    // Text the matcher does not know is passed through one character at a time.
    #[test]
    fn keeps_unmatched_text_as_tokens() {
        assert_eq!(convert("Hi!"), ["H", "i", "!"]);
    }

    // `sentence` keeps punctuation as its own token, `phrase` drops it.
    #[test]
    fn sentence_splits_words_into_syllables() {
        assert_eq!(
            Pinyin::sentence("你好,世界").to_string(),
            "nǐ hǎo , shì jiè"
        );
        assert_eq!(Pinyin::phrase("你好,世界").to_string(), "nǐ hǎo shì jiè");
    }

    // Number tones, the default mark/umlaut rendering, and toneless output with `v`.
    #[test]
    fn formats_tone_styles_correctly() {
        assert_eq!(
            Converter::new("你好")
                .with_tone_style(ToneStyle::Number)
                .to_string(),
            "ni3 hao3"
        );
        assert_eq!(Converter::new("旅行").to_string(), "lǚ xíng");
        assert_eq!(Converter::new("旅行").without_tone().to_string(), "lv xing");
    }

    // Surname readings apply to the leading surname only; passport names use `yu`.
    #[test]
    fn handles_names_and_passports() {
        assert_eq!(Pinyin::name("单某某").to_string(), "shàn mǒu mǒu");
        assert_eq!(Pinyin::name("单于单").to_string(), "chán yú dān");
        assert_eq!(Pinyin::passport_name("吕秀才").to_string(), "lyu xiu cai");
    }

    // Slugs with the default and a custom delimiter, delimiter validation, abbreviations.
    #[test]
    fn builds_permalink_and_abbr() {
        assert_eq!(
            Pinyin::permalink("带着希望去旅行"),
            "dai-zhe-xi-wang-qu-lv-xing"
        );
        assert_eq!(
            Pinyin::permalink_with("带着希望去旅行", "_").unwrap(),
            "dai_zhe_xi_wang_qu_lv_xing"
        );
        assert!(Pinyin::permalink_with("你好", "=").is_err());
        assert_eq!(Pinyin::abbr("北京大学").to_string(), "b j d x");
        assert_eq!(Pinyin::name_abbr("单某某").to_string(), "s m m");
    }

    // Per-character conversion, and the input length limit (a limit of 1 rejects "你好").
    #[test]
    fn supports_configured_conversion() {
        let no_words = PinyinConfig::new().with_long_words(false);
        assert_eq!(
            convert_with_config("你好", &no_words).unwrap(),
            ["nǐ", "hǎo"]
        );

        let err = convert_with_config("你好", &PinyinConfig::new().with_max_length(1));
        assert!(matches!(err, Err(PinyinError::InputTooLong { .. })));
    }

    // `chars` keeps the first pronunciation per character; `heteronym` lists all of them.
    #[test]
    fn exposes_chars_and_heteronyms() {
        assert_eq!(Pinyin::chars("重庆").to_string(), "zhòng qìng");
        assert!(Pinyin::heteronym("重").iter().any(|(_, items)| {
            items.contains(&"zhòng".to_string()) && items.contains(&"chóng".to_string())
        }));
    }
}