kakasi/
lib.rs

1#![doc = include_str!("../README.md")]
2#![warn(missing_docs, clippy::todo)]
3
4mod hepburn_dict;
5mod phfbin;
6mod syn_dict;
7mod types;
8mod util;
9
10pub use types::{IsJapanese, KakasiResult};
11
12use unicode_normalization::UnicodeNormalization;
13
14use phfbin::PhfMap;
15use types::{CharType, KanjiString, Readings};
16
/// Convert the given Japanese text to hiragana/romaji
///
/// The returned [`KakasiResult`] carries two strings that are built side by
/// side: `hiragana` and `romaji`.
///
/// ```
/// let res = kakasi::convert("こんにちは世界!");
/// assert_eq!(res.hiragana, "こんにちはせかい!");
/// assert_eq!(res.romaji, "konnichiha sekai!");
/// ```
pub fn convert<S: AsRef<str>>(text: S) -> KakasiResult {
    let dict = PhfMap::new(util::KANJI_DICT);

    // NFKC-normalize, replace synonymous kanji and expand iteration marks up front
    let text = normalize(text.as_ref());

    // Peekable, because the decimal separator detection below needs a one character lookahead
    let mut char_indices = text.char_indices().peekable();
    // Pending run of kana that has not been written to the result yet
    let mut kana_buf = String::new();
    // Type of the character last added to kana_buf
    let mut prev_buf_type = CharType::Whitespace;
    // Type of the character last added to the result + is_japanese flag
    let mut prev_acc_type = (CharType::Whitespace, false);
    // Capitalization flags
    // 0: capitalize next word, 1: capitalize first sentence, 2: first sentence capitalized
    let mut cap = (false, false, false);

    let mut res = KakasiResult::new(text.len());

    // Flush the kana buffer into the result (no-op if the buffer is empty).
    // All state is passed in explicitly, so the closure does not hold any borrows
    // between calls.
    let conv_kana_buf = |kana_buf: &mut String,
                         res: &mut KakasiResult,
                         prev_acc_type: &mut (CharType, bool),
                         cap: &mut (bool, bool, bool)| {
        if !kana_buf.is_empty() {
            // Katakana is converted to hiragana first, the romaji is derived from the hiragana
            let hira = convert_katakana(kana_buf);
            res.hiragana.push_str(&hira);
            let mut rom = hiragana_to_romaji(&hira);

            // A full stop was seen before: capitalize this word
            if cap.0 {
                rom = util::capitalize_first_c(&rom);
                cap.0 = false;
            }
            // The first full stop has been seen, so the text consists of sentences:
            // retroactively capitalize the romaji accumulated so far (done only once)
            if cap.1 && !cap.2 {
                res.romaji = util::capitalize_first_c(&res.romaji);
                cap.2 = true;
            }

            util::ensure_trailing_space(&mut res.romaji, prev_acc_type.0.space_after());
            res.romaji.push_str(&rom);

            kana_buf.clear();
            *prev_acc_type = (CharType::Hiragana, true);
        }
    };

    while let Some((i, c)) = char_indices.next() {
        if util::is_char_in_range(c, util::HIRAGANA) {
            // Start a new buffer run when switching from another script to hiragana
            if prev_buf_type != CharType::Hiragana {
                conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
            }
            kana_buf.push(c);
            prev_buf_type = CharType::Hiragana;
        } else if util::is_char_in_range(c, util::KATAKANA) {
            // Start a new buffer run when switching from another script to katakana
            if prev_buf_type != CharType::Katakana {
                conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
            }
            kana_buf.push(c);
            prev_buf_type = CharType::Katakana;
        } else if util::is_char_in_range(c, util::KANJI) {
            // Look up the kanji first: the still unflushed kana buffer is passed along
            // as context for the lookup. Only afterwards is the buffer flushed.
            let (t, n) = convert_kanji(&text[i..], &kana_buf, &dict);
            conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);

            if n > 0 {
                // Treat the reading like a kana run and flush it right away
                kana_buf = t;
                conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
                // The current character is already consumed, skip the remaining n - 1
                for _ in 1..n {
                    char_indices.next();
                }
            } else {
                // Unknown kanji
                res.hiragana.push(c);
                res.romaji.push(c);
            }
            prev_acc_type = (CharType::Kanji, true);
        } else if c.is_whitespace() {
            // Whitespace is copied verbatim to both outputs
            conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
            res.hiragana.push(c);
            res.romaji.push(c);
            prev_acc_type = (CharType::Whitespace, false);
        } else if c == '・' {
            // The middle dot is kept in the hiragana output and becomes a space in the romaji
            conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
            res.hiragana.push(c);
            res.romaji.push(' ');
            prev_acc_type = (CharType::Whitespace, false);
        } else if c == util::PROLONGED_SOUND_MARK {
            // The prolonged sound mark joins the current kana run. If the last buffered
            // character was not kana, flush first and count the mark as katakana.
            if prev_buf_type != CharType::Hiragana && prev_buf_type != CharType::Katakana {
                conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
            }
            kana_buf.push(c);
            prev_buf_type = match prev_buf_type {
                CharType::Hiragana => CharType::Hiragana,
                _ => CharType::Katakana,
            };
        } else {
            // The rest. Latin characters, other scripts, numbers, special characters
            conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
            res.hiragana.push(c);

            // Determine the character type (required for correct spacing and capitalization).
            // Japanese punctuation can be looked up in the dictionary, otherwise assume CharType::Other.
            // Special case: dots and commas used as decimal separators
            // (preceded by a numeric character and followed by an ASCII digit)
            let (c_rom, char_type) = util::PCT_DICT.get(&c).copied().unwrap_or_else(|| {
                let is_point = c == '.' || c == ',';
                (
                    c,
                    if c.is_ascii_digit()
                        || (is_point
                            && prev_acc_type.0 == CharType::Numeric
                            && char_indices
                                .peek()
                                .map(|(_, nc)| nc.is_ascii_digit())
                                .unwrap_or_default())
                    {
                        CharType::Numeric
                    } else if is_point {
                        CharType::TrailingPunct
                    } else {
                        CharType::Other
                    },
                )
            });

            // Add correct spacing if it is a Japanese punctuation character or the last character
            // was Japanese
            let is_jpunct = util::is_char_japanese_punctuation(c);
            if prev_acc_type.1 || is_jpunct {
                util::ensure_trailing_space(
                    &mut res.romaji,
                    prev_acc_type.0.space_after() && char_type.space_before(),
                );
            }

            // Japanese punctuation was not normalized at the beginning,
            // the normalization here will replace fullwidth characters with normal ones.
            if is_jpunct && char_type == CharType::Other {
                res.romaji.extend(c_rom.nfkc());
            } else {
                res.romaji.push(c_rom);
            }

            // If the current character is a full stop (no decimal point),
            // the next word should be capitalized.
            // Keep the capitalization flag set if the following character is leading or joining
            // punctuation. Example: `Sentence1. "Nice", sentence 2.`
            cap.0 =
                c_rom == '.' && char_type != CharType::Numeric || cap.0 && !char_type.space_after();
            // Remember that a full stop occurred at least once (sticky flag)
            cap.1 |= cap.0;

            prev_acc_type = (char_type, is_jpunct);
        };
    }

    // Flush the kana that is still buffered at the end of the input
    conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
    res
}
177
178/// Check if the input text is Japanese
179///
180/// Note that (especially very short) japanese texts are not always
181/// distinguishable from Chinese, because these languages use the same
182/// characters.
183///
184/// Thus if only CJK ideographs are found, the function returns
185/// [`IsJapanese::Maybe`].
186///
187/// ```
188/// # use kakasi::IsJapanese;
189/// assert_eq!(kakasi::is_japanese("Abc"), IsJapanese::False);
190/// assert_eq!(kakasi::is_japanese("日本"), IsJapanese::Maybe);
191/// assert_eq!(kakasi::is_japanese("ラスト"), IsJapanese::True);
192/// ```
193pub fn is_japanese<S: AsRef<str>>(text: S) -> IsJapanese {
194    let mut maybe = false;
195    for c in text.as_ref().chars() {
196        if util::is_char_in_range(c, util::HIRAGANA) || util::is_char_in_range(c, util::KATAKANA) {
197            return IsJapanese::True;
198        }
199        maybe |= util::is_char_in_range(c, util::KANJI);
200    }
201    match maybe {
202        true => IsJapanese::Maybe,
203        false => IsJapanese::False,
204    }
205}
206
/// Convert the katakana from the input string to hiragana
///
/// Katakana in the range `U+30A1..=U+30F6` are shifted to the hiragana code
/// point at the same offset. The four characters `U+30F7..=U+30FA` have no
/// single hiragana counterpart and are spelled out as `ゔ` plus a small vowel.
/// Every other character is copied unchanged.
fn convert_katakana(text: &str) -> String {
    // Distance between a katakana code point and its hiragana counterpart
    const KANA_OFFSET: u32 = 0x30a1 - 0x3041;

    let mut hira = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '\u{30a1}'..='\u{30f6}' => {
                hira.push(char::from_u32(ch as u32 - KANA_OFFSET).unwrap());
            }
            '\u{30f7}' => hira.push_str("ゔぁ"),
            '\u{30f8}' => hira.push_str("ゔぃ"),
            '\u{30f9}' => hira.push_str("ゔぇ"),
            '\u{30fa}' => hira.push_str("ゔぉ"),
            other => hira.push(other),
        }
    }
    hira
}
222
223/// Convert the hiragana from the input string to latin characters
224fn hiragana_to_romaji(text: &str) -> String {
225    let mut buf = String::with_capacity(text.len());
226    let mut chars = text.char_indices().peekable();
227    let mut kc_match = None;
228
229    while let Some((i, c)) = chars.peek().copied() {
230        if util::is_char_in_range(c, util::HIRAGANA) {
231            match kc_match {
232                Some((m_i, n_char, m_rom)) => {
233                    let kc_str = &text[m_i..i + c.len_utf8()];
234                    match hepburn_dict::HEPBURN_DICT.get(kc_str).copied() {
235                        Some(rom) => {
236                            // If we have reached the maximum key length,
237                            // the match can be added directly
238                            if n_char >= hepburn_dict::HEPBURN_MAX_KLEN - 1 {
239                                buf.push_str(rom);
240                                kc_match = None;
241                                chars.next();
242                            } else {
243                                kc_match = Some((m_i, n_char + 1, rom));
244                                chars.next();
245                            }
246                        }
247                        None => {
248                            // Add the previous match and dont advance the iterator
249                            buf.push_str(m_rom);
250                            kc_match = None;
251                        }
252                    }
253                }
254                None => {
255                    let kc_str = &text[i..i + c.len_utf8()];
256                    match hepburn_dict::HEPBURN_DICT.get(kc_str).copied() {
257                        Some(rom) => {
258                            kc_match = Some((i, 1, rom));
259                        }
260                        None => buf.push(c),
261                    }
262                    chars.next();
263                }
264            }
265        } else if c == util::PROLONGED_SOUND_MARK {
266            if let Some((_, _, rom)) = kc_match {
267                buf.push_str(rom);
268                kc_match = None;
269            }
270            buf.push(buf.chars().last().unwrap_or('-'));
271            chars.next();
272        } else {
273            buf.push(c);
274            chars.next();
275        }
276    }
277
278    if let Some((_, _, rom)) = kc_match {
279        buf.push_str(rom);
280    }
281
282    buf
283}
284
285/// Convert the leading kanji from the input string to hiragana
286///
287/// # Arguments
288///
289/// * `text` - Input string starting with the kanji to convert.
290///
291///   The input needs to be NFKC-normalized and synonymous kanji need to be
292///   replaced using [`convert_syn`].
293///
294/// * `btext` - Buffer string (leading kana)
295///
296/// # Return
297///
298/// * `0` - String of hiragana
299/// * `1` -  Number of converted chars from the input string
300fn convert_kanji(text: &str, btext: &str, dict: &PhfMap) -> (String, usize) {
301    let mut translation: Option<String> = None;
302    let mut i_c = 0;
303    let mut n_c = 0;
304    let mut char_indices = text.char_indices().peekable();
305
306    while let Some((i, c)) = char_indices.next() {
307        let kanji = &text[0..i + c.len_utf8()];
308        let mut more_chars = 0;
309
310        let this_tl = match dict.get::<KanjiString, Readings>(KanjiString::new(kanji)) {
311            Some(readings) => readings.iter().and_then(|mut ri| {
312                ri.find_map(|r| match r {
313                    types::Reading::Simple { hira } => Some(hira),
314                    types::Reading::Tail { mut hira, ch } => {
315                        char_indices.peek().and_then(|(_, next_c)| {
316                            // Shortcut if the next character is not hiragana
317                            if util::is_char_in_range(*next_c, util::HIRAGANA) {
318                                util::CLETTERS.get(&ch).and_then(|cltr| {
319                                    if cltr.contains(next_c) {
320                                        // Add the next character to the char count
321                                        more_chars += 1;
322                                        hira.push(*next_c);
323                                        Some(hira)
324                                    } else {
325                                        None
326                                    }
327                                })
328                            } else {
329                                None
330                            }
331                        })
332                    }
333                    types::Reading::Context { hira, ctx } => {
334                        if btext.contains(&ctx) {
335                            Some(hira)
336                        } else {
337                            None
338                        }
339                    }
340                })
341            }),
342            None => {
343                break;
344            }
345        };
346
347        i_c += 1;
348        if let Some(tl) = this_tl {
349            translation = Some(tl);
350            n_c = i_c + more_chars;
351        }
352    }
353
354    translation.map(|tl| (tl, n_c)).unwrap_or_default()
355}
356
357/// NFKC-normalize the text, convert all synonymous kanji
358/// and replace iteration marks (`々`)
359fn normalize(text: &str) -> String {
360    let mut imcount = 0;
361    let replacements = text.char_indices().filter_map(|(i, c)| {
362        if c == util::ITERATION_MARK {
363            // Count iteration marks
364            if imcount == 0 {
365                imcount = 1;
366                for c in text[i + c.len_utf8()..].chars() {
367                    if c == util::ITERATION_MARK {
368                        imcount += 1;
369                    } else {
370                        break;
371                    }
372                }
373            }
374
375            // Replace withe the character imcount positions before
376            text[0..i]
377                .chars()
378                .rev()
379                .nth(imcount - 1)
380                .map(|prev| (i, c.len_utf8(), prev))
381        } else {
382            imcount = 0;
383            syn_dict::SYN_DICT
384                .get(&c)
385                .map(|r_char| (i, c.len_utf8(), *r_char))
386                .or_else(|| {
387                    // Dont normalize japanese punctuation, we need it to add correct spacing
388                    if util::is_char_fwidth_punctuation(c) {
389                        Some((i, c.len_utf8(), c))
390                    } else {
391                        None
392                    }
393                })
394        }
395    });
396
397    let mut new = String::with_capacity(text.len());
398    let mut last = 0;
399
400    for (i, clen, r_char) in replacements {
401        new.extend(text[last..i].nfkc());
402        new.push(r_char);
403        last = i + clen;
404    }
405    new.extend(text[last..].nfkc());
406    new
407}
408
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    /// Pin the NFKC mappings of the `unicode_normalization` crate for a few characters
    #[rstest]
    #[case("\u{ff1f}", "?")]
    #[case("\u{ff1e}", ">")]
    #[case("…", "...")]
    #[case("‥", "..")]
    #[case("\u{FF70}", "\u{30FC}")]
    fn t_unicode_nfkc(#[case] input: &str, #[case] expected: &str) {
        let normalized: String = input.nfkc().collect();
        assert_eq!(normalized, expected);
    }

    /// Synonymous kanji are replaced, other text is left alone
    #[rstest]
    #[case("壱意", "一意")]
    #[case("", "")]
    #[case("Abc", "Abc")]
    fn t_normalize(#[case] input: &str, #[case] expected: &str) {
        assert_eq!(normalize(input), expected);
    }

    /// Katakana are mapped to hiragana, including the multi-character special cases
    #[rstest]
    #[case("ァ", "ぁ")]
    #[case("ヷ", "ゔぁ")]
    #[case("ヸ", "ゔぃ")]
    #[case("ヹ", "ゔぇ")]
    #[case("ヺ", "ゔぉ")]
    #[case("", "")]
    #[case("Abc", "Abc")]
    fn t_convert_katakana(#[case] input: &str, #[case] expected: &str) {
        assert_eq!(convert_katakana(input), expected);
    }

    /// Romanization of hiragana, including the prolonged sound mark
    #[rstest]
    #[case("", "")]
    #[case("Abc", "Abc")]
    #[case("ば", "ba")]
    #[case("ばば", "baba")]
    #[case("ばー", "baa")]
    #[case("っふぁ", "ffa")]
    fn t_to_romaji(#[case] input: &str, #[case] expected: &str) {
        assert_eq!(hiragana_to_romaji(input), expected);
    }

    /// Kanji lookup returns the reading and the number of consumed characters
    #[rstest]
    #[case("会っAbc", "あっ", 2)]
    #[case("渋谷", "しぶや", 2)]
    #[case(
        "東北大学電気通信研究所",
        "とうほくだいがくでんきつうしんけんきゅうじょ",
        11
    )]
    #[case("暑中お見舞い申し上げます", "しょちゅうおみまいもうしあげます", 12)]
    fn t_convert_kanji(
        #[case] input: &str,
        #[case] expected: &str,
        #[case] expected_chars: usize,
    ) {
        let kanji_dict = PhfMap::new(util::KANJI_DICT);
        let (reading, consumed) = convert_kanji(input, "", &kanji_dict);
        assert_eq!(reading, expected);
        assert_eq!(consumed, expected_chars);
    }
}