1#![doc = include_str!("../README.md")]
2#![warn(missing_docs, clippy::todo)]
3
4mod hepburn_dict;
5mod phfbin;
6mod syn_dict;
7mod types;
8mod util;
9
10pub use types::{IsJapanese, KakasiResult};
11
12use unicode_normalization::UnicodeNormalization;
13
14use phfbin::PhfMap;
15use types::{CharType, KanjiString, Readings};
16
17pub fn convert<S: AsRef<str>>(text: S) -> KakasiResult {
25 let dict = PhfMap::new(util::KANJI_DICT);
26
27 let text = normalize(text.as_ref());
28
29 let mut char_indices = text.char_indices().peekable();
30 let mut kana_buf = String::new();
31 let mut prev_buf_type = CharType::Whitespace;
33 let mut prev_acc_type = (CharType::Whitespace, false);
35 let mut cap = (false, false, false);
38
39 let mut res = KakasiResult::new(text.len());
40
41 let conv_kana_buf = |kana_buf: &mut String,
42 res: &mut KakasiResult,
43 prev_acc_type: &mut (CharType, bool),
44 cap: &mut (bool, bool, bool)| {
45 if !kana_buf.is_empty() {
46 let hira = convert_katakana(kana_buf);
47 res.hiragana.push_str(&hira);
48 let mut rom = hiragana_to_romaji(&hira);
49
50 if cap.0 {
51 rom = util::capitalize_first_c(&rom);
52 cap.0 = false;
53 }
54 if cap.1 && !cap.2 {
55 res.romaji = util::capitalize_first_c(&res.romaji);
56 cap.2 = true;
57 }
58
59 util::ensure_trailing_space(&mut res.romaji, prev_acc_type.0.space_after());
60 res.romaji.push_str(&rom);
61
62 kana_buf.clear();
63 *prev_acc_type = (CharType::Hiragana, true);
64 }
65 };
66
67 while let Some((i, c)) = char_indices.next() {
68 if util::is_char_in_range(c, util::HIRAGANA) {
69 if prev_buf_type != CharType::Hiragana {
70 conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
71 }
72 kana_buf.push(c);
73 prev_buf_type = CharType::Hiragana;
74 } else if util::is_char_in_range(c, util::KATAKANA) {
75 if prev_buf_type != CharType::Katakana {
76 conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
77 }
78 kana_buf.push(c);
79 prev_buf_type = CharType::Katakana;
80 } else if util::is_char_in_range(c, util::KANJI) {
81 let (t, n) = convert_kanji(&text[i..], &kana_buf, &dict);
82 conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
83
84 if n > 0 {
85 kana_buf = t;
86 conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
87 for _ in 1..n {
88 char_indices.next();
89 }
90 } else {
91 res.hiragana.push(c);
93 res.romaji.push(c);
94 }
95 prev_acc_type = (CharType::Kanji, true);
96 } else if c.is_whitespace() {
97 conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
98 res.hiragana.push(c);
99 res.romaji.push(c);
100 prev_acc_type = (CharType::Whitespace, false);
101 } else if c == '・' {
102 conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
103 res.hiragana.push(c);
104 res.romaji.push(' ');
105 prev_acc_type = (CharType::Whitespace, false);
106 } else if c == util::PROLONGED_SOUND_MARK {
107 if prev_buf_type != CharType::Hiragana && prev_buf_type != CharType::Katakana {
108 conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
109 }
110 kana_buf.push(c);
111 prev_buf_type = match prev_buf_type {
112 CharType::Hiragana => CharType::Hiragana,
113 _ => CharType::Katakana,
114 };
115 } else {
116 conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
118 res.hiragana.push(c);
119
120 let (c_rom, char_type) = util::PCT_DICT.get(&c).copied().unwrap_or_else(|| {
124 let is_point = c == '.' || c == ',';
125 (
126 c,
127 if c.is_ascii_digit()
128 || (is_point
129 && prev_acc_type.0 == CharType::Numeric
130 && char_indices
131 .peek()
132 .map(|(_, nc)| nc.is_ascii_digit())
133 .unwrap_or_default())
134 {
135 CharType::Numeric
136 } else if is_point {
137 CharType::TrailingPunct
138 } else {
139 CharType::Other
140 },
141 )
142 });
143
144 let is_jpunct = util::is_char_japanese_punctuation(c);
147 if prev_acc_type.1 || is_jpunct {
148 util::ensure_trailing_space(
149 &mut res.romaji,
150 prev_acc_type.0.space_after() && char_type.space_before(),
151 );
152 }
153
154 if is_jpunct && char_type == CharType::Other {
157 res.romaji.extend(c_rom.nfkc());
158 } else {
159 res.romaji.push(c_rom);
160 }
161
162 cap.0 =
167 c_rom == '.' && char_type != CharType::Numeric || cap.0 && !char_type.space_after();
168 cap.1 |= cap.0;
169
170 prev_acc_type = (char_type, is_jpunct);
171 };
172 }
173
174 conv_kana_buf(&mut kana_buf, &mut res, &mut prev_acc_type, &mut cap);
175 res
176}
177
178pub fn is_japanese<S: AsRef<str>>(text: S) -> IsJapanese {
194 let mut maybe = false;
195 for c in text.as_ref().chars() {
196 if util::is_char_in_range(c, util::HIRAGANA) || util::is_char_in_range(c, util::KATAKANA) {
197 return IsJapanese::True;
198 }
199 maybe |= util::is_char_in_range(c, util::KANJI);
200 }
201 match maybe {
202 true => IsJapanese::Maybe,
203 false => IsJapanese::False,
204 }
205}
206
/// Convert all katakana in `text` to hiragana.
///
/// Characters in `U+30A1..=U+30F6` are shifted to their hiragana counterpart,
/// the four `V` katakana (`U+30F7..=U+30FA`) are expanded to `ゔ` plus a small
/// vowel, and every other character is copied unchanged.
fn convert_katakana(text: &str) -> String {
    // Distance between a katakana and the matching hiragana (ァ U+30A1 -> ぁ U+3041)
    const KATAKANA_OFFSET: u32 = 0x30a1 - 0x3041;

    let mut buf = String::with_capacity(text.len());
    for c in text.chars() {
        match u32::from(c) {
            cp @ 0x30a1..=0x30f6 => buf.push(
                char::from_u32(cp - KATAKANA_OFFSET)
                    .expect("U+30A1..=U+30F6 minus 0x60 is U+3041..=U+3096, always a valid char"),
            ),
            // These have no single-character hiragana equivalent
            0x30f7 => buf.push_str("ゔぁ"),
            0x30f8 => buf.push_str("ゔぃ"),
            0x30f9 => buf.push_str("ゔぇ"),
            0x30fa => buf.push_str("ゔぉ"),
            _ => buf.push(c),
        }
    }
    buf
}
222
223fn hiragana_to_romaji(text: &str) -> String {
225 let mut buf = String::with_capacity(text.len());
226 let mut chars = text.char_indices().peekable();
227 let mut kc_match = None;
228
229 while let Some((i, c)) = chars.peek().copied() {
230 if util::is_char_in_range(c, util::HIRAGANA) {
231 match kc_match {
232 Some((m_i, n_char, m_rom)) => {
233 let kc_str = &text[m_i..i + c.len_utf8()];
234 match hepburn_dict::HEPBURN_DICT.get(kc_str).copied() {
235 Some(rom) => {
236 if n_char >= hepburn_dict::HEPBURN_MAX_KLEN - 1 {
239 buf.push_str(rom);
240 kc_match = None;
241 chars.next();
242 } else {
243 kc_match = Some((m_i, n_char + 1, rom));
244 chars.next();
245 }
246 }
247 None => {
248 buf.push_str(m_rom);
250 kc_match = None;
251 }
252 }
253 }
254 None => {
255 let kc_str = &text[i..i + c.len_utf8()];
256 match hepburn_dict::HEPBURN_DICT.get(kc_str).copied() {
257 Some(rom) => {
258 kc_match = Some((i, 1, rom));
259 }
260 None => buf.push(c),
261 }
262 chars.next();
263 }
264 }
265 } else if c == util::PROLONGED_SOUND_MARK {
266 if let Some((_, _, rom)) = kc_match {
267 buf.push_str(rom);
268 kc_match = None;
269 }
270 buf.push(buf.chars().last().unwrap_or('-'));
271 chars.next();
272 } else {
273 buf.push(c);
274 chars.next();
275 }
276 }
277
278 if let Some((_, _, rom)) = kc_match {
279 buf.push_str(rom);
280 }
281
282 buf
283}
284
285fn convert_kanji(text: &str, btext: &str, dict: &PhfMap) -> (String, usize) {
301 let mut translation: Option<String> = None;
302 let mut i_c = 0;
303 let mut n_c = 0;
304 let mut char_indices = text.char_indices().peekable();
305
306 while let Some((i, c)) = char_indices.next() {
307 let kanji = &text[0..i + c.len_utf8()];
308 let mut more_chars = 0;
309
310 let this_tl = match dict.get::<KanjiString, Readings>(KanjiString::new(kanji)) {
311 Some(readings) => readings.iter().and_then(|mut ri| {
312 ri.find_map(|r| match r {
313 types::Reading::Simple { hira } => Some(hira),
314 types::Reading::Tail { mut hira, ch } => {
315 char_indices.peek().and_then(|(_, next_c)| {
316 if util::is_char_in_range(*next_c, util::HIRAGANA) {
318 util::CLETTERS.get(&ch).and_then(|cltr| {
319 if cltr.contains(next_c) {
320 more_chars += 1;
322 hira.push(*next_c);
323 Some(hira)
324 } else {
325 None
326 }
327 })
328 } else {
329 None
330 }
331 })
332 }
333 types::Reading::Context { hira, ctx } => {
334 if btext.contains(&ctx) {
335 Some(hira)
336 } else {
337 None
338 }
339 }
340 })
341 }),
342 None => {
343 break;
344 }
345 };
346
347 i_c += 1;
348 if let Some(tl) = this_tl {
349 translation = Some(tl);
350 n_c = i_c + more_chars;
351 }
352 }
353
354 translation.map(|tl| (tl, n_c)).unwrap_or_default()
355}
356
357fn normalize(text: &str) -> String {
360 let mut imcount = 0;
361 let replacements = text.char_indices().filter_map(|(i, c)| {
362 if c == util::ITERATION_MARK {
363 if imcount == 0 {
365 imcount = 1;
366 for c in text[i + c.len_utf8()..].chars() {
367 if c == util::ITERATION_MARK {
368 imcount += 1;
369 } else {
370 break;
371 }
372 }
373 }
374
375 text[0..i]
377 .chars()
378 .rev()
379 .nth(imcount - 1)
380 .map(|prev| (i, c.len_utf8(), prev))
381 } else {
382 imcount = 0;
383 syn_dict::SYN_DICT
384 .get(&c)
385 .map(|r_char| (i, c.len_utf8(), *r_char))
386 .or_else(|| {
387 if util::is_char_fwidth_punctuation(c) {
389 Some((i, c.len_utf8(), c))
390 } else {
391 None
392 }
393 })
394 }
395 });
396
397 let mut new = String::with_capacity(text.len());
398 let mut last = 0;
399
400 for (i, clen, r_char) in replacements {
401 new.extend(text[last..i].nfkc());
402 new.push(r_char);
403 last = i + clen;
404 }
405 new.extend(text[last..].nfkc());
406 new
407}
408
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    // Pins the NFKC mapping of a few full-width / compatibility characters
    #[rstest]
    #[case("\u{ff1f}", "?")]
    #[case("\u{ff1e}", ">")]
    #[case("…", "...")]
    #[case("‥", "..")]
    #[case("\u{FF70}", "\u{30FC}")]
    fn t_unicode_nfkc(#[case] text: &str, #[case] expect: &str) {
        let res = text.nfkc().collect::<String>();
        assert_eq!(res, expect);
    }

    #[rstest]
    #[case("壱意", "一意")]
    #[case("", "")]
    #[case("Abc", "Abc")]
    fn t_normalize(#[case] text: &str, #[case] expect: &str) {
        let res = normalize(text);
        assert_eq!(res, expect);
    }

    #[rstest]
    #[case("ァ", "ぁ")]
    #[case("ヷ", "ゔぁ")]
    #[case("ヸ", "ゔぃ")]
    #[case("ヹ", "ゔぇ")]
    #[case("ヺ", "ゔぉ")]
    #[case("", "")]
    #[case("Abc", "Abc")]
    fn t_convert_katakana(#[case] text: &str, #[case] expect: &str) {
        let res = convert_katakana(text);
        assert_eq!(res, expect);
    }

    #[rstest]
    #[case("", "")]
    #[case("Abc", "Abc")]
    #[case("ば", "ba")]
    #[case("ばば", "baba")]
    #[case("ばー", "baa")]
    #[case("っふぁ", "ffa")]
    fn t_to_romaji(#[case] text: &str, #[case] expect: &str) {
        let res = hiragana_to_romaji(text);
        assert_eq!(res, expect);
    }

    // `expect_n` is the number of input characters covered by the reading
    #[rstest]
    #[case("会っAbc", "あっ", 2)]
    #[case("渋谷", "しぶや", 2)]
    #[case(
        "東北大学電気通信研究所",
        "とうほくだいがくでんきつうしんけんきゅうじょ",
        11
    )]
    #[case("暑中お見舞い申し上げます", "しょちゅうおみまいもうしあげます", 12)]
    fn t_convert_kanji(#[case] text: &str, #[case] expect: &str, #[case] expect_n: usize) {
        let dict = PhfMap::new(util::KANJI_DICT);
        let (res, n) = convert_kanji(text, "", &dict);
        assert_eq!(res, expect);
        assert_eq!(n, expect_n);
    }
}