1#![cfg_attr(docsrs, feature(doc_cfg))]
31#![cfg_attr(feature = "doc", doc = document_features::document_features!())]
32use bon::bon;
33use daachorse::{CharwiseDoubleArrayAhoCorasick, CharwiseDoubleArrayAhoCorasickBuilder, MatchKind};
34
35use ib_unicode::str::RoundCharBoundaryExt;
36
37#[cfg(feature = "cache")]
38pub mod cache;
39pub mod data;
40
41#[derive(Clone)]
43pub struct HepburnRomanizer {
44 ac: CharwiseDoubleArrayAhoCorasick<u32>,
46 kanji: bool,
47}
48
49#[bon]
50impl HepburnRomanizer {
51 #[builder(builder_type = HepburnRomanizerBuilder, state_mod(vis = "pub(crate)"))]
53 pub fn new(
54 #[builder(default = false, getter(vis = "pub(crate)"))] kana: bool,
55 #[builder(default = false, getter(vis = "pub(crate)"))] kanji: bool,
56 #[builder(default = false, getter(vis = "pub(crate)"))] word: bool,
57 ) -> Self {
58 #[cfg(not(feature = "compress-words"))]
74 let words = data::WORDS.split('\n');
75 #[cfg(feature = "compress-words")]
76 let words = include_bytes_zstd::include_bytes_zstd!("src/data/words.in.txt", 22);
77 #[cfg(feature = "compress-words")]
78 let words = words
79 .split(|&b| b == b'\n')
80 .map(|b| unsafe { str::from_utf8_unchecked(b) });
81
82 let ac =
94 CharwiseDoubleArrayAhoCorasickBuilder::new().match_kind(MatchKind::LeftmostLongest);
95 let ac = match (kana, word) {
96 (true, true) => ac.build(data::kana::HEPBURN_KANAS.iter().cloned().chain(words)),
97 (true, false) => ac.build(data::kana::HEPBURN_KANAS),
98 (false, true) => ac.build(words),
99 (false, false) => ac.build([] as [&str; 0]),
100 }
101 .unwrap();
102
103 Self { ac, kanji }
104 }
105
106 pub fn romanize_kana<S: ?Sized + AsRef<str>>(&self, s: &S) -> Option<(usize, &'static str)> {
116 let s = s.as_ref();
117 let s = &s[..s.floor_char_boundary_ib(data::kana::KANA_MAX_LEN)];
118 let m = self
121 .ac
122 .leftmost_find_iter(s)
123 .next()
124 .filter(|m| m.start() == 0)?;
125 let pattern = m.value() as usize;
126 let len = m.end() - m.start();
127 data::kana::HEPBURN_ROMAJIS
128 .get(pattern)
129 .map(|&romaji| (len, romaji))
130 }
131
132 pub fn romanize_kana_str<S: ?Sized + AsRef<str>>(&self, s: &S) -> Option<(usize, String)> {
134 let s = s.as_ref();
135 let mut len = 0;
136 let mut buf = String::new();
137 while let Some((l, romaji)) = self.romanize_kana(&s[len..]).or_else(|| {
138 if s[len..].starts_with("、") {
139 Some((3, "、"))
140 } else {
141 None
142 }
143 }) {
144 len += l;
145 buf.push_str(romaji);
146 if len >= s.len() {
147 return Some((len, buf));
148 }
149 }
150 if len == 0 { None } else { Some((len, buf)) }
151 }
152
153 pub fn romanize_kana_str_all<S: ?Sized + AsRef<str>>(&self, s: &S) -> Option<String> {
155 let s = s.as_ref();
156 match self.romanize_kana_str(s) {
157 Some((len, buf)) if len == s.len() => Some(buf),
158 _ => None,
159 }
160 }
161
162 pub fn romanize_and_try_for_each<S: ?Sized + AsRef<str>, T>(
181 &self,
182 s: &S,
183 mut f: impl FnMut(usize, &'static str) -> Option<T>,
184 ) -> Option<T> {
185 let s = s.as_ref();
186 let s = &s[..s.floor_char_boundary_ib(data::WORD_MAX_LEN)];
187
188 if let Some(m) = self
190 .ac
191 .leftmost_find_iter(s)
192 .next()
193 .filter(|m| m.start() == 0)
194 {
195 let pattern = m.value() as usize;
197 let len = m.end() - m.start();
198 if pattern < data::kana::HEPBURN_ROMAJIS.len() {
199 let romaji = data::kana::HEPBURN_ROMAJIS[pattern];
200 if let Some(result) = f(len, romaji) {
201 return Some(result);
202 }
203 } else if pattern < data::kana::HEPBURN_ROMAJIS.len() + data::WORD_ROMAJIS.len() {
204 for romaji in data::WORD_ROMAJIS[pattern - data::kana::HEPBURN_ROMAJIS.len()] {
206 if let Some(result) = f(len, romaji) {
207 return Some(result);
208 }
209 }
210 }
211 }
212
213 if self.kanji {
214 if let Some(kanji) = s.chars().next() {
216 for romaji in data::kanji_romajis(kanji) {
218 if let Some(result) = f(kanji.len_utf8(), romaji) {
220 return Some(result);
221 }
222 }
223 }
224 }
225
226 None
227 }
228
229 pub fn romanize_vec<S: ?Sized + AsRef<str>>(&self, s: &S) -> Vec<(usize, &'static str)> {
238 let mut results = Vec::new();
239 self.romanize_and_try_for_each(s, |len, romaji| {
240 results.push((len, romaji));
241 None::<()>
242 });
243 results
244 }
245
246 pub fn is_romanizable<S: ?Sized + AsRef<str>>(&self, s: &S) -> bool {
250 let s = s.as_ref();
251 if s.is_empty() {
252 return true;
253 }
254 self.romanize_and_try_for_each(s, |len, _| self.is_romanizable(&s[len..]).then_some(()))
255 .is_some()
256 }
257
258 pub fn is_romanizable_to<S: ?Sized + AsRef<str>>(&self, s: &S, romaji: &S) -> bool {
260 let s = s.as_ref();
261 let romaji = romaji.as_ref();
262 if s.is_empty() {
263 return romaji.is_empty();
264 }
265 self.romanize_and_try_for_each(s, |len, word_romaji| {
266 self.is_romanizable_to(&s[len..], romaji.strip_prefix(word_romaji)?)
267 .then_some(())
268 })
269 .is_some()
270 }
271}
272
273impl Default for HepburnRomanizer {
274 fn default() -> Self {
275 Self::builder().kana(true).kanji(true).word(true).build()
276 }
277}
278
279#[cfg(test)]
280mod tests {
281 use std::{fs, io::Write};
282
283 use indexmap::IndexSet;
284
285 use super::*;
286
287 #[test]
288 fn min_len() {
289 let min_len = data::kana::HEPBURN_KANAS
290 .iter()
291 .inspect(|kana| {
292 if kana.len() == data::kana::KANA_MIN_LEN {
293 println!("{}", kana);
294 }
295 })
296 .map(|s| s.len())
297 .min()
298 .unwrap();
299 assert_eq!(data::kana::KANA_MIN_LEN, min_len);
300
301 assert!(data::MIN_LEN <= data::kana::KANA_MIN_LEN);
302 assert!(data::MIN_LEN <= data::KANJI_MIN_LEN);
303 }
304
305 #[test]
306 fn kana_max_len() {
307 let max_len = data::kana::HEPBURN_KANAS
308 .iter()
309 .inspect(|kana| {
310 if kana.len() == data::kana::KANA_MAX_LEN {
311 println!("{}", kana);
312 }
313 })
314 .map(|s| s.len())
315 .max()
316 .unwrap();
317 assert_eq!(data::kana::KANA_MAX_LEN, max_len);
318
319 let max_len = data::kana::HEPBURN_ROMAJIS
320 .iter()
321 .inspect(|romaji| {
322 if romaji.len() == data::kana::KANA_ROMAJI_MAX_LEN {
323 println!("{}", romaji);
324 }
325 })
326 .map(|s| s.len())
327 .max()
328 .unwrap();
329 assert_eq!(data::kana::KANA_ROMAJI_MAX_LEN, max_len);
330 }
331
332 #[test]
333 fn kana() {
334 let data = HepburnRomanizer::builder().kana(true).build();
335 assert_eq!(data.romanize_kana("は"), Some((3, "ha")));
336 assert_eq!(data.romanize_kana("ハハハ"), Some((3, "ha")));
337 assert_eq!(data.romanize_kana("ジョジョ"), Some((6, "jo")));
338 assert_eq!(data.romanize_kana("って"), Some((6, "tte")));
339 assert_eq!(data.romanize_kana("日は"), None);
340 }
341
342 #[test]
343 fn kana_str() {
344 let data = HepburnRomanizer::builder().kana(true).build();
345 assert_eq!(data.romanize_kana_str("は"), Some((3, "ha".into())));
346 assert_eq!(data.romanize_kana_str("ハハハ"), Some((9, "hahaha".into())));
347 assert_eq!(
348 data.romanize_kana_str("ジョジョ"),
349 Some((12, "jojo".into()))
350 );
351 assert_eq!(data.romanize_kana_str("って"), Some((6, "tte".into())));
352 assert_eq!(data.romanize_kana_str("日は"), None);
353 }
354
355 #[test]
356 fn is_romanizable_to() {
357 let data = HepburnRomanizer::builder().kana(true).kanji(true).build();
358 assert!(data.is_romanizable_to("は", "ha"));
359 assert!(data.is_romanizable_to("ハハハ", "hahaha"));
360 assert!(data.is_romanizable_to("ジョジョ", "jojo"));
361 assert!(data.is_romanizable_to("って", "tte"));
362 assert!(data.is_romanizable_to("日は", "hiha"));
363 assert!(data.is_romanizable_to("日は", "kusaha"));
364 assert!(!data.is_romanizable_to("今日", "kyou"));
365 assert!(data.is_romanizable_to("今日", "imakusa"));
366 }
367
368 #[ignore]
369 #[test]
370 fn codegen_kanji() {
371 let romanizer = HepburnRomanizer::builder().kana(true).build();
372
373 let mut dup_count = 0;
374 let mut romaji_max_len = 0;
375
376 let kanjidic = fs::read_to_string("data/kanjidic.csv").unwrap();
377 let mut out_kanjis = fs::File::create("src/data/kanjis.rs").unwrap();
378 writeln!(out_kanjis, "match kanji {{").unwrap();
379 let mut range = 0;
380 for (_i, line) in kanjidic.lines().enumerate() {
381 let (kanji, kanas) = match line.split_once('\t') {
382 Some(v) => v,
383 None => continue,
384 };
385
386 write!(out_kanjis, "'{kanji}'=>").unwrap();
387
388 let kanas_count = kanas.split('\t').count();
389 let mut kanas_set: IndexSet<String> = kanas
390 .split('\t')
391 .map(|kana| match romanizer.romanize_kana_str_all(kana) {
392 Some(romaji) => format!("\"{}\"", romaji),
393 None => {
394 println!("Failed to romanize kana: {kana}");
395 kana.into()
396 }
397 })
398 .collect();
399 kanas_set.sort_unstable();
400 if kanas_set.len() != kanas_count {
401 dup_count += 1;
403 }
404
405 assert!(
406 data::KANJI_LEN.contains(&kanji.len()),
407 "{kanji} {}",
408 kanji.len()
409 );
410 {
411 let max_len = kanas_set.iter().map(|s| s.len()).max().unwrap();
412 if max_len > romaji_max_len {
413 romaji_max_len = max_len;
414 }
415 if max_len == data::KANJI_ROMAJI_MAX_LEN {
416 println!("Max len romaji: {kanji} {kanas_set:?}");
417 }
418 }
419
420 write!(
421 out_kanjis,
422 "&[{}],",
423 kanas_set.into_iter().collect::<Vec<_>>().join(",")
424 )
425 .unwrap();
426
427 let c = kanji.chars().next().unwrap() as u32;
430 if c / 10 != range {
431 range = c / 10;
432 out_kanjis.write_all(b"\n").unwrap();
433 }
434 }
435 write!(out_kanjis, "_ => &[]\n}}").unwrap();
436
437 println!("Kanjis with duplicated romajis: {dup_count}");
438 println!("Romaji max len: {romaji_max_len}");
439 assert_eq!(romaji_max_len, data::KANJI_ROMAJI_MAX_LEN);
440 }
441
442 #[ignore]
446 #[test]
447 fn codegen_word() {
448 let romanizer = HepburnRomanizer::builder().kana(true).build();
449 let kanji_romanizer = HepburnRomanizer::builder().kana(true).kanji(true).build();
450
451 let mut dup_count = 0;
452 let mut romanizable_count = 0;
453 let mut partial_romanizable_count = 0;
454 let mut diff_romanizable_count = 0;
455 let mut unromanizable_count = 0;
456 let mut max_len = 0;
457 let mut romaji_max_len = 0;
458
459 let jmdict = fs::read_to_string("data/jmdict.csv").unwrap();
460 let mut out_words = fs::File::create("src/data/words.in.txt").unwrap();
461 let mut out_kanas = fs::File::create("src/data/word_kanas.rs").unwrap();
462 writeln!(out_kanas, "&[").unwrap();
466 let mut range = 0;
468 let mut range_c = 0;
469 let mut range_2 = 0;
470 for (i, line) in jmdict.lines().enumerate() {
471 let (word, kanas) = match line.split_once('\t') {
472 Some(v) => v,
473 None => continue,
474 };
475
476 let kanas_count = kanas.split('\t').count();
477 let kanas_set: IndexSet<String> = kanas
478 .split('\t')
479 .map(|kana| match romanizer.romanize_kana_str_all(kana) {
480 Some(romaji) => romaji,
482 None => {
483 println!("Failed to romanize kana: {kana}");
484 kana.into()
485 }
486 })
487 .collect();
488 if kanas_set.len() != kanas_count {
489 dup_count += 1;
491 }
492
493 let mut romajis = if kanji_romanizer.is_romanizable(word) {
498 let romajis = kanas_set
499 .iter()
500 .cloned()
501 .filter(|romaji| !kanji_romanizer.is_romanizable_to(word, romaji))
502 .collect::<Vec<_>>();
503 if romajis.len() != kanas_set.len() {
504 if romajis.is_empty() {
505 romanizable_count += 1;
507 continue;
508 }
509 println!(
510 "partial: {word} -{} {kanas_set:?} -> {romajis:?}",
511 kanas_set.len() - romajis.len()
512 );
513 partial_romanizable_count += 1;
514 } else {
515 println!("diff: {word} {kanas_set:?}");
516 diff_romanizable_count += 1;
517 }
518 romajis
519 } else {
520 println!("un: {word}");
521 unromanizable_count += 1;
522 kanas_set.into_iter().collect()
523 };
524 romajis.sort_unstable();
525
526 if word.len() > max_len {
527 max_len = word.len();
528 }
529 if word.len() == data::WORD_MAX_LEN {
530 println!("Max len word: {word}");
531 }
532 {
533 let max_len = romajis.iter().map(|s| s.len()).max().unwrap();
534 if max_len > romaji_max_len {
535 romaji_max_len = max_len;
536 }
537 if max_len == data::WORD_ROMAJI_MAX_LEN {
538 println!("Max len romaji: {word} {romajis:?}");
539 }
540 }
541
542 if i == 0 {
549 write!(out_words, "{word}").unwrap();
550 } else {
551 write!(out_words, "\n{word}").unwrap();
552 }
553
554 let ch = word.chars().next().unwrap() as u32;
557 let ch2 = word.chars().nth(1).unwrap_or_default() as u32;
558 if ch / 100 != range || range_c > 10 && ch2 / 100 != range_2 {
559 if ch / 100 != range {
560 range = ch / 100;
561 range_c = 0;
562 }
563 range_2 = ch2 / 100;
564 if i != 0 {
565 out_kanas.write_all(b"\n").unwrap();
568 }
569 } else {
570 range_c += 1;
571 }
572
573 write!(
574 out_kanas,
575 "&[{}],",
576 romajis
577 .into_iter()
578 .map(|romaji| format!("\"{}\"", romaji))
579 .collect::<Vec<_>>()
580 .join(",")
581 )
582 .unwrap();
583
584 }
586 write!(out_kanas, "\n]").unwrap();
589
590 println!("Words with duplicated romajis: {dup_count}");
591 println!();
592 println!("Romanizable words: {romanizable_count}");
593 println!("Partial romanizable words: {partial_romanizable_count}");
594 println!("Different romanizable words: {diff_romanizable_count}");
595 println!("Unromanizable words: {unromanizable_count}");
596 println!();
597 println!("Max word length: {max_len}");
598 assert_eq!(data::WORD_MAX_LEN, max_len);
599 println!("Romaji max length: {romaji_max_len}");
600 assert_eq!(data::WORD_ROMAJI_MAX_LEN, romaji_max_len);
601 }
602
603 #[test]
604 fn kanji() {
605 assert_eq!(
606 data::kanji_romajis('日'),
607 [
608 "a", "aki", "bi", "chi", "he", "hi", "iru", "jitsu", "ka", "kou", "ku", "kusa",
609 "nchi", "ni", "nichi", "nitsu", "su", "tachi"
610 ]
611 );
612
613 let data = HepburnRomanizer::builder().kana(true).kanji(true).build();
614 assert_eq!(data.romanize_vec("は"), vec![(3, "ha")]);
615 assert_eq!(data.romanize_vec("ハハハ"), vec![(3, "ha")]);
616 assert_eq!(data.romanize_vec("ジョジョ"), vec![(6, "jo")]);
617 assert_eq!(data.romanize_vec("って"), vec![(6, "tte")]);
618 assert_eq!(
619 data.romanize_vec("日は"),
620 [
621 "a", "aki", "bi", "chi", "he", "hi", "iru", "jitsu", "ka", "kou", "ku", "kusa",
622 "nchi", "ni", "nichi", "nitsu", "su", "tachi"
623 ]
624 .map(|romaji| (3, romaji))
625 );
626 assert_eq!(
627 data.romanize_vec("今日"),
628 vec![(3, "ima"), (3, "kin"), (3, "kon"), (3, "na")]
629 );
630 }
631
632 #[test]
633 fn word() {
634 let data = HepburnRomanizer::builder().kana(true).word(true).build();
635 assert_eq!(data.romanize_vec("は"), vec![(3, "ha")]);
636 assert_eq!(data.romanize_vec("ハハハ"), vec![(3, "ha")]);
637 assert_eq!(data.romanize_vec("ジョジョ"), vec![(6, "jo")]);
638 assert_eq!(data.romanize_vec("って"), vec![(6, "tte")]);
639 assert_eq!(data.romanize_vec("日は"), vec![]);
640 assert_eq!(data.romanize_vec("今日"), vec![(6, "kyou")]);
641
642 let data = HepburnRomanizer::builder()
643 .kana(true)
644 .kanji(true)
645 .word(true)
646 .build();
647 assert_eq!(
648 data.romanize_vec("今日"),
649 vec![(6, "kyou"), (3, "ima"), (3, "kin"), (3, "kon"), (3, "na")]
650 );
651 }
652}