ib_romaji/
lib.rs

1//! A fast Japanese romanizer.
2//!
3//! ## Usage
4//! ```rust
5//! use ib_romaji::HepburnRomanizer;
6//!
7//! let romanizer = HepburnRomanizer::default();
8//!
9//! let mut romajis = Vec::new();
10//! romanizer.romanize_and_try_for_each("日本語", |len, romaji| {
11//!     romajis.push((len, romaji));
12//!     None::<()>
13//! });
14//! assert_eq!(romajis, vec![(9, "nippongo"), (3, "a"), (3, "aki"), (3, "bi"), (3, "chi"), (3, "he"), (3, "hi"), (3, "iru"), (3, "jitsu"), (3, "ka"), (3, "kou"), (3, "ku"), (3, "kusa"), (3, "nchi"), (3, "ni"), (3, "nichi"), (3, "nitsu"), (3, "su"), (3, "tachi")]);
15//!
16//! assert_eq!(romanizer.romanize_vec("日本語"), vec![(9, "nippongo"), (3, "a"), (3, "aki"), (3, "bi"), (3, "chi"), (3, "he"), (3, "hi"), (3, "iru"), (3, "jitsu"), (3, "ka"), (3, "kou"), (3, "ku"), (3, "kusa"), (3, "nchi"), (3, "ni"), (3, "nichi"), (3, "nitsu"), (3, "su"), (3, "tachi")]);
17//! ```
18//!
19//! ## Binary size
20//! The dictionary will take ~4.8 MiB (5.5 MiB without compression) in the binary at the moment.
21//!
22//! ## Design
23//! `&[&str]` makes each string occupy 16 extra bytes to store its pointer and length, whereas a `CStr`-style terminated string needs only 1 extra byte per string.
24//! - For words, this can save 3.14 MiB (actually 3.54 MiB).
25//!   - Source file: 2.98 MiB -> `\0`+`\`: 2.80 MiB, `\n`: 2.54 MiB
26//!   - `build()` time: `split()`/memchr +10%
27//! - Stored this way, the strings can also be compressed and then decompressed as a stream.
28//!
29//! ## Features
30#![cfg_attr(docsrs, feature(doc_cfg))]
31#![cfg_attr(feature = "doc", doc = document_features::document_features!())]
32use bon::bon;
33use daachorse::{CharwiseDoubleArrayAhoCorasick, CharwiseDoubleArrayAhoCorasickBuilder, MatchKind};
34
35use ib_unicode::str::RoundCharBoundaryExt;
36
37#[cfg(feature = "cache")]
38pub mod cache;
39pub mod data;
40
41/// [Hepburn romanization](https://en.wikipedia.org/wiki/Hepburn_romanization)
42#[derive(Clone)]
43pub struct HepburnRomanizer {
44    // ac: AhoCorasick,
45    ac: CharwiseDoubleArrayAhoCorasick<u32>,
46    kanji: bool,
47}
48
49#[bon]
50impl HepburnRomanizer {
51    /// [`HepburnRomanizer::default()`]
52    #[builder(builder_type = HepburnRomanizerBuilder, state_mod(vis = "pub(crate)"))]
53    pub fn new(
54        #[builder(default = false, getter(vis = "pub(crate)"))] kana: bool,
55        #[builder(default = false, getter(vis = "pub(crate)"))] kanji: bool,
56        #[builder(default = false, getter(vis = "pub(crate)"))] word: bool,
57    ) -> Self {
58        // // let start = UnsafeCell::new(0);
59        // let mut start = 0;
60        // let words = memchr::memchr_iter(b'\n', data::WORDS.as_bytes()).map(|end| {
61        //     // let start = start.get();
62        //     // let word = unsafe { str::from_raw_parts(data::WORDS.as_ptr().add(start), end - start) };
63        //     let word = unsafe { data::WORDS.get_unchecked(start..end) };
64        //     start = end + 1;
65        //     word
66        // });
67        // // chain() will make the iterator significantly slower
68        // // .chain(iter::once(unsafe {
69        // //     data::WORDS.get_unchecked(*start.get()..)
70        // // }));
71
72        // memchr is as fast as std, but harder to work with
73        #[cfg(not(feature = "compress-words"))]
74        let words = data::WORDS.split('\n');
75        #[cfg(feature = "compress-words")]
76        let words = include_bytes_zstd::include_bytes_zstd!("src/data/words.in.txt", 22);
77        #[cfg(feature = "compress-words")]
78        let words = words
79            .split(|&b| b == b'\n')
80            .map(|b| unsafe { str::from_utf8_unchecked(b) });
81
82        // let mut ac = AhoCorasick::builder();
83        // ac.start_kind(StartKind::Anchored)
84        //     .match_kind(MatchKind::LeftmostLongest);
85        // let ac = match (kana, word) {
86        //     (true, true) => ac.build(data::kana::HEPBURN_KANAS.iter().cloned().chain(words)),
87        //     (true, false) => ac.build(data::kana::HEPBURN_KANAS),
88        //     (false, true) => ac.build(words),
89        //     (false, false) => ac.build::<_, &str>([]),
90        // }
91        // .unwrap();
92
93        let ac =
94            CharwiseDoubleArrayAhoCorasickBuilder::new().match_kind(MatchKind::LeftmostLongest);
95        let ac = match (kana, word) {
96            (true, true) => ac.build(data::kana::HEPBURN_KANAS.iter().cloned().chain(words)),
97            (true, false) => ac.build(data::kana::HEPBURN_KANAS),
98            (false, true) => ac.build(words),
99            (false, false) => ac.build([] as [&str; 0]),
100        }
101        .unwrap();
102
103        Self { ac, kanji }
104    }
105
106    /// Romanize the first kana in the string, and return the length of the kana and the romaji.
107    ///
108    /// ## Example
109    /// ```
110    /// use ib_romaji::HepburnRomanizer;
111    ///
112    /// assert_eq!(HepburnRomanizer::builder().kana(true).build().romanize_kana("あ"), Some((3, "a")));
113    /// ```
114    /// TODO: Iter
115    pub fn romanize_kana<S: ?Sized + AsRef<str>>(&self, s: &S) -> Option<(usize, &'static str)> {
116        let s = s.as_ref();
117        let s = &s[..s.floor_char_boundary_ib(data::kana::KANA_MAX_LEN)];
118        // let m = self.ac.find(Input::new(s).anchored(Anchored::Yes))?;
119        // let pattern = m.pattern().as_usize();
120        let m = self
121            .ac
122            .leftmost_find_iter(s)
123            .next()
124            .filter(|m| m.start() == 0)?;
125        let pattern = m.value() as usize;
126        let len = m.end() - m.start();
127        data::kana::HEPBURN_ROMAJIS
128            .get(pattern)
129            .map(|&romaji| (len, romaji))
130    }
131
132    /// Romanize kanas from the beginning of the string until a non-kana character, and return the length of the kanas and the romajis.
133    pub fn romanize_kana_str<S: ?Sized + AsRef<str>>(&self, s: &S) -> Option<(usize, String)> {
134        let s = s.as_ref();
135        let mut len = 0;
136        let mut buf = String::new();
137        while let Some((l, romaji)) = self.romanize_kana(&s[len..]).or_else(|| {
138            if s[len..].starts_with("、") {
139                Some((3, "、"))
140            } else {
141                None
142            }
143        }) {
144            len += l;
145            buf.push_str(romaji);
146            if len >= s.len() {
147                return Some((len, buf));
148            }
149        }
150        if len == 0 { None } else { Some((len, buf)) }
151    }
152
153    /// Romanize kana text to romajis. Returns `None` if there is any non-kana character in the string.
154    pub fn romanize_kana_str_all<S: ?Sized + AsRef<str>>(&self, s: &S) -> Option<String> {
155        let s = s.as_ref();
156        match self.romanize_kana_str(s) {
157            Some((len, buf)) if len == s.len() => Some(buf),
158            _ => None,
159        }
160    }
161
162    /// Romanize the first word in the string, and call `f` for each possible romanization.
163    ///
164    /// `f` can return `Some(_)` to stop the iteration, or `None` to continue.
165    ///
166    /// ## Example
167    /// ```
168    /// use ib_romaji::HepburnRomanizer;
169    ///
170    /// let mut romajis = Vec::new();
171    /// HepburnRomanizer::default().romanize_and_try_for_each("日本語", |len, romaji| {
172    ///     romajis.push((len, romaji));
173    ///     None::<()>
174    /// });
175    /// assert_eq!(romajis, vec![(9, "nippongo"), (3, "a"), (3, "aki"), (3, "bi"), (3, "chi"), (3, "he"), (3, "hi"), (3, "iru"), (3, "jitsu"), (3, "ka"), (3, "kou"), (3, "ku"), (3, "kusa"), (3, "nchi"), (3, "ni"), (3, "nichi"), (3, "nitsu"), (3, "su"), (3, "tachi")]);
176    /// ```
177    ///
178    /// ## See also
179    /// [`romanize_vec()`](Self::romanize_vec) for a version that returns a `Vec` of all possible romanizations.
180    pub fn romanize_and_try_for_each<S: ?Sized + AsRef<str>, T>(
181        &self,
182        s: &S,
183        mut f: impl FnMut(usize, &'static str) -> Option<T>,
184    ) -> Option<T> {
185        let s = s.as_ref();
186        let s = &s[..s.floor_char_boundary_ib(data::WORD_MAX_LEN)];
187
188        // self.ac.find(Input::new(s).anchored(Anchored::Yes))
189        if let Some(m) = self
190            .ac
191            .leftmost_find_iter(s)
192            .next()
193            .filter(|m| m.start() == 0)
194        {
195            // let pattern = m.pattern().as_usize();
196            let pattern = m.value() as usize;
197            let len = m.end() - m.start();
198            if pattern < data::kana::HEPBURN_ROMAJIS.len() {
199                let romaji = data::kana::HEPBURN_ROMAJIS[pattern];
200                if let Some(result) = f(len, romaji) {
201                    return Some(result);
202                }
203            } else if pattern < data::kana::HEPBURN_ROMAJIS.len() + data::WORD_ROMAJIS.len() {
204                // TODO: Binary search
205                for romaji in data::WORD_ROMAJIS[pattern - data::kana::HEPBURN_ROMAJIS.len()] {
206                    if let Some(result) = f(len, romaji) {
207                        return Some(result);
208                    }
209                }
210            }
211        }
212
213        if self.kanji {
214            // let s = unsafe { str::from_utf8_unchecked(s) };
215            if let Some(kanji) = s.chars().next() {
216                // TODO: Binary search
217                for romaji in data::kanji_romajis(kanji) {
218                    // TODO: Always 3?
219                    if let Some(result) = f(kanji.len_utf8(), romaji) {
220                        return Some(result);
221                    }
222                }
223            }
224        }
225
226        None
227    }
228
229    /// Romanize the first word in the string, and return a `Vec` for all possible romanization.
230    ///
231    /// ## Example
232    /// ```
233    /// use ib_romaji::HepburnRomanizer;
234    ///
235    /// assert_eq!(HepburnRomanizer::default().romanize_vec("日本語"), vec![(9, "nippongo"), (3, "a"), (3, "aki"), (3, "bi"), (3, "chi"), (3, "he"), (3, "hi"), (3, "iru"), (3, "jitsu"), (3, "ka"), (3, "kou"), (3, "ku"), (3, "kusa"), (3, "nchi"), (3, "ni"), (3, "nichi"), (3, "nitsu"), (3, "su"), (3, "tachi")]);
236    /// ```
237    pub fn romanize_vec<S: ?Sized + AsRef<str>>(&self, s: &S) -> Vec<(usize, &'static str)> {
238        let mut results = Vec::new();
239        self.romanize_and_try_for_each(s, |len, romaji| {
240            results.push((len, romaji));
241            None::<()>
242        });
243        results
244    }
245
246    /// Check if the string can be fully romanized.
247    ///
248    /// This function can be used to test if the string is a possible Japanese text or not.
249    pub fn is_romanizable<S: ?Sized + AsRef<str>>(&self, s: &S) -> bool {
250        let s = s.as_ref();
251        if s.is_empty() {
252            return true;
253        }
254        self.romanize_and_try_for_each(s, |len, _| self.is_romanizable(&s[len..]).then_some(()))
255            .is_some()
256    }
257
258    /// Check if the string can be fully romanized to the given romaji.
259    pub fn is_romanizable_to<S: ?Sized + AsRef<str>>(&self, s: &S, romaji: &S) -> bool {
260        let s = s.as_ref();
261        let romaji = romaji.as_ref();
262        if s.is_empty() {
263            return romaji.is_empty();
264        }
265        self.romanize_and_try_for_each(s, |len, word_romaji| {
266            self.is_romanizable_to(&s[len..], romaji.strip_prefix(word_romaji)?)
267                .then_some(())
268        })
269        .is_some()
270    }
271}
272
273impl Default for HepburnRomanizer {
274    fn default() -> Self {
275        Self::builder().kana(true).kanji(true).word(true).build()
276    }
277}
278
#[cfg(test)]
mod tests {
    use std::{
        fs,
        io::{BufWriter, Write},
    };

    use indexmap::IndexSet;

    use super::*;

    /// `KANA_MIN_LEN` must equal the shortest kana pattern, and `MIN_LEN` must not exceed
    /// the kana or kanji minimum.
    #[test]
    fn min_len() {
        let min_len = data::kana::HEPBURN_KANAS
            .iter()
            .inspect(|kana| {
                if kana.len() == data::kana::KANA_MIN_LEN {
                    println!("{}", kana);
                }
            })
            .map(|s| s.len())
            .min()
            .unwrap();
        assert_eq!(data::kana::KANA_MIN_LEN, min_len);

        assert!(data::MIN_LEN <= data::kana::KANA_MIN_LEN);
        assert!(data::MIN_LEN <= data::KANJI_MIN_LEN);
    }

    /// `KANA_MAX_LEN` / `KANA_ROMAJI_MAX_LEN` must equal the longest kana pattern / romaji.
    #[test]
    fn kana_max_len() {
        let max_len = data::kana::HEPBURN_KANAS
            .iter()
            .inspect(|kana| {
                if kana.len() == data::kana::KANA_MAX_LEN {
                    println!("{}", kana);
                }
            })
            .map(|s| s.len())
            .max()
            .unwrap();
        assert_eq!(data::kana::KANA_MAX_LEN, max_len);

        let max_len = data::kana::HEPBURN_ROMAJIS
            .iter()
            .inspect(|romaji| {
                if romaji.len() == data::kana::KANA_ROMAJI_MAX_LEN {
                    println!("{}", romaji);
                }
            })
            .map(|s| s.len())
            .max()
            .unwrap();
        assert_eq!(data::kana::KANA_ROMAJI_MAX_LEN, max_len);
    }

    /// Single-kana romanization with a kana-only romanizer.
    #[test]
    fn kana() {
        let data = HepburnRomanizer::builder().kana(true).build();
        assert_eq!(data.romanize_kana("は"), Some((3, "ha")));
        assert_eq!(data.romanize_kana("ハハハ"), Some((3, "ha")));
        assert_eq!(data.romanize_kana("ジョジョ"), Some((6, "jo")));
        assert_eq!(data.romanize_kana("って"), Some((6, "tte")));
        assert_eq!(data.romanize_kana("日は"), None);
    }

    /// Kana-run romanization with a kana-only romanizer.
    #[test]
    fn kana_str() {
        let data = HepburnRomanizer::builder().kana(true).build();
        assert_eq!(data.romanize_kana_str("は"), Some((3, "ha".into())));
        assert_eq!(data.romanize_kana_str("ハハハ"), Some((9, "hahaha".into())));
        assert_eq!(
            data.romanize_kana_str("ジョジョ"),
            Some((12, "jojo".into()))
        );
        assert_eq!(data.romanize_kana_str("って"), Some((6, "tte".into())));
        assert_eq!(data.romanize_kana_str("日は"), None);
    }

    /// Matching a full string against an expected romaji, without the word dictionary.
    #[test]
    fn is_romanizable_to() {
        let data = HepburnRomanizer::builder().kana(true).kanji(true).build();
        assert!(data.is_romanizable_to("は", "ha"));
        assert!(data.is_romanizable_to("ハハハ", "hahaha"));
        assert!(data.is_romanizable_to("ジョジョ", "jojo"));
        assert!(data.is_romanizable_to("って", "tte"));
        assert!(data.is_romanizable_to("日は", "hiha"));
        assert!(data.is_romanizable_to("日は", "kusaha"));
        assert!(!data.is_romanizable_to("今日", "kyou"));
        assert!(data.is_romanizable_to("今日", "imakusa"));
    }

    /// Generates `src/data/kanjis.rs` (the body of a `match kanji { .. }`) from
    /// `data/kanjidic.csv`, whose lines are `kanji\tkana\tkana...`.
    #[ignore]
    #[test]
    fn codegen_kanji() {
        let romanizer = HepburnRomanizer::builder().kana(true).build();

        let mut dup_count = 0;
        let mut romaji_max_len = 0;

        let kanjidic = fs::read_to_string("data/kanjidic.csv").unwrap();
        let mut out_kanjis = BufWriter::new(fs::File::create("src/data/kanjis.rs").unwrap());
        writeln!(out_kanjis, "match kanji {{").unwrap();
        let mut range = 0;
        for line in kanjidic.lines() {
            // Lines without a tab carry no readings; skip them.
            let Some((kanji, kanas)) = line.split_once('\t') else {
                continue;
            };

            write!(out_kanjis, "'{kanji}'=>").unwrap();

            let kanas_count = kanas.split('\t').count();
            // Romanize every reading and wrap it in quotes; the set removes readings that
            // collapse to the same romaji.
            let mut kanas_set: IndexSet<String> = kanas
                .split('\t')
                .map(|kana| match romanizer.romanize_kana_str_all(kana) {
                    Some(romaji) => format!("\"{}\"", romaji),
                    None => {
                        println!("Failed to romanize kana: {kana}");
                        kana.into()
                    }
                })
                .collect();
            kanas_set.sort_unstable();
            if kanas_set.len() != kanas_count {
                dup_count += 1;
            }

            assert!(
                data::KANJI_LEN.contains(&kanji.len()),
                "{kanji} {}",
                kanji.len()
            );
            {
                let max_len = kanas_set.iter().map(|s| s.len()).max().unwrap();
                if max_len > romaji_max_len {
                    romaji_max_len = max_len;
                }
                if max_len == data::KANJI_ROMAJI_MAX_LEN {
                    println!("Max len romaji: {kanji} {kanas_set:?}");
                }
            }

            write!(
                out_kanjis,
                "&[{}],",
                kanas_set.into_iter().collect::<Vec<_>>().join(",")
            )
            .unwrap();

            // Natural align: break the line whenever the code point enters a new block of 10.
            let c = kanji.chars().next().unwrap() as u32;
            if c / 10 != range {
                range = c / 10;
                out_kanjis.write_all(b"\n").unwrap();
            }
        }
        write!(out_kanjis, "_ => &[]\n}}").unwrap();
        // Flush explicitly: errors during the implicit flush on drop would be swallowed.
        out_kanjis.flush().unwrap();

        println!("Kanjis with duplicated romajis: {dup_count}");
        println!("Romaji max len: {romaji_max_len}");
        assert_eq!(romaji_max_len, data::KANJI_ROMAJI_MAX_LEN);
    }

    /// Generates `src/data/words.in.txt` and `src/data/word_kanas.rs` from `data/jmdict.csv`,
    /// whose lines are `word\tkana\tkana...`.
    ///
    /// `codegen_kanji()` should be run first.
    ///
    /// `cargo test --package ib-romaji --lib -r -- tests::codegen_word --exact --no-capture --ignored > data/word.txt`
    #[ignore]
    #[test]
    fn codegen_word() {
        let romanizer = HepburnRomanizer::builder().kana(true).build();
        let kanji_romanizer = HepburnRomanizer::builder().kana(true).kanji(true).build();

        let mut dup_count = 0;
        let mut romanizable_count = 0;
        let mut partial_romanizable_count = 0;
        let mut diff_romanizable_count = 0;
        let mut unromanizable_count = 0;
        let mut max_len = 0;
        let mut romaji_max_len = 0;

        let jmdict = fs::read_to_string("data/jmdict.csv").unwrap();
        let mut out_words = BufWriter::new(fs::File::create("src/data/words.in.txt").unwrap());
        let mut out_kanas = BufWriter::new(fs::File::create("src/data/word_kanas.rs").unwrap());
        writeln!(out_kanas, "&[").unwrap();
        // Number of words written so far. Separators depend on this rather than on the input
        // line index, because skipped lines must not leave an empty first entry behind.
        let mut emitted = 0usize;
        let mut range = 0;
        let mut range_c = 0;
        let mut range_2 = 0;
        for line in jmdict.lines() {
            let Some((word, kanas)) = line.split_once('\t') else {
                continue;
            };

            let kanas_count = kanas.split('\t').count();
            let kanas_set: IndexSet<String> = kanas
                .split('\t')
                .map(|kana| match romanizer.romanize_kana_str_all(kana) {
                    Some(romaji) => romaji,
                    None => {
                        println!("Failed to romanize kana: {kana}");
                        kana.into()
                    }
                })
                .collect();
            if kanas_set.len() != kanas_count {
                dup_count += 1;
            }

            // Filter out ordinary words
            // Source file: 2.52+3.59=6.11 MiB -> 1.07+1.45=2.52 MiB
            // Binary: -10.01 MiB
            // TODO: What if the dependent word is in words?
            let mut romajis = if kanji_romanizer.is_romanizable(word) {
                // Keep only the readings that kana + kanji lookup cannot already produce.
                let romajis = kanas_set
                    .iter()
                    .filter(|romaji| !kanji_romanizer.is_romanizable_to(word, romaji.as_str()))
                    .cloned()
                    .collect::<Vec<_>>();
                if romajis.len() != kanas_set.len() {
                    if romajis.is_empty() {
                        romanizable_count += 1;
                        continue;
                    }
                    println!(
                        "partial: {word} -{} {kanas_set:?} -> {romajis:?}",
                        kanas_set.len() - romajis.len()
                    );
                    partial_romanizable_count += 1;
                } else {
                    println!("diff: {word} {kanas_set:?}");
                    diff_romanizable_count += 1;
                }
                romajis
            } else {
                println!("un: {word}");
                unromanizable_count += 1;
                kanas_set.into_iter().collect()
            };
            romajis.sort_unstable();

            if word.len() > max_len {
                max_len = word.len();
            }
            if word.len() == data::WORD_MAX_LEN {
                println!("Max len word: {word}");
            }
            {
                let max_len = romajis.iter().map(|s| s.len()).max().unwrap();
                if max_len > romaji_max_len {
                    romaji_max_len = max_len;
                }
                if max_len == data::WORD_ROMAJI_MAX_LEN {
                    println!("Max len romaji: {word} {romajis:?}");
                }
            }

            // Words are separated by `\n`, with no leading or trailing separator.
            if emitted == 0 {
                write!(out_words, "{word}").unwrap();
            } else {
                write!(out_words, "\n{word}").unwrap();
            }

            // Natural align: start a new output line when the first character enters a new
            // block of 100 code points, or, once a line holds more than 10 entries, when
            // the second character does.
            let ch = word.chars().next().unwrap() as u32;
            let ch2 = word.chars().nth(1).unwrap_or_default() as u32;
            if ch / 100 != range || (range_c > 10 && ch2 / 100 != range_2) {
                if ch / 100 != range {
                    range = ch / 100;
                    range_c = 0;
                }
                range_2 = ch2 / 100;
                if emitted != 0 {
                    out_kanas.write_all(b"\n").unwrap();
                }
            } else {
                range_c += 1;
            }

            write!(
                out_kanas,
                "&[{}],",
                romajis
                    .into_iter()
                    .map(|romaji| format!("\"{}\"", romaji))
                    .collect::<Vec<_>>()
                    .join(",")
            )
            .unwrap();

            emitted += 1;
        }
        write!(out_kanas, "\n]").unwrap();
        // Flush explicitly: errors during the implicit flush on drop would be swallowed.
        out_words.flush().unwrap();
        out_kanas.flush().unwrap();

        println!("Words with duplicated romajis: {dup_count}");
        println!();
        println!("Romanizable words: {romanizable_count}");
        println!("Partial romanizable words: {partial_romanizable_count}");
        println!("Different romanizable words: {diff_romanizable_count}");
        println!("Unromanizable words: {unromanizable_count}");
        println!();
        println!("Max word length: {max_len}");
        assert_eq!(data::WORD_MAX_LEN, max_len);
        println!("Romaji max length: {romaji_max_len}");
        assert_eq!(data::WORD_ROMAJI_MAX_LEN, romaji_max_len);
    }

    /// Kanji table lookup, and romanization with kana + kanji but no word dictionary.
    #[test]
    fn kanji() {
        assert_eq!(
            data::kanji_romajis('日'),
            [
                "a", "aki", "bi", "chi", "he", "hi", "iru", "jitsu", "ka", "kou", "ku", "kusa",
                "nchi", "ni", "nichi", "nitsu", "su", "tachi"
            ]
        );

        let data = HepburnRomanizer::builder().kana(true).kanji(true).build();
        assert_eq!(data.romanize_vec("は"), vec![(3, "ha")]);
        assert_eq!(data.romanize_vec("ハハハ"), vec![(3, "ha")]);
        assert_eq!(data.romanize_vec("ジョジョ"), vec![(6, "jo")]);
        assert_eq!(data.romanize_vec("って"), vec![(6, "tte")]);
        assert_eq!(
            data.romanize_vec("日は"),
            [
                "a", "aki", "bi", "chi", "he", "hi", "iru", "jitsu", "ka", "kou", "ku", "kusa",
                "nchi", "ni", "nichi", "nitsu", "su", "tachi"
            ]
            .map(|romaji| (3, romaji))
        );
        assert_eq!(
            data.romanize_vec("今日"),
            vec![(3, "ima"), (3, "kin"), (3, "kon"), (3, "na")]
        );
    }

    /// Romanization with the word dictionary, without and with the kanji fallback.
    #[test]
    fn word() {
        let data = HepburnRomanizer::builder().kana(true).word(true).build();
        assert_eq!(data.romanize_vec("は"), vec![(3, "ha")]);
        assert_eq!(data.romanize_vec("ハハハ"), vec![(3, "ha")]);
        assert_eq!(data.romanize_vec("ジョジョ"), vec![(6, "jo")]);
        assert_eq!(data.romanize_vec("って"), vec![(6, "tte")]);
        assert_eq!(data.romanize_vec("日は"), vec![]);
        assert_eq!(data.romanize_vec("今日"), vec![(6, "kyou")]);

        let data = HepburnRomanizer::builder()
            .kana(true)
            .kanji(true)
            .word(true)
            .build();
        assert_eq!(
            data.romanize_vec("今日"),
            vec![(6, "kyou"), (3, "ima"), (3, "kin"), (3, "kon"), (3, "na")]
        );
    }
}