use moine_core::{
damerau_distance, damerau_levenshtein_str, distance, levenshtein_str,
normalized_similarity_str, Lattice,
};
use crate::overrides::OverrideDictionary;
use crate::romaji::{romaji_lattice, romaji_paths, JaLatticeError};
use crate::unidic::{
romaji_paths_from_reading_paths, DictionaryReadingOptions, UnidicReadingIndex,
};
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct JapaneseDistance {
pub surface_levenshtein: usize,
pub surface_damerau: usize,
pub lattice: usize,
pub lattice_damerau: usize,
pub combined: usize,
}
pub fn compare_with_overrides(
left: &str,
right: &str,
overrides: &OverrideDictionary,
) -> Result<JapaneseDistance, JaLatticeError> {
let left_lattice = overrides.romaji_lattice(left)?;
let right_lattice = overrides.romaji_lattice(right)?;
Ok(compare_lattices(left, right, &left_lattice, &right_lattice))
}
pub fn compare_with_unidic_index(
left: &str,
right: &str,
index: &UnidicReadingIndex,
options: DictionaryReadingOptions,
) -> Result<JapaneseDistance, JaLatticeError> {
let left_lattice = unidic_or_direct_lattice(left, index, options)?;
let right_lattice = unidic_or_direct_lattice(right, index, options)?;
Ok(compare_lattices(left, right, &left_lattice, &right_lattice))
}
pub fn normalized_similarity_with_unidic_index(
left: &str,
right: &str,
index: &UnidicReadingIndex,
options: DictionaryReadingOptions,
) -> Result<f64, JaLatticeError> {
let left_paths = unidic_or_direct_romaji_paths(left, index, options)?;
let right_paths = unidic_or_direct_romaji_paths(right, index, options)?;
Ok(max_normalized_similarity(&left_paths, &right_paths))
}
pub fn unidic_or_direct_lattice(
input: &str,
index: &UnidicReadingIndex,
options: DictionaryReadingOptions,
) -> Result<Lattice, JaLatticeError> {
if let Ok(lattice) = romaji_lattice(input) {
return Ok(lattice);
}
if let Some(lattice) = index.romaji_lattice(input, options)? {
return Ok(lattice);
}
if let Some(lattice) = index.hybrid_romaji_lattice(input, options)? {
return Ok(lattice);
}
romaji_lattice(input)
}
pub fn unidic_or_direct_romaji_paths(
input: &str,
index: &UnidicReadingIndex,
options: DictionaryReadingOptions,
) -> Result<Vec<String>, JaLatticeError> {
if let Ok(paths) = romaji_paths(input) {
return Ok(paths);
}
let paths = index
.try_reading_paths_with_stats(input, options)
.map_err(|err| JaLatticeError::ArtifactPayload(err.to_string()))?
.paths;
if !paths.is_empty() {
return romaji_paths_from_reading_paths(&paths);
}
let paths = index
.try_hybrid_reading_paths_with_stats(input, options)
.map_err(|err| JaLatticeError::ArtifactPayload(err.to_string()))?
.paths;
if !paths.is_empty() {
return romaji_paths_from_reading_paths(&paths);
}
romaji_paths(input)
}
fn max_normalized_similarity(left_paths: &[String], right_paths: &[String]) -> f64 {
left_paths
.iter()
.flat_map(|left| {
right_paths
.iter()
.map(move |right| normalized_similarity_str(left, right))
})
.fold(0.0, f64::max)
}
fn compare_lattices(
left: &str,
right: &str,
left_lattice: &Lattice,
right_lattice: &Lattice,
) -> JapaneseDistance {
let lattice = distance(left_lattice, right_lattice);
let lattice_damerau = damerau_distance(left_lattice, right_lattice);
let surface_levenshtein = levenshtein_str(left, right);
let surface_damerau = damerau_levenshtein_str(left, right);
JapaneseDistance {
surface_levenshtein,
surface_damerau,
lattice,
lattice_damerau,
combined: surface_damerau.min(lattice),
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn combined_takes_the_better_surface_or_lattice_distance() {
let overrides = OverrideDictionary::from_entries([("印刷", ["インサツ"])]);
let distances =
compare_with_overrides("いんさt", "印刷", &overrides).expect("should compare");
assert_eq!(distances.lattice, 1);
assert_eq!(distances.lattice_damerau, 1);
assert!(distances.surface_damerau > distances.lattice);
assert_eq!(distances.combined, distances.lattice);
}
#[test]
fn lattice_damerau_counts_adjacent_romaji_transposition() {
let distances = compare_with_overrides("モイネ", "モニエ", &OverrideDictionary::default())
.expect("kana input should compare");
assert_eq!(distances.lattice, 2);
assert_eq!(distances.lattice_damerau, 1);
}
#[test]
fn unidic_index_can_compare_ascii_to_dictionary_reading() {
let csv = "\
茶,1,2,3,名詞,普通名詞,一般,*,*,*,チャ,茶,茶,チャ,茶,チャ,和
道具,1,2,3,名詞,普通名詞,一般,*,*,*,ドウグ,道具,道具,ドーグ,道具,ドーグ,漢
";
let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
let distances = compare_with_unidic_index(
"chadougu",
"茶道具",
&index,
DictionaryReadingOptions {
longest_match_only: true,
..DictionaryReadingOptions::default()
},
)
.expect("should compare");
assert_eq!(distances.lattice, 0);
assert!(distances.surface_damerau > distances.lattice);
}
#[test]
fn unidic_index_computes_candidate_pair_similarity() {
let csv = "\
印刷,1,2,3,名詞,普通名詞,サ変可能,*,*,*,インサツ,印刷,印刷,インサツ,印刷,インサツ,漢
";
let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
let similarity = normalized_similarity_with_unidic_index(
"いんさt",
"印刷",
&index,
DictionaryReadingOptions::default(),
)
.unwrap();
assert!((similarity - 6.0 / 7.0).abs() < 1e-12);
}
#[test]
fn artifact_payload_loaders_preserve_comparison_behavior() {
let csv = "\
印刷,1,2,3,名詞,普通名詞,サ変可能,*,*,*,インサツ,印刷,印刷,インサツ,印刷,インサツ,漢
";
let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
let yaml_index =
UnidicReadingIndex::from_artifact_payload(index.artifact_payload()).unwrap();
let mut binary = Vec::new();
index.write_artifact_binary_payload(&mut binary).unwrap();
let binary_index =
UnidicReadingIndex::from_binary_artifact_payload_reader(binary.as_slice()).unwrap();
let options = DictionaryReadingOptions::default();
let csv_distances = compare_with_unidic_index("いんさt", "印刷", &index, options).unwrap();
let yaml_distances =
compare_with_unidic_index("いんさt", "印刷", &yaml_index, options).unwrap();
let binary_distances =
compare_with_unidic_index("いんさt", "印刷", &binary_index, options).unwrap();
assert_eq!(yaml_distances, csv_distances);
assert_eq!(binary_distances, csv_distances);
}
#[test]
fn unidic_comparison_keeps_ascii_as_identity_path() {
let csv = "\
c,1,2,3,記号,文字,*,*,*,*,シー,c,c,シー,c,シー,外
h,1,2,3,記号,文字,*,*,*,*,エイチ,h,h,エイチ,h,エイチ,外
a,1,2,3,記号,文字,*,*,*,*,エー,a,a,エー,a,エー,外
d,1,2,3,記号,文字,*,*,*,*,ディー,d,d,ディー,d,ディー,外
o,1,2,3,記号,文字,*,*,*,*,オー,o,o,オー,o,オー,外
u,1,2,3,記号,文字,*,*,*,*,ユー,u,u,ユー,u,ユー,外
g,1,2,3,記号,文字,*,*,*,*,ジー,g,g,ジー,g,ジー,外
";
let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
let lattice =
unidic_or_direct_lattice("chadougu", &index, DictionaryReadingOptions::default())
.unwrap();
let trace = moine_core::distance_with_trace(&lattice, &Lattice::from_paths(["chadougu"]));
assert_eq!(trace.distance, 0);
}
#[test]
fn hybrid_lattice_allows_dictionary_prefix_and_direct_tail() {
let csv = "\
印,1,2,3,名詞,普通名詞,一般,*,*,*,イン,印,印,イン,印,イン,漢
";
let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
let lattice =
unidic_or_direct_lattice("印さt", &index, DictionaryReadingOptions::default()).unwrap();
let trace = moine_core::distance_with_trace(&lattice, &Lattice::from_paths(["insat"]));
assert_eq!(trace.distance, 0);
}
#[test]
fn hybrid_lattice_allows_direct_prefix_and_dictionary_tail() {
let csv = "\
具,1,2,3,名詞,普通名詞,一般,*,*,*,グ,具,具,グ,具,グ,漢
";
let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
let lattice =
unidic_or_direct_lattice("chadou具", &index, DictionaryReadingOptions::default())
.unwrap();
let trace = moine_core::distance_with_trace(&lattice, &Lattice::from_paths(["chadougu"]));
assert_eq!(trace.distance, 0);
}
#[test]
fn hybrid_lattice_supports_mixed_middle_ascii() {
let csv = "\
東,1,2,3,名詞,普通名詞,一般,*,*,*,トウ,東,東,トウ,東,トウ,漢
";
let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
let lattice =
unidic_or_direct_lattice("東kょう", &index, DictionaryReadingOptions::default())
.unwrap();
let trace = moine_core::distance_with_trace(&lattice, &Lattice::from_paths(["toukyou"]));
assert_eq!(trace.distance, 0);
}
#[test]
fn hybrid_lattice_does_not_guess_unknown_kanji() {
let index = UnidicReadingIndex::default();
let err = unidic_or_direct_lattice("未知z", &index, DictionaryReadingOptions::default())
.unwrap_err();
assert!(matches!(
err,
JaLatticeError::UnsupportedChar {
ch: '未', index: 0
}
));
}
#[test]
fn hybrid_comparison_handles_dictionary_and_ascii_mixture() {
let csv = "\
鬼滅,1,2,3,名詞,普通名詞,一般,*,*,*,キメツ,鬼滅,鬼滅,キメツ,鬼滅,キメツ,固
刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和
";
let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
let distances = compare_with_unidic_index(
"鬼滅のyaiba",
"鬼滅の刃",
&index,
DictionaryReadingOptions {
longest_match_only: true,
..DictionaryReadingOptions::default()
},
)
.unwrap();
assert_eq!(distances.lattice, 0);
}
}