use crate::error::JmdictError;
use crate::model::{
DataVersion, DeinflectionInfo, Entry, LookupResult, MatchType, Xref, FORMAT_VERSION, MAGIC,
};
use crate::query::{BatchQueryBuilder, QueryBuilder};
use fst::{automaton::Levenshtein, automaton::Str, Automaton, IntoStreamer, Map, Streamer};
use memmap2::Mmap;
use std::collections::{BTreeSet, HashMap};
use std::sync::Arc;
use std::{fs::File, path::Path};
/// Backing bytes for one dictionary component (entries blob, FST index, or postings).
///
/// Cloning is cheap for every variant: `Mmap` and `Owned` clone an `Arc`, and
/// `Static` copies a reference.
#[derive(Clone)]
pub enum DictStorage {
/// A memory-mapped file, as produced when loading components from disk.
Mmap(Arc<Mmap>),
/// Bytes with `'static` lifetime, e.g. data compiled into the binary.
Static(&'static [u8]),
/// A heap-allocated buffer shared behind an `Arc`.
Owned(Arc<Vec<u8>>),
}
impl AsRef<[u8]> for DictStorage {
fn as_ref(&self) -> &[u8] {
match self {
DictStorage::Mmap(m) => &m[..],
DictStorage::Static(s) => s,
DictStorage::Owned(v) => &v[..],
}
}
}
/// An index hit that has not yet been resolved into a full entry.
#[derive(Clone)]
pub(crate) struct MatchCandidate {
/// Entry index as stored in the FST value; passed to `Dict::load_entry`.
pub(crate) id: u64,
/// The index key (or base form, for deinflected hits) that produced the match.
pub(crate) key: String,
/// How the key matched the query.
pub(crate) match_type: MatchType,
/// Ranking score; candidate lists are sorted with higher scores first.
pub(crate) score: f64,
/// Deinflection details; populated only for deinflected matches.
pub(crate) deinflection: Option<DeinflectionInfo>,
}
/// Records `cand` in `best` unless an entry for the same id already scores at
/// least as high.
///
/// On a tie the existing candidate is kept, so the first one seen wins.
fn upsert_better(best: &mut HashMap<u64, MatchCandidate>, cand: MatchCandidate) {
    let incumbent_holds = best
        .get(&cand.id)
        .map_or(false, |current| current.score >= cand.score);
    if !incumbent_holds {
        best.insert(cand.id, cand);
    }
}
/// A loaded dictionary: the serialized entries plus the indexes used to find them.
pub struct Dict {
/// Header, `u32` entry count, per-entry `(u32 offset, u32 length)` table, then
/// the serialized entry data (see `load_entry`).
pub entries_blob: DictStorage,
/// Maps kana keys to entry indexes.
pub kana_fst: Map<DictStorage>,
/// Maps kanji keys to entry indexes.
pub kanji_fst: Map<DictStorage>,
/// Maps romaji keys to entry indexes.
pub romaji_fst: Map<DictStorage>,
/// Maps JMdict id strings to entry indexes (see `lookup_by_id`).
pub id_fst: Map<DictStorage>,
/// Maps gloss tokens to byte offsets inside `gloss_postings`.
pub gloss_fst: Map<DictStorage>,
/// Posting lists: at each offset a little-endian `u32` count followed by that
/// many little-endian `u64` entry indexes.
pub gloss_postings: DictStorage,
/// Deinflector used to fall back to base forms when no exact match exists.
deinflector: bunpo::deinflector::Deinflector,
/// Version metadata parsed from the entries header.
data_version: DataVersion,
/// Byte offset of the `u32` entry-count field inside `entries_blob`.
header_size: usize,
/// Number of entries declared by the entries header.
entry_count: u32,
}
/// Result of parsing the fixed header at the start of the entries blob.
struct HeaderInfo {
/// Format version plus the JMdict version and generation timestamp strings.
data_version: DataVersion,
/// Byte offset at which the `u32` entry-count field starts.
header_size: usize,
/// Number of entries declared by the header.
entry_count: u32,
}
/// Parses the header at the start of the entries blob.
///
/// Layout (all integers little-endian):
/// `MAGIC (4 bytes) | format version (u32) | len (u16) + JMdict version bytes |
/// len (u16) + generated-at bytes | entry count (u32)`.
///
/// The returned `header_size` is the offset of the entry-count field.
///
/// # Errors
///
/// * `JmdictError::DataCorrupted` when the magic does not match or the buffer
///   ends before any field is complete.
/// * `JmdictError::DataVersionMismatch` when the format version differs from
///   `FORMAT_VERSION`.
fn parse_entries_header(data: &[u8]) -> Result<HeaderInfo, JmdictError> {
    /// Returns `len` bytes starting at `at`, or `DataCorrupted` if the buffer is too short.
    fn take(buf: &[u8], at: usize, len: usize) -> Result<&[u8], JmdictError> {
        buf.get(at..at + len).ok_or(JmdictError::DataCorrupted)
    }

    /// Reads a little-endian `u16` length prefix at `at`.
    fn read_len(buf: &[u8], at: usize) -> Result<usize, JmdictError> {
        let raw = take(buf, at, 2)?;
        Ok(usize::from(u16::from_le_bytes([raw[0], raw[1]])))
    }

    /// Reads a little-endian `u32` at `at`.
    fn read_u32(buf: &[u8], at: usize) -> Result<u32, JmdictError> {
        let raw = take(buf, at, 4)?;
        Ok(u32::from_le_bytes([raw[0], raw[1], raw[2], raw[3]]))
    }

    if take(data, 0, 4)? != MAGIC {
        return Err(JmdictError::DataCorrupted);
    }

    let format_version = read_u32(data, 4)?;
    if format_version != FORMAT_VERSION {
        return Err(JmdictError::DataVersionMismatch {
            expected: FORMAT_VERSION,
            found: format_version,
        });
    }

    // Two length-prefixed strings follow; invalid UTF-8 is replaced, not rejected.
    let version_len = read_len(data, 8)?;
    let mut cursor = 10;
    let jmdict_version = String::from_utf8_lossy(take(data, cursor, version_len)?).into_owned();
    cursor += version_len;

    let generated_len = read_len(data, cursor)?;
    cursor += 2;
    let generated_at = String::from_utf8_lossy(take(data, cursor, generated_len)?).into_owned();
    cursor += generated_len;

    // `cursor` now sits on the entry-count field, which is where the header ends.
    let entry_count = read_u32(data, cursor)?;

    Ok(HeaderInfo {
        data_version: DataVersion {
            format_version,
            jmdict_version,
            generated_at,
        },
        header_size: cursor,
        entry_count,
    })
}
/// Binary-searches a posting list for `id`.
///
/// `bytes` is read as consecutive little-endian `u64` values, which must be in
/// ascending order for the search to be meaningful. Trailing bytes that do not
/// form a full 8-byte value are ignored.
fn postings_contains(bytes: &[u8], id: u64) -> bool {
    use std::cmp::Ordering;

    // Decodes the `slot`-th 8-byte value of the list.
    let value_at = |slot: usize| -> u64 {
        let mut raw = [0u8; 8];
        raw.copy_from_slice(&bytes[slot * 8..(slot + 1) * 8]);
        u64::from_le_bytes(raw)
    };

    let (mut left, mut right) = (0usize, bytes.len() / 8);
    while left < right {
        let probe = left + (right - left) / 2;
        match value_at(probe).cmp(&id) {
            Ordering::Equal => return true,
            Ordering::Less => left = probe + 1,
            Ordering::Greater => right = probe,
        }
    }
    false
}
/// Opens `path` and memory-maps it as a [`DictStorage::Mmap`].
///
/// # Errors
///
/// Propagates any error from opening or mapping the file.
fn mmap_storage(path: &Path) -> Result<DictStorage, JmdictError> {
    let handle = File::open(path)?;
    // SAFETY: NOTE(review) — `Mmap::map` is unsafe because the mapped bytes can
    // change if the file is modified or truncated while mapped. This presumes
    // dictionary files are not rewritten in place during use; confirm with
    // whatever produces them.
    let mapping = unsafe { Mmap::map(&handle) }?;
    Ok(DictStorage::Mmap(Arc::new(mapping)))
}
impl Dict {
/// Builds a dictionary from seven `'static` byte slices, one per component.
///
/// Each slice is wrapped in [`DictStorage::Static`] and passed to
/// [`Dict::from_storage`], which performs the validation.
///
/// # Errors
///
/// Returns whatever [`Dict::from_storage`] returns.
// One argument per dictionary component; grouping them would change the public signature.
#[allow(clippy::too_many_arguments)]
pub fn from_slices(
    entries: &'static [u8],
    kana_fst: &'static [u8],
    kanji_fst: &'static [u8],
    romaji_fst: &'static [u8],
    id_fst: &'static [u8],
    gloss_fst: &'static [u8],
    gloss_postings: &'static [u8],
) -> Result<Self, JmdictError> {
    let [entries, kana_fst, kanji_fst, romaji_fst, id_fst, gloss_fst, gloss_postings] = [
        entries,
        kana_fst,
        kanji_fst,
        romaji_fst,
        id_fst,
        gloss_fst,
        gloss_postings,
    ]
    .map(DictStorage::Static);
    Self::from_storage(
        entries,
        kana_fst,
        kanji_fst,
        romaji_fst,
        id_fst,
        gloss_fst,
        gloss_postings,
    )
}
#[allow(clippy::too_many_arguments)]
pub fn from_storage(
entries: DictStorage,
kana_fst: DictStorage,
kanji_fst: DictStorage,
romaji_fst: DictStorage,
id_fst: DictStorage,
gloss_fst: DictStorage,
gloss_postings: DictStorage,
) -> Result<Self, JmdictError> {
let header = parse_entries_header(entries.as_ref())?;
Ok(Self {
entries_blob: entries,
kana_fst: Map::new(kana_fst)?,
kanji_fst: Map::new(kanji_fst)?,
romaji_fst: Map::new(romaji_fst)?,
id_fst: Map::new(id_fst)?,
gloss_fst: Map::new(gloss_fst)?,
gloss_postings,
deinflector: bunpo::deinflector::Deinflector::new(),
data_version: header.data_version,
header_size: header.header_size,
entry_count: header.entry_count,
})
}
/// Loads a dictionary by memory-mapping the component files in `base_dir`.
///
/// Expects `entries.bin`, `kana.fst`, `kanji.fst`, `romaji.fst`, `id.fst`,
/// `gloss.fst` and `gloss_postings.bin` inside the directory.
///
/// # Errors
///
/// Returns the first error from opening or mapping a file, or any error from
/// [`Dict::from_storage`].
pub fn load<P: AsRef<Path>>(base_dir: P) -> Result<Self, JmdictError> {
    let base = base_dir.as_ref();
    let open = |file_name: &str| mmap_storage(&base.join(file_name));

    let entries = open("entries.bin")?;
    let kana = open("kana.fst")?;
    let kanji = open("kanji.fst")?;
    let romaji = open("romaji.fst")?;
    let id = open("id.fst")?;
    let gloss = open("gloss.fst")?;
    let gloss_postings = open("gloss_postings.bin")?;

    Self::from_storage(entries, kana, kanji, romaji, id, gloss, gloss_postings)
}
/// Builds a dictionary from component files compiled into the binary.
///
/// The files are read from `OUT_DIR` at compile time via `include_bytes!`.
/// Only available with the `embedded` feature.
///
/// # Errors
///
/// Returns whatever [`Dict::from_slices`] returns.
#[cfg(feature = "embedded")]
pub fn load_embedded() -> Result<Self, JmdictError> {
    Self::from_slices(
        include_bytes!(concat!(env!("OUT_DIR"), "/entries.bin")),
        include_bytes!(concat!(env!("OUT_DIR"), "/kana.fst")),
        include_bytes!(concat!(env!("OUT_DIR"), "/kanji.fst")),
        include_bytes!(concat!(env!("OUT_DIR"), "/romaji.fst")),
        include_bytes!(concat!(env!("OUT_DIR"), "/id.fst")),
        include_bytes!(concat!(env!("OUT_DIR"), "/gloss.fst")),
        include_bytes!(concat!(env!("OUT_DIR"), "/gloss_postings.bin")),
    )
}
/// Loads a dictionary from the first available default source.
///
/// Sources are tried in this order:
/// 1. Embedded data, when the `embedded` feature is enabled and loading succeeds.
/// 2. The directory named by the `JMDICT_DATA` environment variable, if set.
/// 3. `./dist`, if it contains `entries.bin`.
/// 4. In test builds only, `../dist` relative to `CARGO_MANIFEST_DIR`.
///
/// If none apply, `./dist` is loaded anyway so the caller receives the
/// underlying I/O error.
///
/// # Errors
///
/// Returns the error from [`Dict::load`] for the source that was selected.
pub fn load_default() -> Result<Self, JmdictError> {
    #[cfg(feature = "embedded")]
    {
        // Best-effort: a failed embedded load falls through to the on-disk sources.
        if let Ok(dict) = Self::load_embedded() {
            return Ok(dict);
        }
    }
    // `var_os` rather than `var`: a path that is not valid Unicode is still a
    // valid path and must not be silently ignored.
    if let Some(data_path) = std::env::var_os("JMDICT_DATA") {
        return Self::load(Path::new(&data_path));
    }
    let dist = Path::new("dist");
    if dist.join("entries.bin").exists() {
        return Self::load(dist);
    }
    #[cfg(test)]
    {
        let workspace_dist = Path::new(env!("CARGO_MANIFEST_DIR")).join("../dist");
        if workspace_dist.join("entries.bin").exists() {
            return Self::load(&workspace_dist);
        }
    }
    Self::load(dist)
}
/// Returns the number of entries declared by the entries header.
pub fn entry_count(&self) -> usize {
self.entry_count as usize
}
/// Returns a copy of the version metadata parsed from the entries header.
pub fn version(&self) -> DataVersion {
self.data_version.clone()
}
/// Returns the entries whose kana, kanji, or romaji key equals `term` exactly.
pub fn lookup_exact(&self, term: &str) -> Vec<LookupResult> {
self.lookup_exact_inner(term)
}
/// Collects exact-match candidates for `term` and resolves them into results.
fn lookup_exact_inner(&self, term: &str) -> Vec<LookupResult> {
    let candidates = self.exact_candidates(term);
    self.candidates_to_results(candidates)
}
pub(crate) fn exact_candidates(&self, term: &str) -> Vec<MatchCandidate> {
let mut ids = Vec::new();
if let Some(id) = self.kana_fst.get(term) {
ids.push(id);
}
if let Some(id) = self.kanji_fst.get(term) {
ids.push(id);
}
if let Some(id) = self.romaji_fst.get(term) {
ids.push(id);
}
ids.sort();
ids.dedup();
ids.into_iter()
.map(|id| MatchCandidate {
id,
key: term.to_string(),
match_type: MatchType::Exact,
score: 1.0,
deinflection: None,
})
.collect()
}
/// Returns exact matches for `term`, falling back to matches on its
/// deinflected base forms when there is no exact match.
pub fn lookup_exact_with_deinflection(&self, term: &str) -> Vec<LookupResult> {
self.lookup_exact_with_deinflection_inner(term)
}
/// Collects exact-or-deinflected candidates for `term` and resolves them into results.
fn lookup_exact_with_deinflection_inner(&self, term: &str) -> Vec<LookupResult> {
    let candidates = self.deinflect_candidates(term);
    self.candidates_to_results(candidates)
}
/// Finds candidates for `term`, trying deinflection only when nothing matches exactly.
///
/// If `term` has exact matches they are returned unchanged. Otherwise every
/// form produced by the deinflector is looked up exactly; each entry id is
/// reported once, for the first form that reaches it, as
/// `MatchType::Deinflected` with a score of `0.75`. The recorded rules are the
/// `Debug` renderings of all reasons across that form's reason chains.
pub(crate) fn deinflect_candidates(&self, term: &str) -> Vec<MatchCandidate> {
    let direct = self.exact_candidates(term);
    if !direct.is_empty() {
        return direct;
    }

    let mut emitted_ids = BTreeSet::new();
    let mut found = Vec::new();
    for form in self.deinflector.deinflect(term) {
        for hit in self.exact_candidates(&form.word) {
            // `insert` returns true only the first time an id is seen.
            let first_time = emitted_ids.insert(hit.id);
            if first_time {
                found.push(MatchCandidate {
                    id: hit.id,
                    key: form.word.clone(),
                    match_type: MatchType::Deinflected,
                    score: 0.75,
                    deinflection: Some(DeinflectionInfo {
                        original_form: term.to_owned(),
                        base_form: form.word.clone(),
                        rules: form
                            .reason_chains
                            .iter()
                            .flatten()
                            .map(|rule| format!("{:?}", rule))
                            .collect(),
                    }),
                });
            }
        }
    }

    // Highest score first; the sort is stable, so discovery order breaks ties.
    found.sort_by(|lhs, rhs| rhs.score.partial_cmp(&lhs.score).unwrap());
    found
}
/// Returns the entries that have a kana, kanji, or romaji key starting with `prefix`.
pub fn lookup_partial(&self, prefix: &str) -> Vec<LookupResult> {
self.lookup_partial_inner(prefix)
}
/// Collects prefix-match candidates for `prefix` and resolves them into results.
fn lookup_partial_inner(&self, prefix: &str) -> Vec<LookupResult> {
    let candidates = self.prefix_candidates(prefix);
    self.candidates_to_results(candidates)
}
/// Finds every key starting with `prefix` in the kana, kanji, and romaji indexes.
///
/// A key equal to `prefix` is an `Exact` match scored `1.0`; any longer key is
/// a `Prefix` match scored `0.5`. Only the best-scoring candidate per entry id
/// is kept. Results are ordered by descending score, then ascending id.
pub(crate) fn prefix_candidates(&self, prefix: &str) -> Vec<MatchCandidate> {
    let matcher = Str::new(prefix).starts_with();
    let mut best_by_id: HashMap<u64, MatchCandidate> = HashMap::new();

    for index in [&self.kana_fst, &self.kanji_fst, &self.romaji_fst] {
        let mut hits = index.search(&matcher).into_stream();
        while let Some((raw_key, id)) = hits.next() {
            let key = String::from_utf8_lossy(raw_key).into_owned();
            let (match_type, score) = match key.as_str() == prefix {
                true => (MatchType::Exact, 1.0),
                false => (MatchType::Prefix, 0.5),
            };
            let candidate = MatchCandidate {
                id,
                key,
                match_type,
                score,
                deinflection: None,
            };
            upsert_better(&mut best_by_id, candidate);
        }
    }

    // Map iteration order is arbitrary; the (score, id) sort makes the output deterministic.
    let mut ranked: Vec<MatchCandidate> = best_by_id.into_values().collect();
    ranked.sort_by(|lhs, rhs| {
        let by_score = rhs.score.partial_cmp(&lhs.score).unwrap();
        by_score.then_with(|| lhs.id.cmp(&rhs.id))
    });
    ranked
}
/// Finds keys within Levenshtein distance `max_distance` of `term` in the
/// kana, kanji, and romaji indexes.
///
/// A key equal to `term` is an `Exact` match scored `1.0`. Any other key is a
/// `Fuzzy` match scored `0.5` minus a penalty proportional to the relative
/// difference in character count, with a floor of `0.1`. Only the
/// best-scoring candidate per entry id is kept. Results are ordered by
/// descending score, then ascending id.
///
/// # Errors
///
/// Returns `JmdictError::InvalidQuery` when the Levenshtein automaton cannot
/// be built for the given term and distance.
pub(crate) fn fuzzy_candidates(
    &self,
    term: &str,
    max_distance: u32,
) -> Result<Vec<MatchCandidate>, JmdictError> {
    let matcher = match Levenshtein::new(term, max_distance) {
        Ok(built) => built,
        Err(_) => return Err(JmdictError::InvalidQuery),
    };
    // The query's length does not change between hits, so compute it once.
    let term_len = term.chars().count().max(1) as f64;

    let mut best_by_id: HashMap<u64, MatchCandidate> = HashMap::new();
    for index in [&self.kana_fst, &self.kanji_fst, &self.romaji_fst] {
        let mut hits = index.search(&matcher).into_stream();
        while let Some((raw_key, id)) = hits.next() {
            let key = String::from_utf8_lossy(raw_key).into_owned();
            let (match_type, score) = if key.as_str() == term {
                (MatchType::Exact, 1.0)
            } else {
                let key_len = key.chars().count().max(1) as f64;
                let len_diff = (key_len - term_len).abs();
                let penalised = 0.5 - (len_diff / (key_len + term_len)) * 0.2;
                (MatchType::Fuzzy, penalised.max(0.1))
            };
            let candidate = MatchCandidate {
                id,
                key,
                match_type,
                score,
                deinflection: None,
            };
            upsert_better(&mut best_by_id, candidate);
        }
    }

    // Map iteration order is arbitrary; the (score, id) sort makes the output deterministic.
    let mut ranked: Vec<MatchCandidate> = best_by_id.into_values().collect();
    ranked.sort_by(|lhs, rhs| {
        let by_score = rhs.score.partial_cmp(&lhs.score).unwrap();
        by_score.then_with(|| lhs.id.cmp(&rhs.id))
    });
    Ok(ranked)
}
/// Creates a [`QueryBuilder`] bound to this dictionary and `term`.
pub fn lookup(&self, term: &str) -> QueryBuilder<'_> {
QueryBuilder::new(self, term)
}
/// Creates a [`BatchQueryBuilder`] bound to this dictionary and an owned copy
/// of each term in `terms`.
pub fn lookup_batch(&self, terms: &[&str]) -> BatchQueryBuilder<'_> {
    BatchQueryBuilder::new(self, terms.iter().copied().map(String::from).collect())
}
pub fn lookup_gloss(&self, query: &str) -> Vec<LookupResult> {
let mut tokens: Vec<String> = query
.split(|c: char| !c.is_ascii_alphanumeric())
.filter(|s| !s.is_empty())
.map(|s| s.to_ascii_lowercase())
.collect();
tokens.sort();
tokens.dedup();
if tokens.is_empty() {
return Vec::new();
}
let mut posting_lists: Vec<&[u8]> = Vec::with_capacity(tokens.len());
for tok in &tokens {
match self.gloss_postings_for(tok) {
Some(bytes) => posting_lists.push(bytes),
None => return Vec::new(),
}
}
posting_lists.sort_by_key(|p| p.len());
let smallest = posting_lists[0];
let rest = &posting_lists[1..];
let intersected: Vec<u64> = smallest
.chunks_exact(8)
.map(|c| u64::from_le_bytes(c.try_into().unwrap()))
.filter(|id| rest.iter().all(|other| postings_contains(other, *id)))
.collect();
let total_entries: usize = posting_lists
.iter()
.map(|p| p.len() / 8)
.sum::<usize>()
.max(1);
let score = 0.6f64.min(0.3 + (tokens.len() as f64) / (total_entries as f64));
let key = tokens.join(" ");
intersected
.into_iter()
.filter_map(|id| {
self.load_entry(id).map(|entry| LookupResult {
entry,
match_type: MatchType::Gloss,
match_key: key.clone(),
score,
deinflection: None,
})
})
.collect()
}
/// Returns the raw posting list for `token`, or `None` if the token is not
/// indexed or its record does not fit inside the postings blob.
///
/// The gloss FST maps the token to a byte offset in `gloss_postings`. At that
/// offset sits a little-endian `u32` count followed by `count` little-endian
/// `u64` entry ids; the returned slice covers only the ids.
///
/// All offset arithmetic is checked so that a corrupted index or postings
/// file yields `None` rather than an overflow panic or a wrapped range.
fn gloss_postings_for(&self, token: &str) -> Option<&[u8]> {
    let offset = usize::try_from(self.gloss_fst.get(token)?).ok()?;
    let postings = self.gloss_postings.as_ref();

    let ids_start = offset.checked_add(4)?;
    let count_bytes = postings.get(offset..ids_start)?;
    let count = usize::try_from(u32::from_le_bytes(count_bytes.try_into().ok()?)).ok()?;

    let ids_end = count
        .checked_mul(8)
        .and_then(|ids_len| ids_start.checked_add(ids_len))?;
    postings.get(ids_start..ids_end)
}
/// Resolves a cross-reference to the entries it points at.
///
/// Looks up `xref.term` exactly; when the reference also names a reading,
/// only entries having a kana form with that exact text are kept.
pub fn resolve_xref(&self, xref: &Xref) -> Vec<LookupResult> {
    let matches = self.lookup_exact(&xref.term);
    match xref.reading.as_deref() {
        None => matches,
        Some(reading) => matches
            .into_iter()
            .filter(|hit| hit.entry.kana.iter().any(|kana| kana.text == reading))
            .collect(),
    }
}
/// Looks up a single entry by its JMdict id string.
///
/// Returns `None` when the id is not in the id index or the entry cannot be
/// loaded. A hit is reported as `MatchType::Exact` with a score of `1.0` and
/// the id itself as the match key.
pub fn lookup_by_id(&self, jmdict_id: &str) -> Option<LookupResult> {
    let seq_id = self.id_fst.get(jmdict_id)?;
    self.load_entry(seq_id).map(|entry| LookupResult {
        entry,
        match_type: MatchType::Exact,
        match_key: jmdict_id.to_owned(),
        score: 1.0,
        deinflection: None,
    })
}
/// Loads the entry at sequential index `seq_id`.
///
/// Returns `None` when `seq_id` is not below the entry count or the stored
/// bytes cannot be deserialized.
pub fn get(&self, seq_id: u64) -> Option<Entry> {
self.load_entry(seq_id)
}
/// Returns an iterator over all entries, in index order from `0` up to the
/// entry count.
pub fn iter_entries(&self) -> EntryIter<'_> {
    let end = u64::from(self.entry_count);
    EntryIter {
        dict: self,
        next: 0,
        end,
    }
}
fn candidates_to_results(&self, candidates: Vec<MatchCandidate>) -> Vec<LookupResult> {
candidates
.into_iter()
.filter_map(|mc| {
self.load_entry(mc.id).map(|entry| LookupResult {
entry,
match_type: mc.match_type,
match_key: mc.key,
score: mc.score,
deinflection: mc.deinflection,
})
})
.collect()
}
/// Deserializes the entry at index `id` from the entries blob.
///
/// Blob layout after the header: a `u32` entry count at `header_size`, then
/// one `(u32 offset, u32 length)` pair per entry (little-endian), then the
/// entry data. Offsets are relative to the start of the entry data.
///
/// Returns `None` when `id` is out of range, when the offset table or the
/// entry bytes fall outside the blob (truncated or corrupted data), or when
/// deserialization fails. Slicing and arithmetic are checked so that bad data
/// cannot cause a panic here.
pub(crate) fn load_entry(&self, id: u64) -> Option<Entry> {
    let count = self.entry_count as usize;
    // `try_from` instead of `as`: on 32-bit targets a large id must not wrap into range.
    let index = usize::try_from(id).ok()?;
    if index >= count {
        return None;
    }

    let blob = self.entries_blob.as_ref();
    // The offset table starts right after the 4-byte entry count.
    let table_start = self.header_size.checked_add(4)?;

    let slot_start = index
        .checked_mul(8)
        .and_then(|rel| table_start.checked_add(rel))?;
    let slot = blob.get(slot_start..slot_start.checked_add(8)?)?;
    let (off_raw, len_raw) = slot.split_at(4);
    let off = usize::try_from(u32::from_le_bytes(off_raw.try_into().ok()?)).ok()?;
    let len = usize::try_from(u32::from_le_bytes(len_raw.try_into().ok()?)).ok()?;

    let data_start = count
        .checked_mul(8)
        .and_then(|table_len| table_start.checked_add(table_len))?;
    let start = data_start.checked_add(off)?;
    let end = start.checked_add(len)?;

    postcard::from_bytes(blob.get(start..end)?).ok()
}
}
/// Iterator over a dictionary's entries in index order.
pub struct EntryIter<'d> {
/// Dictionary the entries are loaded from.
dict: &'d Dict,
/// Index of the next entry to try.
next: u64,
/// One past the last index to try.
end: u64,
}
/// Yields entries in index order, skipping any index whose entry fails to load.
impl<'d> Iterator for EntryIter<'d> {
    type Item = Entry;

    /// Advances to the next loadable entry, or returns `None` once every
    /// index has been tried.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if self.next >= self.end {
                return None;
            }
            let id = self.next;
            self.next += 1;
            if let Some(entry) = self.dict.load_entry(id) {
                return Some(entry);
            }
        }
    }

    /// Lower bound is `0` because unloadable entries are skipped; the upper
    /// bound is the number of indexes not yet tried.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let untried = (self.end - self.next) as usize;
        (0, Some(untried))
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes `ids` as consecutive little-endian `u64`s, the posting-list layout.
    fn pack(ids: &[u64]) -> Vec<u8> {
        let mut v = Vec::with_capacity(ids.len() * 8);
        for id in ids {
            v.extend_from_slice(&id.to_le_bytes());
        }
        v
    }

    /// Builds a well-formed entries header with the given strings and entry count.
    fn header_bytes(jmdict_version: &str, generated_at: &str, entry_count: u32) -> Vec<u8> {
        let mut buf = Vec::new();
        buf.extend_from_slice(MAGIC);
        buf.extend_from_slice(&FORMAT_VERSION.to_le_bytes());
        buf.extend_from_slice(&(jmdict_version.len() as u16).to_le_bytes());
        buf.extend_from_slice(jmdict_version.as_bytes());
        buf.extend_from_slice(&(generated_at.len() as u16).to_le_bytes());
        buf.extend_from_slice(generated_at.as_bytes());
        buf.extend_from_slice(&entry_count.to_le_bytes());
        buf
    }

    #[test]
    fn postings_contains_hits_and_misses() {
        let bytes = pack(&[1, 5, 10, 100, 1_000_000]);
        assert!(postings_contains(&bytes, 1));
        assert!(postings_contains(&bytes, 10));
        assert!(postings_contains(&bytes, 1_000_000));
        assert!(!postings_contains(&bytes, 0));
        assert!(!postings_contains(&bytes, 2));
        assert!(!postings_contains(&bytes, 99));
        assert!(!postings_contains(&bytes, 1_000_001));
    }

    #[test]
    fn postings_contains_empty_slice() {
        assert!(!postings_contains(&[], 0));
        assert!(!postings_contains(&[], 42));
    }

    #[test]
    fn dict_storage_as_ref_owned() {
        let storage = DictStorage::Owned(Arc::new(vec![1, 2, 3]));
        assert_eq!(storage.as_ref(), &[1, 2, 3][..]);
    }

    #[test]
    fn dict_storage_as_ref_static() {
        let storage = DictStorage::Static(b"hello");
        assert_eq!(storage.as_ref(), b"hello");
    }

    #[test]
    fn parse_entries_header_rejects_bad_magic() {
        let bad = b"XXXX\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00";
        assert!(matches!(
            parse_entries_header(bad),
            Err(JmdictError::DataCorrupted)
        ));
    }

    #[test]
    fn parse_entries_header_rejects_short_buffer() {
        assert!(matches!(
            parse_entries_header(&[]),
            Err(JmdictError::DataCorrupted)
        ));
        assert!(matches!(
            parse_entries_header(b"JMD"),
            Err(JmdictError::DataCorrupted)
        ));
    }

    #[test]
    fn parse_entries_header_rejects_version_mismatch() {
        let mut buf = Vec::new();
        buf.extend_from_slice(MAGIC);
        buf.extend_from_slice(&(FORMAT_VERSION + 1).to_le_bytes());
        match parse_entries_header(&buf) {
            Err(JmdictError::DataVersionMismatch { expected, found }) => {
                assert_eq!(expected, FORMAT_VERSION);
                assert_eq!(found, FORMAT_VERSION + 1);
            }
            _ => panic!("expected DataVersionMismatch"),
        }
    }

    /// Happy path: every field is decoded and `header_size` points at the
    /// entry-count field.
    #[test]
    fn parse_entries_header_accepts_well_formed_header() {
        let buf = header_bytes("v1", "now", 7);
        let header = match parse_entries_header(&buf) {
            Ok(header) => header,
            Err(_) => panic!("well-formed header must parse"),
        };
        assert_eq!(header.data_version.format_version, FORMAT_VERSION);
        assert_eq!(header.data_version.jmdict_version, "v1");
        assert_eq!(header.data_version.generated_at, "now");
        // magic(4) + version(4) + len(2) + "v1"(2) + len(2) + "now"(3)
        assert_eq!(header.header_size, 17);
        assert_eq!(header.entry_count, 7);
    }

    /// Cutting a valid header short at any point must be reported as
    /// corruption, never a panic.
    #[test]
    fn parse_entries_header_rejects_every_truncation() {
        let buf = header_bytes("v1", "now", 7);
        for cut in 0..buf.len() {
            assert!(
                matches!(
                    parse_entries_header(&buf[..cut]),
                    Err(JmdictError::DataCorrupted)
                ),
                "prefix of {} bytes should be rejected",
                cut
            );
        }
    }

    #[test]
    #[cfg(feature = "embedded")]
    fn load_dict_embedded() {
        let dict = Dict::load_embedded().expect("load failed");
        assert!(dict.kana_fst.contains_key("ねこ"));
        assert!(dict.kanji_fst.contains_key("猫"));
        assert!(dict.romaji_fst.contains_key("neko"));
        assert!(dict.kana_fst.contains_key("たべる"));
        assert!(dict.kanji_fst.contains_key("食べる"));
        assert!(dict.kana_fst.contains_key("にゃんこ"));
        assert!(dict.kanji_fst.contains_key("鯉"));
    }
}