jmdict-fast 0.1.3

use crate::error::JmdictError;
use crate::model::{
    DataVersion, DeinflectionInfo, Entry, LookupResult, MatchType, Xref, FORMAT_VERSION, MAGIC,
};
use crate::query::{BatchQueryBuilder, QueryBuilder};
use fst::{automaton::Levenshtein, automaton::Str, Automaton, IntoStreamer, Map, Streamer};
use memmap2::Mmap;
use std::collections::{BTreeSet, HashMap};
use std::sync::Arc;
use std::{fs::File, path::Path};

/// Backing storage for [`Dict`] data. Holds either a memory-mapped file,
/// a `'static` byte slice (used by the embedded feature), or an owned
/// allocation. The enum itself implements `AsRef<[u8]>`, so `fst::Map` can
/// wrap any variant uniformly.
///
/// Cloning never copies the underlying bytes: the `Mmap` and `Owned` variants
/// clone an `Arc`, and `Static` copies a reference.
#[derive(Clone)]
pub enum DictStorage {
    /// Memory-mapped file; the `Arc` keeps the mapping alive while any
    /// reference into it exists.
    Mmap(Arc<Mmap>),
    /// `'static` slice — typically from `include_bytes!`.
    Static(&'static [u8]),
    /// Owned buffer on the heap, shared through an `Arc`.
    Owned(Arc<Vec<u8>>),
}

impl AsRef<[u8]> for DictStorage {
    fn as_ref(&self) -> &[u8] {
        match self {
            DictStorage::Mmap(m) => &m[..],
            DictStorage::Static(s) => s,
            DictStorage::Owned(v) => &v[..],
        }
    }
}

/// A raw match candidate from FST search, before entry deserialization.
#[derive(Clone)]
pub(crate) struct MatchCandidate {
    /// Value stored in the FST for the matched key; passed to
    /// `Dict::load_entry` as the sequential entry index.
    pub(crate) id: u64,
    /// The index key that matched (for deinflected hits, the base form).
    pub(crate) key: String,
    /// How the key matched the query.
    pub(crate) match_type: MatchType,
    /// Ranking score, higher is better. In this file: `1.0` exact,
    /// `0.75` deinflected, `0.5` prefix, at most `0.5` fuzzy.
    pub(crate) score: f64,
    /// Populated only for candidates produced by deinflection.
    pub(crate) deinflection: Option<DeinflectionInfo>,
}

/// Record `cand` in `best` (keyed by entry id) unless the candidate already
/// stored for that id scores at least as high.
///
/// An absent id is always inserted; an existing one is replaced only by a
/// strictly higher score. Prefix and fuzzy collection rely on this so that a
/// better hit found later (e.g. an `Exact` match in romaji) supersedes a
/// weaker earlier hit (e.g. a `Fuzzy` match in kana) for the same entry.
fn upsert_better(best: &mut HashMap<u64, MatchCandidate>, cand: MatchCandidate) {
    let incumbent_holds = matches!(
        best.get(&cand.id),
        Some(held) if held.score >= cand.score
    );
    if !incumbent_holds {
        best.insert(cand.id, cand);
    }
}

/// An opened dictionary: the serialized entries plus the FST indexes used to
/// find them.
pub struct Dict {
    /// Raw `entries.bin` bytes: header, `u32` entry count, offset table, then
    /// the serialized entries.
    pub entries_blob: DictStorage,
    /// Kana key → sequential entry id.
    pub kana_fst: Map<DictStorage>,
    /// Kanji key → sequential entry id.
    pub kanji_fst: Map<DictStorage>,
    /// Romaji key → sequential entry id.
    pub romaji_fst: Map<DictStorage>,
    /// JMdict id string → sequential entry id (see `lookup_by_id`).
    pub id_fst: Map<DictStorage>,
    /// Reverse index: English gloss token → offset into `gloss_postings`.
    pub gloss_fst: Map<DictStorage>,
    /// Posting lists keyed by `gloss_fst` offset. At each offset is `u32 count`
    /// followed by `count × u64` entry ids (little-endian).
    pub gloss_postings: DictStorage,
    /// Deinflection engine used by `deinflect_candidates`.
    deinflector: bunpo::deinflector::Deinflector,
    /// Version metadata parsed from the `entries.bin` header.
    data_version: DataVersion,
    /// Number of header bytes preceding the entry count in `entries_blob`.
    header_size: usize,
    /// Entry count read from the `entries.bin` header.
    entry_count: u32,
}

/// Result of parsing the `entries.bin` header (see `parse_entries_header`).
struct HeaderInfo {
    /// Format version plus the two metadata strings stored in the header.
    data_version: DataVersion,
    /// Total bytes before entry_count (magic + version + metadata strings)
    header_size: usize,
    /// The `u32` entry count stored immediately after the header bytes.
    entry_count: u32,
}

/// Decode the preamble of `entries.bin`.
///
/// Layout, all integers little-endian: 4-byte magic, `u32` format version,
/// `u16`-length-prefixed `jmdict_version`, `u16`-length-prefixed
/// `generated_at`, then the `u32` entry count. The two strings are decoded
/// lossily, so invalid UTF-8 is replaced rather than rejected.
///
/// The returned `header_size` is the number of bytes preceding the entry count.
///
/// # Errors
///
/// * [`JmdictError::DataCorrupted`] — bad magic, or the buffer ends before any
///   of the fields above.
/// * [`JmdictError::DataVersionMismatch`] — the stored format version is not
///   `FORMAT_VERSION`.
fn parse_entries_header(data: &[u8]) -> Result<HeaderInfo, JmdictError> {
    /// Read one `u16`-length-prefixed string at `*cursor` and advance the
    /// cursor just past it.
    fn take_prefixed_string(data: &[u8], cursor: &mut usize) -> Result<String, JmdictError> {
        let text_start = *cursor + 2;
        let prefix = data
            .get(*cursor..text_start)
            .and_then(|raw| <[u8; 2]>::try_from(raw).ok())
            .ok_or(JmdictError::DataCorrupted)?;
        let text_end = text_start + usize::from(u16::from_le_bytes(prefix));
        let text = data
            .get(text_start..text_end)
            .ok_or(JmdictError::DataCorrupted)?;
        *cursor = text_end;
        Ok(String::from_utf8_lossy(text).to_string())
    }

    // Magic and format version occupy the first eight bytes.
    if data.len() < 8 || &data[0..4] != MAGIC {
        return Err(JmdictError::DataCorrupted);
    }
    let format_version = u32::from_le_bytes(data[4..8].try_into().unwrap());
    if format_version != FORMAT_VERSION {
        return Err(JmdictError::DataVersionMismatch {
            expected: FORMAT_VERSION,
            found: format_version,
        });
    }

    let mut cursor = 8;
    let jmdict_version = take_prefixed_string(data, &mut cursor)?;
    let generated_at = take_prefixed_string(data, &mut cursor)?;

    // The entry count must be present too, so its absence is reported here.
    let count_bytes = data
        .get(cursor..cursor + 4)
        .and_then(|raw| <[u8; 4]>::try_from(raw).ok())
        .ok_or(JmdictError::DataCorrupted)?;

    Ok(HeaderInfo {
        data_version: DataVersion {
            format_version,
            jmdict_version,
            generated_at,
        },
        header_size: cursor,
        entry_count: u32::from_le_bytes(count_bytes),
    })
}

/// Report whether `id` occurs in a packed posting list.
///
/// `bytes` is read as consecutive 8-byte little-endian `u64`s in ascending
/// order — the layout `xtask` writes for `gloss_postings.bin`. Any trailing
/// bytes that do not form a whole 8-byte slot are ignored. The list is
/// binary-searched in place, so no `Vec<u64>` is built just to test
/// membership.
fn postings_contains(bytes: &[u8], id: u64) -> bool {
    use std::cmp::Ordering;

    // Decode the `slot`-th u64 of the list.
    let value_at = |slot: usize| -> u64 {
        let mut raw = [0u8; 8];
        raw.copy_from_slice(&bytes[slot * 8..slot * 8 + 8]);
        u64::from_le_bytes(raw)
    };

    // Half-open search window `[low, high)` over slot indexes.
    let (mut low, mut high) = (0usize, bytes.len() / 8);
    while low < high {
        let probe = low + (high - low) / 2;
        match value_at(probe).cmp(&id) {
            Ordering::Equal => return true,
            Ordering::Less => low = probe + 1,
            Ordering::Greater => high = probe,
        }
    }
    false
}

/// Open `path` read-only and memory-map it into a [`DictStorage::Mmap`].
///
/// # Errors
///
/// Propagates the failure from opening or mapping the file, converted into
/// [`JmdictError`].
fn mmap_storage(path: &Path) -> Result<DictStorage, JmdictError> {
    let handle = File::open(path)?;
    // SAFETY: the Mmap is kept alive inside an Arc inside DictStorage. We
    // never mutate the underlying file, and the kernel will surface SIGBUS
    // if the file is truncated under us — the same constraint memmap2
    // documents for any user.
    let mapping = unsafe { Mmap::map(&handle) }?;
    Ok(DictStorage::Mmap(Arc::new(mapping)))
}

impl Dict {
    /// Build a `Dict` over seven `'static` byte slices — typically the output
    /// of `include_bytes!` when the `embedded` feature is enabled.
    ///
    /// Each slice is wrapped in [`DictStorage::Static`] and handed to
    /// [`Dict::from_storage`], whose errors are returned unchanged.
    #[allow(clippy::too_many_arguments)]
    pub fn from_slices(
        entries: &'static [u8],
        kana_fst: &'static [u8],
        kanji_fst: &'static [u8],
        romaji_fst: &'static [u8],
        id_fst: &'static [u8],
        gloss_fst: &'static [u8],
        gloss_postings: &'static [u8],
    ) -> Result<Self, JmdictError> {
        // Every input gets the same borrow-only wrapper.
        let wrap = DictStorage::Static;
        Self::from_storage(
            wrap(entries),
            wrap(kana_fst),
            wrap(kanji_fst),
            wrap(romaji_fst),
            wrap(id_fst),
            wrap(gloss_fst),
            wrap(gloss_postings),
        )
    }

    /// Construct a `Dict` from already-loaded [`DictStorage`] values. Use this
    /// to wire up custom storage (e.g. data downloaded into an in-memory
    /// buffer): build `DictStorage::Owned(Arc::new(bytes))` and pass it in.
    #[allow(clippy::too_many_arguments)]
    pub fn from_storage(
        entries: DictStorage,
        kana_fst: DictStorage,
        kanji_fst: DictStorage,
        romaji_fst: DictStorage,
        id_fst: DictStorage,
        gloss_fst: DictStorage,
        gloss_postings: DictStorage,
    ) -> Result<Self, JmdictError> {
        let header = parse_entries_header(entries.as_ref())?;
        Ok(Self {
            entries_blob: entries,
            kana_fst: Map::new(kana_fst)?,
            kanji_fst: Map::new(kanji_fst)?,
            romaji_fst: Map::new(romaji_fst)?,
            id_fst: Map::new(id_fst)?,
            gloss_fst: Map::new(gloss_fst)?,
            gloss_postings,
            deinflector: bunpo::deinflector::Deinflector::new(),
            data_version: header.data_version,
            header_size: header.header_size,
            entry_count: header.entry_count,
        })
    }

    /// Memory-map every data file under `base_dir` and build a `Dict` from
    /// them (zero-copy). The OS pages data in on demand and shares it across
    /// processes that map the same file.
    ///
    /// Expected files: `entries.bin`, `kana.fst`, `kanji.fst`, `romaji.fst`,
    /// `id.fst`, `gloss.fst`, `gloss_postings.bin`.
    ///
    /// # Errors
    ///
    /// Returns the first failure from opening or mapping a file (in the order
    /// listed above), or any error from [`Dict::from_storage`].
    pub fn load<P: AsRef<Path>>(base_dir: P) -> Result<Self, JmdictError> {
        let dir = base_dir.as_ref();
        let map_file = |name: &str| mmap_storage(&dir.join(name));

        let entries = map_file("entries.bin")?;
        let kana = map_file("kana.fst")?;
        let kanji = map_file("kanji.fst")?;
        let romaji = map_file("romaji.fst")?;
        let id = map_file("id.fst")?;
        let gloss = map_file("gloss.fst")?;
        let gloss_postings = map_file("gloss_postings.bin")?;

        Self::from_storage(entries, kana, kanji, romaji, id, gloss, gloss_postings)
    }

    /// Build a `Dict` from the data files compiled into the binary.
    ///
    /// The seven files are pulled in from `OUT_DIR` with `include_bytes!` and
    /// passed to [`Dict::from_slices`]. Only available with the `embedded`
    /// feature.
    #[cfg(feature = "embedded")]
    pub fn load_embedded() -> Result<Self, JmdictError> {
        Self::from_slices(
            include_bytes!(concat!(env!("OUT_DIR"), "/entries.bin")),
            include_bytes!(concat!(env!("OUT_DIR"), "/kana.fst")),
            include_bytes!(concat!(env!("OUT_DIR"), "/kanji.fst")),
            include_bytes!(concat!(env!("OUT_DIR"), "/romaji.fst")),
            include_bytes!(concat!(env!("OUT_DIR"), "/id.fst")),
            include_bytes!(concat!(env!("OUT_DIR"), "/gloss.fst")),
            include_bytes!(concat!(env!("OUT_DIR"), "/gloss_postings.bin")),
        )
    }

    /// Locate and load dictionary data using the default search order:
    ///
    /// 1. With the `embedded` feature: the compiled-in data. A failure here
    ///    is ignored and the search continues.
    /// 2. The directory named by the `JMDICT_DATA` environment variable, if
    ///    it is set and readable as Unicode. The result of loading it is
    ///    returned as-is, success or error.
    /// 3. `./dist`, when it contains `entries.bin`.
    /// 4. In this crate's own test builds only: `$CARGO_MANIFEST_DIR/../dist`,
    ///    when it contains `entries.bin`.
    ///
    /// If nothing matches, `./dist` is loaded anyway so the caller receives
    /// the resulting load error.
    pub fn load_default() -> Result<Self, JmdictError> {
        #[cfg(feature = "embedded")]
        {
            let embedded = Self::load_embedded();
            if embedded.is_ok() {
                return embedded;
            }
        }

        let configured = std::env::var("JMDICT_DATA");
        if let Ok(dir) = configured {
            return Self::load(Path::new(&dir));
        }

        let local = Path::new("dist");
        if !local.join("entries.bin").exists() {
            // Test-only fallback: when running this crate's own tests, cargo sets CWD to
            // jmdict-fast/, but `dist/` lives at the workspace root. CARGO_MANIFEST_DIR is
            // resolved at compile time, so this path is only meaningful for in-repo builds.
            #[cfg(test)]
            {
                let workspace = Path::new(env!("CARGO_MANIFEST_DIR")).join("../dist");
                if workspace.join("entries.bin").exists() {
                    return Self::load(&workspace);
                }
            }
        }

        // Either `dist` has the data, or this call produces the error to report.
        Self::load(local)
    }

    /// Returns the total number of entries in the dictionary.
    ///
    /// This is the count recorded in the `entries.bin` header, widened from
    /// `u32` to `usize`.
    pub fn entry_count(&self) -> usize {
        self.entry_count as usize
    }

    /// Returns data version information (format version, JMdict source version, generation timestamp).
    ///
    /// The value is a clone of the metadata parsed from the `entries.bin`
    /// header when the dictionary was constructed.
    pub fn version(&self) -> DataVersion {
        self.data_version.clone()
    }

    /// Lookup a term exactly across kana, kanji, romaji.
    ///
    /// Convenience method equivalent to `dict.lookup(term).mode(MatchMode::Exact).execute()`.
    ///
    /// Returns an empty `Vec` when no index contains `term`.
    pub fn lookup_exact(&self, term: &str) -> Vec<LookupResult> {
        self.lookup_exact_inner(term)
    }

    /// Collect exact-match candidates for `term` and deserialize their entries.
    fn lookup_exact_inner(&self, term: &str) -> Vec<LookupResult> {
        self.candidates_to_results(self.exact_candidates(term))
    }

    pub(crate) fn exact_candidates(&self, term: &str) -> Vec<MatchCandidate> {
        let mut ids = Vec::new();

        if let Some(id) = self.kana_fst.get(term) {
            ids.push(id);
        }
        if let Some(id) = self.kanji_fst.get(term) {
            ids.push(id);
        }
        if let Some(id) = self.romaji_fst.get(term) {
            ids.push(id);
        }

        ids.sort();
        ids.dedup();

        ids.into_iter()
            .map(|id| MatchCandidate {
                id,
                key: term.to_string(),
                match_type: MatchType::Exact,
                score: 1.0,
                deinflection: None,
            })
            .collect()
    }

    /// Lookup a term with deinflection fallback.
    ///
    /// Convenience method equivalent to `dict.lookup(term).mode(MatchMode::Deinflect).execute()`.
    ///
    /// Deinflection is attempted only when `term` itself has no exact match.
    pub fn lookup_exact_with_deinflection(&self, term: &str) -> Vec<LookupResult> {
        self.lookup_exact_with_deinflection_inner(term)
    }

    /// Collect exact-or-deinflected candidates for `term` and deserialize
    /// their entries.
    fn lookup_exact_with_deinflection_inner(&self, term: &str) -> Vec<LookupResult> {
        self.candidates_to_results(self.deinflect_candidates(term))
    }

    /// Collect candidates for `term`, falling back to deinflection when the
    /// term has no exact match.
    ///
    /// If `term` matches exactly, those candidates are returned untouched.
    /// Otherwise each base form proposed by the deinflector is looked up
    /// exactly. Every entry id is reported once, for the first base form that
    /// reaches it, as a [`MatchType::Deinflected`] candidate with score `0.75`
    /// and a [`DeinflectionInfo`] naming the original form, the base form and
    /// the `Debug`-formatted rules of that base form's reason chains.
    pub(crate) fn deinflect_candidates(&self, term: &str) -> Vec<MatchCandidate> {
        let direct = self.exact_candidates(term);
        if !direct.is_empty() {
            return direct;
        }

        let mut seen = BTreeSet::new();
        let mut found = Vec::new();
        for form in self.deinflector.deinflect(term) {
            for hit in self.exact_candidates(&form.word) {
                // `insert` returns false when an earlier base form already reached this id.
                if seen.insert(hit.id) {
                    let rules = form
                        .reason_chains
                        .iter()
                        .flatten()
                        .map(|r| format!("{:?}", r))
                        .collect();
                    found.push(MatchCandidate {
                        id: hit.id,
                        key: form.word.clone(),
                        match_type: MatchType::Deinflected,
                        score: 0.75,
                        deinflection: Some(DeinflectionInfo {
                            original_form: term.to_string(),
                            base_form: form.word.clone(),
                            rules,
                        }),
                    });
                }
            }
        }

        // Highest score first; the sort is stable, so ties keep discovery order.
        found.sort_by(|lhs, rhs| rhs.score.partial_cmp(&lhs.score).unwrap());
        found
    }

    /// Lookup entries that start with the given term (prefix search).
    ///
    /// Convenience method equivalent to `dict.lookup(term).mode(MatchMode::Prefix).execute()`.
    ///
    /// A key equal to `prefix` is reported as an exact match and ranks ahead
    /// of longer keys.
    pub fn lookup_partial(&self, prefix: &str) -> Vec<LookupResult> {
        self.lookup_partial_inner(prefix)
    }

    /// Collect prefix-match candidates for `prefix` and deserialize their
    /// entries.
    fn lookup_partial_inner(&self, prefix: &str) -> Vec<LookupResult> {
        self.candidates_to_results(self.prefix_candidates(prefix))
    }

    /// Collect candidates whose key starts with `prefix` in the kana, kanji,
    /// or romaji index.
    ///
    /// A key equal to `prefix` is a [`MatchType::Exact`] hit with score `1.0`;
    /// any longer key is a [`MatchType::Prefix`] hit with score `0.5`. Only
    /// the best-scoring hit per entry id is kept. The result is ordered by
    /// score descending, then id ascending.
    pub(crate) fn prefix_candidates(&self, prefix: &str) -> Vec<MatchCandidate> {
        let matcher = Str::new(prefix).starts_with();
        let indexes = [&self.kana_fst, &self.kanji_fst, &self.romaji_fst];

        // Index order alone cannot decide the winner: a weak prefix hit in one
        // index may be followed by an exact hit for the same id in another, so
        // every hit goes through `upsert_better`.
        let mut winners: HashMap<u64, MatchCandidate> = HashMap::new();
        for index in indexes {
            let mut hits = index.search(&matcher).into_stream();
            while let Some((raw_key, id)) = hits.next() {
                let key = String::from_utf8_lossy(raw_key).to_string();
                let (match_type, score) = if key != prefix {
                    (MatchType::Prefix, 0.5)
                } else {
                    (MatchType::Exact, 1.0)
                };
                let hit = MatchCandidate {
                    id,
                    key,
                    match_type,
                    score,
                    deinflection: None,
                };
                upsert_better(&mut winners, hit);
            }
        }

        // `HashMap` iteration order is unspecified, so ties on score are broken
        // by id to keep the output deterministic.
        let mut ranked: Vec<MatchCandidate> = winners.into_values().collect();
        ranked.sort_by(|lhs, rhs| {
            let by_score = rhs.score.partial_cmp(&lhs.score).unwrap();
            by_score.then_with(|| lhs.id.cmp(&rhs.id))
        });
        ranked
    }

    /// Collect candidates whose key lies within `max_distance` edits of `term`
    /// in the kana, kanji, or romaji index.
    ///
    /// A key equal to `term` is a [`MatchType::Exact`] hit with score `1.0`.
    /// Any other key is a [`MatchType::Fuzzy`] hit scored at most `0.5`,
    /// reduced as the key's length in chars diverges from the term's, and
    /// never below `0.1`. Only the best-scoring hit per entry id is kept. The
    /// result is ordered by score descending, then id ascending.
    ///
    /// # Errors
    ///
    /// Returns [`JmdictError::InvalidQuery`] when the Levenshtein automaton
    /// cannot be built for `term` and `max_distance`.
    pub(crate) fn fuzzy_candidates(
        &self,
        term: &str,
        max_distance: u32,
    ) -> Result<Vec<MatchCandidate>, JmdictError> {
        let automaton =
            Levenshtein::new(term, max_distance).map_err(|_| JmdictError::InvalidQuery)?;

        // The query length does not change between hits; count its chars once
        // instead of once per streamed key. Clamped to 1 to keep the score
        // denominator non-zero.
        let term_len = term.chars().count().max(1) as f64;

        let mut best: HashMap<u64, MatchCandidate> = HashMap::new();
        for fst in [&self.kana_fst, &self.kanji_fst, &self.romaji_fst] {
            let mut stream = fst.search(&automaton).into_stream();
            while let Some((key, id)) = stream.next() {
                let key_str = String::from_utf8_lossy(key).to_string();
                let (match_type, score) = if key_str == term {
                    (MatchType::Exact, 1.0)
                } else {
                    let key_len = key_str.chars().count().max(1) as f64;
                    let len_diff = (key_len - term_len).abs();
                    let score = 0.5 - (len_diff / (key_len + term_len)) * 0.2;
                    (MatchType::Fuzzy, score.max(0.1))
                };
                upsert_better(
                    &mut best,
                    MatchCandidate {
                        id,
                        key: key_str,
                        match_type,
                        score,
                        deinflection: None,
                    },
                );
            }
        }

        let mut candidates: Vec<MatchCandidate> = best.into_values().collect();
        // Secondary key on `id` keeps the order deterministic for equal scores —
        // `HashMap` iteration order varies between instances.
        candidates.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .expect("scores are computed from finite, non-zero lengths and are never NaN")
                .then(a.id.cmp(&b.id))
        });
        Ok(candidates)
    }

    /// Create a query builder for the given term.
    ///
    /// The builder borrows this `Dict`; no search happens in this call.
    pub fn lookup(&self, term: &str) -> QueryBuilder<'_> {
        QueryBuilder::new(self, term)
    }

    /// Create a batch query builder for multiple terms.
    ///
    /// Each term is copied into an owned `String` before being handed to the
    /// builder, so `terms` need not outlive it.
    pub fn lookup_batch(&self, terms: &[&str]) -> BatchQueryBuilder<'_> {
        BatchQueryBuilder::new(self, terms.iter().map(|s| s.to_string()).collect())
    }

    /// Reverse lookup: find entries whose English glosses contain every token
    /// in `query` (ANDed). Tokenization at query time mirrors the build-time
    /// tokenizer: ASCII alphanumeric, lowercased, non-ASCII treated as a
    /// separator.
    ///
    /// Results are ranked: rarer tokens (shorter posting lists) raise the
    /// score; the rank cap is 0.6 so kanji/kana exact lookups still win in
    /// mixed pipelines. Returns an empty `Vec` if the query has no usable
    /// tokens or any token is absent.
    pub fn lookup_gloss(&self, query: &str) -> Vec<LookupResult> {
        // Mirror the build-time tokenizer in xtask: ASCII alphanumeric,
        // lowercased, sorted and deduped. Sorting+dedup makes "cat cat" and
        // "cat box" / "box cat" canonical so the match_key is stable and we
        // don't read the same posting list twice.
        let mut tokens: Vec<String> = query
            .split(|c: char| !c.is_ascii_alphanumeric())
            .filter(|s| !s.is_empty())
            .map(|s| s.to_ascii_lowercase())
            .collect();
        tokens.sort();
        tokens.dedup();
        if tokens.is_empty() {
            return Vec::new();
        }

        // For each token, take a *borrowed slice* into the mmap'd postings
        // file. No allocation — the slice is already the array of u64 ids
        // (sorted, deduped, little-endian) at the right offset.
        let mut posting_lists: Vec<&[u8]> = Vec::with_capacity(tokens.len());
        for tok in &tokens {
            match self.gloss_postings_for(tok) {
                Some(bytes) => posting_lists.push(bytes),
                None => return Vec::new(),
            }
        }

        // Intersect smallest-first to minimize work. Only the smallest list
        // is iterated; the rest are binary-searched in place.
        posting_lists.sort_by_key(|p| p.len());
        let smallest = posting_lists[0];
        let rest = &posting_lists[1..];

        let intersected: Vec<u64> = smallest
            .chunks_exact(8)
            .map(|c| u64::from_le_bytes(c.try_into().unwrap()))
            .filter(|id| rest.iter().all(|other| postings_contains(other, *id)))
            .collect();

        // Score: 0.6 ceiling; rarer tokens nudge the score up via 1 /
        // posting-list-size (in entry counts, not bytes).
        let total_entries: usize = posting_lists
            .iter()
            .map(|p| p.len() / 8)
            .sum::<usize>()
            .max(1);
        let score = 0.6f64.min(0.3 + (tokens.len() as f64) / (total_entries as f64));

        let key = tokens.join(" ");
        intersected
            .into_iter()
            .filter_map(|id| {
                self.load_entry(id).map(|entry| LookupResult {
                    entry,
                    match_type: MatchType::Gloss,
                    match_key: key.clone(),
                    score,
                    deinflection: None,
                })
            })
            .collect()
    }

    /// Borrow the posting bytes for a single gloss token.
    ///
    /// `gloss_fst` maps the token to a byte offset into `gloss_postings`. At
    /// that offset sits a little-endian `u32` count followed by
    /// `count × 8` bytes of little-endian `u64` entry ids, already sorted and
    /// deduplicated by `xtask`. The returned slice covers only the ids.
    ///
    /// Returns `None` if the token is absent, or if the offset/count point
    /// outside the postings data (truncated or corrupt file). All offset
    /// arithmetic is checked, so a bogus offset yields `None` rather than an
    /// overflow panic or a wrapped index.
    fn gloss_postings_for(&self, token: &str) -> Option<&[u8]> {
        let offset = usize::try_from(self.gloss_fst.get(token)?).ok()?;
        let postings = self.gloss_postings.as_ref();

        let ids_start = offset.checked_add(4)?;
        let count_bytes: [u8; 4] = postings.get(offset..ids_start)?.try_into().ok()?;
        let count = usize::try_from(u32::from_le_bytes(count_bytes)).ok()?;

        let ids_end = count.checked_mul(8)?.checked_add(ids_start)?;
        postings.get(ids_start..ids_end)
    }

    /// Resolve a cross-reference ([`Xref`]) to dictionary entries.
    ///
    /// `xref.term` is looked up exactly via [`Dict::lookup_exact`]. When
    /// `xref.reading` is present, only entries having a kana form equal to
    /// that reading are kept — this disambiguates homographs like
    /// 生 (なま / せい).
    ///
    /// `xref.sense_index` is not consulted here: whole entries are returned,
    /// and picking the sense is left to the caller.
    pub fn resolve_xref(&self, xref: &Xref) -> Vec<LookupResult> {
        let hits = self.lookup_exact(&xref.term);
        match xref.reading.as_deref() {
            None => hits,
            Some(reading) => hits
                .into_iter()
                .filter(|hit| hit.entry.kana.iter().any(|k| k.text == reading))
                .collect(),
        }
    }

    /// Fetch the entry whose JMdict ID (the string `entry.id`, e.g.
    /// `"1467640"`) equals `jmdict_id`.
    ///
    /// The result is an exact match with score `1.0` and `jmdict_id` as its
    /// match key. Returns `None` if the ID is not in the index or the entry
    /// cannot be loaded.
    pub fn lookup_by_id(&self, jmdict_id: &str) -> Option<LookupResult> {
        self.id_fst
            .get(jmdict_id)
            .and_then(|seq_id| self.load_entry(seq_id))
            .map(|entry| LookupResult {
                entry,
                match_type: MatchType::Exact,
                match_key: jmdict_id.to_string(),
                score: 1.0,
                deinflection: None,
            })
    }

    /// Fetch an entry by its sequential (internal) index in `0..entry_count()`.
    ///
    /// Sequential IDs are stable for a given `entries.bin` but may change when
    /// the data is regenerated; use [`Dict::lookup_by_id`] for stable lookups
    /// across regenerations.
    ///
    /// Returns `None` when `seq_id` is out of range or the entry cannot be
    /// deserialized.
    pub fn get(&self, seq_id: u64) -> Option<Entry> {
        self.load_entry(seq_id)
    }

    /// Iterate over every entry in the dictionary, in sequential-ID order.
    ///
    /// Entries are deserialized lazily as the iterator advances.
    pub fn iter_entries(&self) -> EntryIter<'_> {
        EntryIter {
            dict: self,
            // Start at the first sequential id and stop before `entry_count`.
            next: 0,
            end: self.entry_count as u64,
        }
    }

    /// Turn match candidates into [`LookupResult`]s by loading each
    /// candidate's entry.
    ///
    /// Input order is preserved. A candidate whose entry cannot be loaded is
    /// dropped without an error.
    fn candidates_to_results(&self, candidates: Vec<MatchCandidate>) -> Vec<LookupResult> {
        let mut results = Vec::new();
        for candidate in candidates {
            if let Some(entry) = self.load_entry(candidate.id) {
                results.push(LookupResult {
                    entry,
                    match_type: candidate.match_type,
                    match_key: candidate.key,
                    score: candidate.score,
                    deinflection: candidate.deinflection,
                });
            }
        }
        results
    }

    /// Deserialize the entry with sequential index `id` from `entries_blob`.
    ///
    /// Layout after the `header_size` header bytes: a `u32` entry count, then
    /// one 8-byte record per entry (`u32` offset, `u32` length, both
    /// little-endian), then the data region those offsets are relative to.
    ///
    /// Returns `None` when `id` is not below the entry count, when the offset
    /// record or the entry bytes fall outside the blob (truncated or corrupt
    /// data), or when `postcard` cannot decode the bytes. Every read is
    /// bounds-checked and every offset computation is overflow-checked, so
    /// malformed data never panics here.
    pub(crate) fn load_entry(&self, id: u64) -> Option<Entry> {
        let count = self.entry_count as usize;
        // `try_from` rather than `as`: a u64 id must not wrap into range on
        // targets where `usize` is narrower than 64 bits.
        let index = usize::try_from(id).ok().filter(|&i| i < count)?;

        // The offset table starts after the header plus the 4-byte entry count.
        let table_start = self.header_size.checked_add(4)?;
        let record_start = index.checked_mul(8)?.checked_add(table_start)?;
        let record_end = record_start.checked_add(8)?;

        let blob = self.entries_blob.as_ref();
        let record = blob.get(record_start..record_end)?;
        let off = u32::from_le_bytes(record.get(..4)?.try_into().ok()?) as usize;
        let len = u32::from_le_bytes(record.get(4..)?.try_into().ok()?) as usize;

        // Entry bytes live after the whole offset table.
        let data_start = count.checked_mul(8)?.checked_add(table_start)?;
        let start = data_start.checked_add(off)?;
        let end = start.checked_add(len)?;

        postcard::from_bytes(blob.get(start..end)?).ok()
    }
}

/// Iterator over every [`Entry`] in a [`Dict`], in sequential-ID order.
pub struct EntryIter<'d> {
    /// Dictionary the entries are loaded from.
    dict: &'d Dict,
    /// Sequential id to try next.
    next: u64,
    /// Exclusive upper bound on sequential ids.
    end: u64,
}

impl<'d> Iterator for EntryIter<'d> {
    type Item = Entry;

    /// Yield the next entry that loads successfully, skipping ids whose entry
    /// cannot be loaded. Returns `None` once every id has been tried.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if self.next >= self.end {
                return None;
            }
            let current = self.next;
            self.next += 1;
            if let Some(entry) = self.dict.load_entry(current) {
                return Some(entry);
            }
        }
    }

    /// The lower bound is 0 because any remaining id may fail to load; the
    /// upper bound is the number of ids not yet tried.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let untried = (self.end - self.next) as usize;
        (0, Some(untried))
    }
}

/// Unit tests for the storage wrapper, header parser and posting-list search.
/// Only `load_dict_embedded` needs real dictionary data.
#[cfg(test)]
mod tests {
    use super::*;

    /// Pack ids as consecutive little-endian `u64`s — the layout
    /// `postings_contains` searches.
    fn pack(ids: &[u64]) -> Vec<u8> {
        let mut v = Vec::with_capacity(ids.len() * 8);
        for id in ids {
            v.extend_from_slice(&id.to_le_bytes());
        }
        v
    }

    #[test]
    fn postings_contains_hits_and_misses() {
        let bytes = pack(&[1, 5, 10, 100, 1_000_000]);
        // Present: first, middle and last element.
        assert!(postings_contains(&bytes, 1));
        assert!(postings_contains(&bytes, 10));
        assert!(postings_contains(&bytes, 1_000_000));
        // Absent: below the range, between elements, above the range.
        assert!(!postings_contains(&bytes, 0));
        assert!(!postings_contains(&bytes, 2));
        assert!(!postings_contains(&bytes, 99));
        assert!(!postings_contains(&bytes, 1_000_001));
    }

    #[test]
    fn postings_contains_empty_slice() {
        assert!(!postings_contains(&[], 0));
        assert!(!postings_contains(&[], 42));
    }

    #[test]
    fn dict_storage_as_ref_owned() {
        let storage = DictStorage::Owned(Arc::new(vec![1, 2, 3]));
        assert_eq!(storage.as_ref(), &[1, 2, 3][..]);
    }

    #[test]
    fn dict_storage_as_ref_static() {
        let storage = DictStorage::Static(b"hello");
        assert_eq!(storage.as_ref(), b"hello");
    }

    #[test]
    fn parse_entries_header_rejects_bad_magic() {
        // Long enough to pass the length check, so the magic comparison is
        // what rejects it.
        let bad = b"XXXX\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00";
        assert!(matches!(
            parse_entries_header(bad),
            Err(JmdictError::DataCorrupted)
        ));
    }

    #[test]
    fn parse_entries_header_rejects_short_buffer() {
        assert!(matches!(
            parse_entries_header(&[]),
            Err(JmdictError::DataCorrupted)
        ));
        assert!(matches!(
            parse_entries_header(b"JMD"),
            Err(JmdictError::DataCorrupted)
        ));
    }

    #[test]
    fn parse_entries_header_rejects_version_mismatch() {
        // Valid magic followed by a version one higher than supported.
        let mut buf = Vec::new();
        buf.extend_from_slice(MAGIC);
        buf.extend_from_slice(&(FORMAT_VERSION + 1).to_le_bytes());
        match parse_entries_header(&buf) {
            Err(JmdictError::DataVersionMismatch { expected, found }) => {
                assert_eq!(expected, FORMAT_VERSION);
                assert_eq!(found, FORMAT_VERSION + 1);
            }
            _ => panic!("expected DataVersionMismatch"),
        }
    }

    /// Smoke test against the embedded data: a few well-known keys must be
    /// present in each index.
    #[test]
    #[cfg(feature = "embedded")]
    fn load_dict_embedded() {
        let dict = Dict::load_embedded().expect("load failed");
        assert!(dict.kana_fst.contains_key("ねこ"));
        assert!(dict.kanji_fst.contains_key("猫"));
        assert!(dict.romaji_fst.contains_key("neko"));

        assert!(dict.kana_fst.contains_key("たべる"));
        assert!(dict.kanji_fst.contains_key("食べる"));

        // uncommon kana
        assert!(dict.kana_fst.contains_key("にゃんこ"));
        // uncommon kanji
        assert!(dict.kanji_fst.contains_key("鯉"));
    }
}