wordnet_db/
lib.rs

//! Load WordNet dictionaries with full fidelity and zero-copy text.
//!
//! This crate ingests the canonical `data.*`/`index.*` files, preserves every
//! field (`lex_id`, `ss_type`, pointer source/target indices, verb frames),
//! and exposes borrowed `&str` slices for all text. Callers choose between
//! memory-mapped files or owned buffers at runtime via [`LoadMode`].
//!
//! Public access is intentionally read-only (no `pub` fields), leaving room to
//! evolve internal storage while keeping a stable API surface.
//!
//! # Features
//! - Zero-copy text: lemmas, pointer symbols, glosses, and indices borrow from
//!   the original bytes.
//! - Full-fidelity parsing: retains raw offsets, satellite adjectives, frames,
//!   and pointer source/target indices.
//! - Runtime backing choice: switch between mmap and owned buffers with
//!   [`LoadMode::Mmap`] / [`LoadMode::Owned`].
//! - Convenience lookups: lemma existence, index entries, synset fetching,
//!   and a streaming iterator over all synsets.
//!
//! # Example
//! ```no_run
//! use wordnet_db::{LoadMode, WordNet};
//! use wordnet_types::Pos;
//!
//! # fn main() -> anyhow::Result<()> {
//! let wn = WordNet::load_with_mode("/path/to/wordnet", LoadMode::Mmap)?;
//! let dog_index = wn.index_entry(Pos::Noun, "dog").expect("dog in index");
//! println!("dog synsets: {:?}", dog_index.synset_offsets);
//!
//! for sid in wn.synsets_for_lemma(Pos::Noun, "dog") {
//!     let syn = wn.get_synset(*sid).unwrap();
//!     println!("{}: {}", syn.id.offset, syn.gloss.definition);
//! }
//! # Ok(()) }
//! ```
//!
//! For a runnable demo, see `cargo run -p wordnet-db --example stats -- <dict>`.

use std::collections::HashMap;
use std::fs::File;
use std::io::Read;
use std::path::{Path, PathBuf};

use anyhow::{Context, Result};
use memmap2::Mmap;
use wordnet_types::{
    Frame, Gloss, IndexEntry, Lemma, Pointer, Pos, Synset, SynsetId, SynsetType, decode_st,
};

/// Strategy for loading dictionary files.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum LoadMode {
    /// Memory-map each WordNet file (fast, zero-copy).
    Mmap,
    /// Read each file into an owned buffer (portable fallback).
    Owned,
}

enum Buffer {
    Mmap(Mmap),
    Owned(Vec<u8>),
}

impl Buffer {
    fn as_slice(&self) -> &[u8] {
        match self {
            Buffer::Mmap(m) => m.as_ref(),
            Buffer::Owned(v) => v.as_slice(),
        }
    }
}

/// Identifies which loaded file a `TextRef` points into.
#[derive(Clone, Copy, Debug)]
enum FileKind {
    DataNoun,
    DataVerb,
    DataAdj,
    DataAdv,
    IndexNoun,
    IndexVerb,
    IndexAdj,
    IndexAdv,
    Frames,
    Cntlist,
}

/// Byte range into one of the loaded files; resolved to `&str` on demand.
#[derive(Clone, Copy)]
struct TextRef {
    file: FileKind,
    start: usize,
    len: usize,
}

struct DictFiles {
    data_noun: Buffer,
    data_verb: Buffer,
    data_adj: Buffer,
    data_adv: Buffer,
    index_noun: Buffer,
    index_verb: Buffer,
    index_adj: Buffer,
    index_adv: Buffer,
    frames: Option<Buffer>,
    cntlist: Option<Buffer>,
}

impl DictFiles {
    fn load(dict_dir: &Path, mode: LoadMode) -> Result<Self> {
        let data_noun = load_file(dict_dir.join("data.noun"), mode)?;
        let data_verb = load_file(dict_dir.join("data.verb"), mode)?;
        let data_adj = load_file(dict_dir.join("data.adj"), mode)?;
        let data_adv = load_file(dict_dir.join("data.adv"), mode)?;
        let index_noun = load_file(dict_dir.join("index.noun"), mode)?;
        let index_verb = load_file(dict_dir.join("index.verb"), mode)?;
        let index_adj = load_file(dict_dir.join("index.adj"), mode)?;
        let index_adv = load_file(dict_dir.join("index.adv"), mode)?;
        let frames = load_optional_file(dict_dir.join("frames.vrb"), mode)?;
        let cntlist = load_optional_file(dict_dir.join("cntlist.rev"), mode)?;

        Ok(Self {
            data_noun,
            data_verb,
            data_adj,
            data_adv,
            index_noun,
            index_verb,
            index_adj,
            index_adv,
            frames,
            cntlist,
        })
    }

    fn bytes(&self, file: FileKind) -> &[u8] {
        match file {
            FileKind::DataNoun => self.data_noun.as_slice(),
            FileKind::DataVerb => self.data_verb.as_slice(),
            FileKind::DataAdj => self.data_adj.as_slice(),
            FileKind::DataAdv => self.data_adv.as_slice(),
            FileKind::IndexNoun => self.index_noun.as_slice(),
            FileKind::IndexVerb => self.index_verb.as_slice(),
            FileKind::IndexAdj => self.index_adj.as_slice(),
            FileKind::IndexAdv => self.index_adv.as_slice(),
            FileKind::Frames => self.frames.as_ref().map(Buffer::as_slice).unwrap_or(&[]),
            FileKind::Cntlist => self.cntlist.as_ref().map(Buffer::as_slice).unwrap_or(&[]),
        }
    }

    fn text(&self, r: TextRef) -> &str {
        let bytes = self.bytes(r.file);
        let slice = &bytes[r.start..r.start + r.len];
        std::str::from_utf8(slice).expect("wordnet text is valid utf8")
    }
}

struct LemmaData {
    text: TextRef,
    lex_id: u8,
}

struct PointerData {
    symbol: TextRef,
    target: SynsetId,
    src_word: Option<u16>,
    dst_word: Option<u16>,
}

struct GlossData {
    raw: TextRef,
    definition: TextRef,
    examples: Vec<TextRef>,
}

struct SynsetData {
    id: SynsetId,
    lex_filenum: u8,
    synset_type: SynsetType,
    words: Vec<LemmaData>,
    pointers: Vec<PointerData>,
    frames: Vec<Frame>,
    gloss: GlossData,
}

struct IndexEntryData {
    lemma: TextRef,
    synset_cnt: u32,
    p_cnt: u32,
    ptr_symbols: Vec<TextRef>,
    sense_cnt: u32,
    tagsense_cnt: u32,
    synset_offsets: Vec<u32>,
}

/// In-memory view of a WordNet dictionary backed by mmap or owned buffers.
pub struct WordNet {
    files: DictFiles,
    index: HashMap<(Pos, String), IndexEntryData>,
    synsets: HashMap<SynsetId, SynsetData>,
    lemma_to_synsets: HashMap<(Pos, String), Vec<SynsetId>>,
    verb_frames_text: HashMap<u16, TextRef>,
    sense_counts: HashMap<(String, Pos, u32), u32>,
}

impl WordNet {
    /// Load WordNet from a directory containing `data.*` and `index.*` files.
    ///
    /// Defaults to memory-mapping the source files. Use
    /// [`Self::load_with_mode`] to force owned buffers instead.
    pub fn load(dict_dir: impl AsRef<Path>) -> Result<Self> {
        Self::load_with_mode(dict_dir, LoadMode::Mmap)
    }

    /// Load WordNet choosing between mmap and owned buffers at runtime.
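    ///
    /// # Example
    ///
    /// A minimal sketch, assuming a dictionary directory at the hypothetical
    /// path `/path/to/wordnet`:
    ///
    /// ```no_run
    /// use wordnet_db::{LoadMode, WordNet};
    ///
    /// # fn main() -> anyhow::Result<()> {
    /// // Owned buffers skip mmap entirely, e.g. for filesystems where
    /// // mapping is unreliable.
    /// let wn = WordNet::load_with_mode("/path/to/wordnet", LoadMode::Owned)?;
    /// println!("{} synsets loaded", wn.synset_count());
    /// # Ok(()) }
    /// ```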
    pub fn load_with_mode(dict_dir: impl AsRef<Path>, mode: LoadMode) -> Result<Self> {
        let dir = dict_dir.as_ref();
        let required = [
            "data.noun",
            "data.verb",
            "data.adj",
            "data.adv",
            "index.noun",
            "index.verb",
            "index.adj",
            "index.adv",
        ];
        for name in &required {
            let path = dir.join(name);
            if !path.exists() {
                anyhow::bail!("missing required WordNet file: {}", path.display());
            }
        }

        let files = DictFiles::load(dir, mode)?;

        let mut index = HashMap::new();
        let mut lemma_to_synsets = HashMap::new();
        for (kind, pos) in [
            (FileKind::IndexNoun, Pos::Noun),
            (FileKind::IndexVerb, Pos::Verb),
            (FileKind::IndexAdj, Pos::Adj),
            (FileKind::IndexAdv, Pos::Adv),
        ] {
            parse_index(files.bytes(kind), kind, pos, &mut index, &mut lemma_to_synsets)?;
        }

        let mut synsets = HashMap::new();
        for (kind, pos) in [
            (FileKind::DataNoun, Pos::Noun),
            (FileKind::DataVerb, Pos::Verb),
            (FileKind::DataAdj, Pos::Adj),
            (FileKind::DataAdv, Pos::Adv),
        ] {
            parse_data(files.bytes(kind), kind, pos, &mut synsets)?;
        }

        let verb_frames_text = parse_frames_vrb(files.bytes(FileKind::Frames));
        let sense_counts = parse_cntlist(files.bytes(FileKind::Cntlist));

        Ok(Self {
            files,
            index,
            synsets,
            lemma_to_synsets,
            verb_frames_text,
            sense_counts,
        })
    }

    /// Check whether a lemma exists for the given POS according to index files.
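    ///
    /// # Example
    ///
    /// A minimal sketch, assuming `wn` was loaded from a standard dictionary:
    ///
    /// ```no_run
    /// # use wordnet_db::WordNet;
    /// # use wordnet_types::Pos;
    /// # fn main() -> anyhow::Result<()> {
    /// # let wn = WordNet::load("/path/to/wordnet")?;
    /// assert!(wn.lemma_exists(Pos::Noun, "dog"));
    /// // Lookups normalize case and spaces, so these are equivalent:
    /// assert_eq!(
    ///     wn.lemma_exists(Pos::Noun, "Sea Horse"),
    ///     wn.lemma_exists(Pos::Noun, "sea_horse"),
    /// );
    /// # Ok(()) }
    /// ```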
    pub fn lemma_exists(&self, pos: Pos, lemma: &str) -> bool {
        let key = (pos, normalize_lemma(lemma));
        self.lemma_to_synsets.contains_key(&key)
    }

    /// Fetch a raw [`IndexEntry`] if present.
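    ///
    /// # Example
    ///
    /// A minimal sketch, assuming `wn` was loaded from a standard dictionary:
    ///
    /// ```no_run
    /// # use wordnet_db::WordNet;
    /// # use wordnet_types::Pos;
    /// # fn main() -> anyhow::Result<()> {
    /// # let wn = WordNet::load("/path/to/wordnet")?;
    /// if let Some(entry) = wn.index_entry(Pos::Verb, "run") {
    ///     println!("{} senses, symbols {:?}", entry.sense_cnt, entry.ptr_symbols);
    /// }
    /// # Ok(()) }
    /// ```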
    pub fn index_entry(&self, pos: Pos, lemma: &str) -> Option<IndexEntry<'_>> {
        let key = (pos, normalize_lemma(lemma));
        self.index.get(&key).map(|entry| IndexEntry {
            lemma: self.files.text(entry.lemma),
            pos,
            synset_cnt: entry.synset_cnt,
            p_cnt: entry.p_cnt,
            ptr_symbols: entry
                .ptr_symbols
                .iter()
                .map(|r| self.files.text(*r))
                .collect(),
            sense_cnt: entry.sense_cnt,
            tagsense_cnt: entry.tagsense_cnt,
            synset_offsets: entry.synset_offsets.as_slice(),
        })
    }

    /// Return the synsets associated with a lemma, or an empty slice.
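    ///
    /// # Example
    ///
    /// A minimal sketch, assuming `wn` was loaded from a standard dictionary:
    ///
    /// ```no_run
    /// # use wordnet_db::WordNet;
    /// # use wordnet_types::Pos;
    /// # fn main() -> anyhow::Result<()> {
    /// # let wn = WordNet::load("/path/to/wordnet")?;
    /// for sid in wn.synsets_for_lemma(Pos::Adj, "quick") {
    ///     println!("synset offset {}", sid.offset);
    /// }
    /// # Ok(()) }
    /// ```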
    pub fn synsets_for_lemma(&self, pos: Pos, lemma: &str) -> &[SynsetId] {
        static EMPTY: [SynsetId; 0] = [];
        let key = (pos, normalize_lemma(lemma));
        self.lemma_to_synsets
            .get(&key)
            .map(|v| v.as_slice())
            .unwrap_or(&EMPTY)
    }

    /// Fetch a [`Synset`] by id if loaded.
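    ///
    /// # Example
    ///
    /// A minimal sketch, assuming `wn` was loaded from a standard dictionary:
    ///
    /// ```no_run
    /// # use wordnet_db::WordNet;
    /// # use wordnet_types::Pos;
    /// # fn main() -> anyhow::Result<()> {
    /// # let wn = WordNet::load("/path/to/wordnet")?;
    /// let sid = wn.synsets_for_lemma(Pos::Noun, "dog")[0];
    /// let syn = wn.get_synset(sid).expect("loaded synset");
    /// println!("{} words, gloss: {}", syn.words.len(), syn.gloss.definition);
    /// # Ok(()) }
    /// ```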
    pub fn get_synset(&self, id: SynsetId) -> Option<Synset<'_>> {
        self.synsets.get(&id).map(|syn| self.make_synset_view(syn))
    }

    /// Iterate over all synsets as borrowed views.
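    ///
    /// # Example
    ///
    /// A minimal sketch, assuming `wn` was loaded from a standard dictionary:
    ///
    /// ```no_run
    /// # use wordnet_db::WordNet;
    /// # fn main() -> anyhow::Result<()> {
    /// # let wn = WordNet::load("/path/to/wordnet")?;
    /// // Count synsets with more than one lemma, without collecting them.
    /// let multiword = wn.iter_synsets().filter(|s| s.words.len() > 1).count();
    /// println!("{multiword} synsets have multiple lemmas");
    /// # Ok(()) }
    /// ```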
    pub fn iter_synsets(&self) -> impl Iterator<Item = Synset<'_>> + '_ {
        self.synsets.values().map(|s| self.make_synset_view(s))
    }

    /// Number of index entries.
    pub fn index_count(&self) -> usize {
        self.index.len()
    }

    /// Number of distinct `(pos, lemma)` pairs tracked across all parts of speech.
    pub fn lemma_count(&self) -> usize {
        self.lemma_to_synsets.len()
    }

    /// Number of synsets.
    pub fn synset_count(&self) -> usize {
        self.synsets.len()
    }

    /// Number of verb frame template strings loaded.
    pub fn verb_frame_templates_count(&self) -> usize {
        self.verb_frames_text.len()
    }

    /// Number of sense-count entries parsed from cntlist.
    pub fn sense_count_entries(&self) -> usize {
        self.sense_counts.len()
    }

    /// Sense frequency for a given lemma/pos/synset, if present in `cntlist.rev`.
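    ///
    /// # Example
    ///
    /// A minimal sketch, assuming `wn` was loaded alongside a `cntlist.rev`
    /// file and that "dog" has at least one noun synset:
    ///
    /// ```no_run
    /// # use wordnet_db::WordNet;
    /// # use wordnet_types::Pos;
    /// # fn main() -> anyhow::Result<()> {
    /// # let wn = WordNet::load("/path/to/wordnet")?;
    /// let offset = wn.synsets_for_lemma(Pos::Noun, "dog")[0].offset;
    /// if let Some(freq) = wn.sense_count(Pos::Noun, "dog", offset) {
    ///     println!("tagged frequency: {freq}");
    /// }
    /// # Ok(()) }
    /// ```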
    pub fn sense_count(&self, pos: Pos, lemma: &str, synset_offset: u32) -> Option<u32> {
        let normalized = normalize_lemma(lemma);
        let entry = self.index.get(&(pos, normalized.clone()))?;
        let sense_number = entry
            .synset_offsets
            .iter()
            .position(|off| *off == synset_offset)?;
        // Sense numbers are 1-based positions within the index entry's offsets.
        let sense_number = sense_number as u32 + 1;
        self.sense_counts
            .get(&(normalized, pos, sense_number))
            .copied()
    }

    fn make_synset_view<'a>(&'a self, data: &'a SynsetData) -> Synset<'a> {
        let words = data
            .words
            .iter()
            .map(|w| Lemma {
                text: self.files.text(w.text),
                lex_id: w.lex_id,
            })
            .collect();
        let pointers = data
            .pointers
            .iter()
            .map(|p| Pointer {
                symbol: self.files.text(p.symbol),
                target: p.target,
                src_word: p.src_word,
                dst_word: p.dst_word,
            })
            .collect();
        let gloss = Gloss {
            raw: self.files.text(data.gloss.raw),
            definition: self.files.text(data.gloss.definition),
            examples: data
                .gloss
                .examples
                .iter()
                .map(|r| self.files.text(*r))
                .collect(),
        };

        Synset {
            id: data.id,
            lex_filenum: data.lex_filenum,
            synset_type: data.synset_type,
            words,
            pointers,
            frames: data.frames.as_slice(),
            gloss,
        }
    }
}

fn load_file(path: PathBuf, mode: LoadMode) -> Result<Buffer> {
    match mode {
        LoadMode::Mmap => {
            let file = File::open(&path).with_context(|| format!("open {}", path.display()))?;
            // SAFETY: the map is only sound while no other process truncates or
            // rewrites the file; WordNet dictionaries are treated as immutable.
            unsafe { Mmap::map(&file) }
                .map(Buffer::Mmap)
                .with_context(|| format!("mmap {}", path.display()))
        }
        LoadMode::Owned => {
            let mut file = File::open(&path).with_context(|| format!("open {}", path.display()))?;
            let mut buf = Vec::new();
            file.read_to_end(&mut buf)
                .with_context(|| format!("read {}", path.display()))?;
            Ok(Buffer::Owned(buf))
        }
    }
}

fn load_optional_file(path: PathBuf, mode: LoadMode) -> Result<Option<Buffer>> {
    if !path.exists() {
        return Ok(None);
    }
    load_file(path, mode).map(Some)
}

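// Index line layout, per `wndb(5)` (license header lines start with
// whitespace and are skipped):
//
//   lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt
//   synset_offset [synset_offset...]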
fn parse_index(
    bytes: &[u8],
    file: FileKind,
    pos: Pos,
    index: &mut HashMap<(Pos, String), IndexEntryData>,
    lemma_to_synsets: &mut HashMap<(Pos, String), Vec<SynsetId>>,
) -> Result<()> {
    for (lineno, raw_line) in bytes.split(|b| *b == b'\n').enumerate() {
        let line = strip_cr(raw_line);
        if line.is_empty() || matches!(line.first(), Some(b' ' | b'\t')) {
            continue;
        }
        let line_str = std::str::from_utf8(line)?;
        let tokens: Vec<&str> = line_str.split_ascii_whitespace().collect();
        if tokens.len() < 6 {
            anyhow::bail!(
                "{:?}:{} malformed index line (too few tokens)",
                file,
                lineno + 1
            );
        }

        let lemma_token = tokens[0];
        let lemma_ref = text_ref_str(file, bytes, lemma_token);
        let lemma_key = normalize_lemma(lemma_token);

        // tokens[1] is the pos character, already implied by the file; skip it.
        let synset_cnt: u32 = tokens[2]
            .parse()
            .with_context(|| format!("index {:?}:{} synset_cnt", file, lineno + 1))?;
        let p_cnt: u32 = tokens[3]
            .parse()
            .with_context(|| format!("index {:?}:{} p_cnt", file, lineno + 1))?;

        let expected_ptrs = p_cnt as usize;
        let mut idx = 4;
        if tokens.len() < idx + expected_ptrs {
            anyhow::bail!("{:?}:{} pointer count mismatch", file, lineno + 1);
        }
        let ptr_symbols = tokens[idx..idx + expected_ptrs]
            .iter()
            .map(|sym| text_ref_str(file, bytes, sym))
            .collect::<Vec<_>>();
        idx += expected_ptrs;
        if tokens.len() < idx + 2 {
            anyhow::bail!("{:?}:{} missing sense counts", file, lineno + 1);
        }
        let sense_cnt: u32 = tokens[idx]
            .parse()
            .with_context(|| format!("index {:?}:{} sense_cnt", file, lineno + 1))?;
        idx += 1;
        let tagsense_cnt: u32 = tokens[idx]
            .parse()
            .with_context(|| format!("index {:?}:{} tagsense_cnt", file, lineno + 1))?;
        idx += 1;

        let offsets: Vec<u32> = tokens[idx..]
            .iter()
            .map(|t| {
                t.parse::<u32>()
                    .with_context(|| format!("index {:?}:{} synset_offsets", file, lineno + 1))
            })
            .collect::<Result<_>>()?;
        if offsets.len() != synset_cnt as usize {
            anyhow::bail!(
                "{:?}:{} synset_cnt mismatch (expected {}, got {})",
                file,
                lineno + 1,
                synset_cnt,
                offsets.len()
            );
        }

        index.insert(
            (pos, lemma_key.clone()),
            IndexEntryData {
                lemma: lemma_ref,
                synset_cnt,
                p_cnt,
                ptr_symbols,
                sense_cnt,
                tagsense_cnt,
                synset_offsets: offsets.clone(),
            },
        );
        lemma_to_synsets.insert(
            (pos, lemma_key),
            offsets
                .into_iter()
                .map(|offset| SynsetId { pos, offset })
                .collect(),
        );
    }

    Ok(())
}

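// Data line layout, per `wndb(5)`; the gloss follows a literal `|`:
//
//   synset_offset lex_filenum ss_type w_cnt word lex_id [word lex_id...]
//   p_cnt [ptr_symbol synset_offset pos source/target]...
//   [f_cnt + f_num w_num...] | gloss
//
// `w_cnt`, `lex_id`, and frame `w_num` are hexadecimal; the rest are decimal.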
fn parse_data(
    bytes: &[u8],
    file: FileKind,
    pos: Pos,
    synsets: &mut HashMap<SynsetId, SynsetData>,
) -> Result<()> {
    for (lineno, raw_line) in bytes.split(|b| *b == b'\n').enumerate() {
        let line = strip_cr(raw_line);
        if line.is_empty() || matches!(line.first(), Some(b' ' | b'\t')) {
            continue;
        }
        let line_str = std::str::from_utf8(line)?;
        let (left, gloss_part) = match line_str.split_once('|') {
            Some((l, r)) => (l.trim(), r.trim()),
            None => (line_str.trim(), ""),
        };

        let tokens: Vec<&str> = left.split_ascii_whitespace().collect();
        if tokens.len() < 4 {
            anyhow::bail!("{:?}:{} malformed data line", file, lineno + 1);
        }

        let offset: u32 = tokens[0]
            .parse()
            .with_context(|| format!("{:?}:{} offset", file, lineno + 1))?;
        let lex_filenum: u8 = tokens[1]
            .parse()
            .with_context(|| format!("{:?}:{} lex_filenum", file, lineno + 1))?;
        let ss_type_char = tokens[2]
            .chars()
            .next()
            .ok_or_else(|| anyhow::anyhow!("{:?}:{} missing ss_type", file, lineno + 1))?;
        let synset_type = SynsetType::from_char(ss_type_char).ok_or_else(|| {
            anyhow::anyhow!("{:?}:{} invalid ss_type {}", file, lineno + 1, ss_type_char)
        })?;
        // w_cnt is two-digit hexadecimal in the data files.
        let w_cnt: usize = usize::from_str_radix(tokens[3], 16)
            .with_context(|| format!("{:?}:{} w_cnt", file, lineno + 1))?;

        let mut idx = 4;
        if tokens.len() < idx + (w_cnt * 2) {
            anyhow::bail!("{:?}:{} not enough word/lex_id pairs", file, lineno + 1);
        }
        let mut words = Vec::with_capacity(w_cnt);
        for _ in 0..w_cnt {
            let text_token = tokens[idx];
            let lex_id_token = tokens[idx + 1];
            // lex_id is a single hexadecimal digit.
            let lex_id: u8 = u8::from_str_radix(lex_id_token, 16)
                .with_context(|| format!("{:?}:{} lex_id", file, lineno + 1))?;
            words.push(LemmaData {
                text: text_ref_str(file, bytes, text_token),
                lex_id,
            });
            idx += 2;
        }

        if tokens.len() <= idx {
            anyhow::bail!("{:?}:{} missing pointer count", file, lineno + 1);
        }
        let p_cnt: usize = tokens[idx]
            .parse()
            .with_context(|| format!("{:?}:{} p_cnt", file, lineno + 1))?;
        idx += 1;

        let mut pointers = Vec::with_capacity(p_cnt);
        for _ in 0..p_cnt {
            if tokens.len() < idx + 4 {
                anyhow::bail!("{:?}:{} incomplete pointer block", file, lineno + 1);
            }
            let symbol = tokens[idx];
            let target_offset: u32 = tokens[idx + 1]
                .parse()
                .with_context(|| format!("{:?}:{} pointer target offset", file, lineno + 1))?;
            let target_pos = tokens[idx + 2]
                .chars()
                .next()
                .and_then(Pos::from_char)
                .ok_or_else(|| anyhow::anyhow!("{:?}:{} pointer target pos", file, lineno + 1))?;
            let (src_word, dst_word) = decode_st(tokens[idx + 3]);
            pointers.push(PointerData {
                symbol: text_ref_str(file, bytes, symbol),
                target: SynsetId {
                    pos: target_pos,
                    offset: target_offset,
                },
                src_word,
                dst_word,
            });
            idx += 4;
        }

        // Frame blocks (`f_cnt + f_num w_num ...`) appear only in data.verb.
        let mut frames = Vec::new();
        if matches!(pos, Pos::Verb) {
            let f_cnt: usize = if tokens.len() <= idx {
                0
            } else {
                let v: usize = tokens[idx]
                    .parse()
                    .with_context(|| format!("{:?}:{} f_cnt", file, lineno + 1))?;
                idx += 1;
                v
            };
            for _ in 0..f_cnt {
                if tokens.len() < idx + 3 {
                    anyhow::bail!("{:?}:{} incomplete frame entry", file, lineno + 1);
                }
                if tokens[idx] != "+" {
                    anyhow::bail!("{:?}:{} expected '+' before frame entry", file, lineno + 1);
                }
                let frame_number: u16 = tokens[idx + 1]
                    .parse()
                    .with_context(|| format!("{:?}:{} frame_number", file, lineno + 1))?;
                let word_number = parse_word_number(tokens[idx + 2]);
                frames.push(Frame {
                    frame_number,
                    word_number,
                });
                idx += 3;
            }
        }

        let gloss = parse_gloss(file, bytes, gloss_part)?;
        let id = SynsetId { pos, offset };
        synsets.insert(
            id,
            SynsetData {
                id,
                lex_filenum,
                synset_type,
                words,
                pointers,
                frames,
                gloss,
            },
        );
    }

    Ok(())
}

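// Gloss layout: a definition, optionally followed by `;`-separated,
// double-quoted example sentences, e.g.
//
//   a member of the genus Canis; "the dog barked all night"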
fn parse_gloss(file: FileKind, root: &[u8], gloss: &str) -> Result<GlossData> {
    let trimmed = gloss.trim();
    if trimmed.is_empty() {
        // A data line without `|` passes a static empty string; computing a
        // pointer offset against `root` would be bogus (and can panic), so
        // synthesize an empty, in-bounds TextRef instead.
        let empty = TextRef { file, start: 0, len: 0 };
        return Ok(GlossData {
            raw: empty,
            definition: empty,
            examples: Vec::new(),
        });
    }
    let gloss_raw = text_ref_str(file, root, trimmed);

    let mut examples = Vec::new();
    let mut in_quote = false;
    let mut quote_start: Option<usize> = None;
    let mut def_end = trimmed.len();
    for (idx, ch) in trimmed.char_indices() {
        match ch {
            '"' => {
                if in_quote {
                    if let Some(start) = quote_start.take()
                        && idx > start + 1
                    {
                        let start_bytes =
                            trimmed.as_ptr() as usize + start + 1 - root.as_ptr() as usize;
                        examples.push(TextRef {
                            file,
                            start: start_bytes,
                            len: idx - start - 1,
                        });
                    }
                } else {
                    quote_start = Some(idx);
                }
                in_quote = !in_quote;
            }
            ';' if !in_quote && def_end == trimmed.len() => {
                def_end = idx;
            }
            _ => {}
        }
    }

    let definition_slice = trimmed[..def_end].trim();
    let def_start = definition_slice.as_ptr() as usize - trimmed.as_ptr() as usize;

    let definition = TextRef {
        file,
        start: trimmed.as_ptr() as usize + def_start - root.as_ptr() as usize,
        len: definition_slice.len(),
    };

    Ok(GlossData {
        raw: gloss_raw,
        definition,
        examples,
    })
}

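// frames.vrb is expected as one `frame_number template-text` pair per line,
// e.g. `1 Something ----s`; lines with an unparsable number are reported and
// skipped.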
fn parse_frames_vrb(bytes: &[u8]) -> HashMap<u16, TextRef> {
    let mut frames = HashMap::new();
    for (lineno, raw_line) in bytes.split(|b| *b == b'\n').enumerate() {
        let line = strip_cr(raw_line);
        if line.is_empty() {
            continue;
        }
        let line_str = match std::str::from_utf8(line) {
            Ok(s) => s,
            Err(_) => continue,
        };
        let mut parts = line_str.splitn(2, ' ');
        let num = parts.next().and_then(|t| t.parse::<u16>().ok());
        let text = parts.next().map(str::trim).unwrap_or("");
        if let Some(n) = num {
            // Guard against a frame number with no template: `text` would be a
            // static "" whose pointer does not lie inside `bytes`.
            let start = if text.is_empty() {
                0
            } else {
                text.as_ptr() as usize - bytes.as_ptr() as usize
            };
            frames.insert(
                n,
                TextRef {
                    file: FileKind::Frames,
                    start,
                    len: text.len(),
                },
            );
        } else {
            eprintln!("frames.vrb:{} invalid frame number", lineno + 1);
        }
    }
    frames
}

fn parse_cntlist(bytes: &[u8]) -> HashMap<(String, Pos, u32), u32> {
    let mut counts = HashMap::new();
    for raw_line in bytes.split(|b| *b == b'\n') {
        let line = strip_cr(raw_line);
        if line.is_empty() {
            continue;
        }
        let line_str = match std::str::from_utf8(line) {
            Ok(s) => s,
            Err(_) => continue,
        };
        let tokens: Vec<&str> = line_str.split_ascii_whitespace().collect();
        if tokens.len() < 3 {
            continue;
        }
        let count: u32 = match tokens[0].parse() {
            Ok(c) => c,
            Err(_) => continue,
        };
        // Real cntlist uses sense_key; here we accept `lemma pos sense` for flexibility.
        let lemma = normalize_lemma(tokens[1]);
        let pos = tokens[2]
            .chars()
            .next()
            .and_then(Pos::from_char)
            .unwrap_or(Pos::Noun);
        let sense_number: u32 = tokens.get(3).and_then(|t| t.parse().ok()).unwrap_or(1);
        counts.insert((lemma, pos, sense_number), count);
    }
    counts
}

fn text_ref_str(file: FileKind, root: &[u8], token: &str) -> TextRef {
    // `token` must be a subslice of `root`; the offset is recovered via
    // pointer arithmetic so only a (file, start, len) triple is stored.
    let start = token.as_ptr() as usize - root.as_ptr() as usize;
    TextRef {
        file,
        start,
        len: token.len(),
    }
}

fn strip_cr(line: &[u8]) -> &[u8] {
    if line.ends_with(b"\r") {
        &line[..line.len() - 1]
    } else {
        line
    }
}

fn parse_word_number(token: &str) -> Option<u16> {
    // Frame word numbers are two-digit hexadecimal; `00` means the frame
    // applies to every word in the synset, which we model as `None`. (A
    // decimal fallback would be dead code: every decimal string also parses
    // as hex.)
    u16::from_str_radix(token, 16)
        .ok()
        .and_then(|v| if v == 0 { None } else { Some(v) })
}

fn normalize_lemma(text: &str) -> String {
    let mut s = text.trim().to_string();
    s.make_ascii_lowercase();
    s.replace(' ', "_")
}
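
// A minimal sanity-check sketch for the pure helpers above; the fixture
// values are illustrative, not drawn from a real dictionary.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn normalize_lemma_lowercases_and_joins() {
        assert_eq!(normalize_lemma("  Sea Horse "), "sea_horse");
        assert_eq!(normalize_lemma("dog"), "dog");
    }

    #[test]
    fn parse_word_number_is_hex_with_zero_as_none() {
        assert_eq!(parse_word_number("00"), None);
        assert_eq!(parse_word_number("02"), Some(2));
        assert_eq!(parse_word_number("0a"), Some(10));
        assert_eq!(parse_word_number("zz"), None);
    }

    #[test]
    fn strip_cr_removes_trailing_carriage_return() {
        assert_eq!(strip_cr(b"abc\r"), b"abc".as_slice());
        assert_eq!(strip_cr(b"abc"), b"abc".as_slice());
    }
}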