wordnet_db/
lib.rs

//! Load WordNet dictionaries with full fidelity and zero-copy text.
//!
//! This crate ingests the canonical `data.*`/`index.*` files, preserves every
//! field (`lex_id`, `ss_type`, pointer source/target indices, verb frames),
//! and exposes borrowed `&str` slices for all text. Callers choose between
//! memory-mapped files or owned buffers at runtime via [`LoadMode`].
//!
//! Public access is intentionally read-only (no `pub` fields), leaving room to
//! evolve internal storage while keeping a stable API surface.
//!
//! # Features
//! - Zero-copy text: lemmas, pointer symbols, glosses, and indices borrow from
//!   the original bytes.
//! - Full-fidelity parsing: retains raw offsets, satellite adjectives, frames,
//!   and pointer source/target indices.
//! - Runtime backing choice: switch between mmap and owned buffers with
//!   [`LoadMode::Mmap`] / [`LoadMode::Owned`].
//! - Convenience lookups: lemma existence, index entries, synset fetching,
//!   and a streaming iterator over all synsets.
//!
//! # Example
//! ```no_run
//! use wordnet_db::{LoadMode, WordNet};
//! use wordnet_types::Pos;
//!
//! # fn main() -> anyhow::Result<()> {
//! let wn = WordNet::load_with_mode("/path/to/wordnet", LoadMode::Mmap)?;
//! let dog_index = wn.index_entry(Pos::Noun, "dog").expect("dog in index");
//! println!("dog synsets: {:?}", dog_index.synset_offsets);
//!
//! for sid in wn.synsets_for_lemma(Pos::Noun, "dog") {
//!     let syn = wn.get_synset(*sid).unwrap();
//!     println!("{}: {}", syn.id.offset, syn.gloss.definition);
//! }
//! # Ok(()) }
//! ```
//!
//! For a runnable demo, see `cargo run -p wordnet-db --example stats -- <dict>`.

use std::collections::HashMap;
use std::fs::File;
use std::io::Read;
use std::path::{Path, PathBuf};

use anyhow::{Context, Result};
use memmap2::Mmap;
use wordnet_types::{
    Frame, Gloss, IndexEntry, Lemma, Pointer, Pos, Synset, SynsetId, SynsetType, decode_st,
};

/// Strategy for loading dictionary files.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum LoadMode {
    /// Memory-map each WordNet file (fast, zero-copy).
    Mmap,
    /// Read each file into an owned buffer (portable fallback).
    Owned,
}

enum Buffer {
    Mmap(Mmap),
    Owned(Vec<u8>),
}

impl Buffer {
    fn as_slice(&self) -> &[u8] {
        match self {
            Buffer::Mmap(m) => m.as_ref(),
            Buffer::Owned(v) => v.as_slice(),
        }
    }
}

#[derive(Clone, Copy, Debug)]
enum FileKind {
    DataNoun,
    DataVerb,
    DataAdj,
    DataAdv,
    IndexNoun,
    IndexVerb,
    IndexAdj,
    IndexAdv,
    Frames,
    Cntlist,
}

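/// Half-open byte range into one of the dictionary files, resolved back to
/// text with `DictFiles::text`. Storing `(file, start, len)` instead of `&str`
/// keeps these structs free of self-referential lifetimes.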
#[derive(Clone, Copy)]
struct TextRef {
    file: FileKind,
    start: usize,
    len: usize,
}

struct DictFiles {
    data_noun: Buffer,
    data_verb: Buffer,
    data_adj: Buffer,
    data_adv: Buffer,
    index_noun: Buffer,
    index_verb: Buffer,
    index_adj: Buffer,
    index_adv: Buffer,
    frames: Option<Buffer>,
    cntlist: Option<Buffer>,
}

impl DictFiles {
    fn load(dict_dir: &Path, mode: LoadMode) -> Result<Self> {
        let data_noun = load_file(dict_dir.join("data.noun"), mode)?;
        let data_verb = load_file(dict_dir.join("data.verb"), mode)?;
        let data_adj = load_file(dict_dir.join("data.adj"), mode)?;
        let data_adv = load_file(dict_dir.join("data.adv"), mode)?;
        let index_noun = load_file(dict_dir.join("index.noun"), mode)?;
        let index_verb = load_file(dict_dir.join("index.verb"), mode)?;
        let index_adj = load_file(dict_dir.join("index.adj"), mode)?;
        let index_adv = load_file(dict_dir.join("index.adv"), mode)?;
        let frames = load_optional_file(dict_dir.join("frames.vrb"), mode)?;
        let cntlist = load_optional_file(dict_dir.join("cntlist.rev"), mode)?;

        Ok(Self {
            data_noun,
            data_verb,
            data_adj,
            data_adv,
            index_noun,
            index_verb,
            index_adj,
            index_adv,
            frames,
            cntlist,
        })
    }

    fn bytes(&self, file: FileKind) -> &[u8] {
        match file {
            FileKind::DataNoun => self.data_noun.as_slice(),
            FileKind::DataVerb => self.data_verb.as_slice(),
            FileKind::DataAdj => self.data_adj.as_slice(),
            FileKind::DataAdv => self.data_adv.as_slice(),
            FileKind::IndexNoun => self.index_noun.as_slice(),
            FileKind::IndexVerb => self.index_verb.as_slice(),
            FileKind::IndexAdj => self.index_adj.as_slice(),
            FileKind::IndexAdv => self.index_adv.as_slice(),
            FileKind::Frames => self.frames.as_ref().map(Buffer::as_slice).unwrap_or(&[]),
            FileKind::Cntlist => self.cntlist.as_ref().map(Buffer::as_slice).unwrap_or(&[]),
        }
    }

    fn text(&self, r: TextRef) -> &str {
        let bytes = self.bytes(r.file);
        let slice = &bytes[r.start..r.start + r.len];
        std::str::from_utf8(slice).expect("wordnet text is valid utf8")
    }
}

struct LemmaData {
    text: TextRef,
    lex_id: u8,
}

struct PointerData {
    symbol: TextRef,
    target: SynsetId,
    src_word: Option<u16>,
    dst_word: Option<u16>,
}

struct GlossData {
    raw: TextRef,
    definition: TextRef,
    examples: Vec<TextRef>,
}

struct SynsetData {
    id: SynsetId,
    lex_filenum: u8,
    synset_type: SynsetType,
    words: Vec<LemmaData>,
    pointers: Vec<PointerData>,
    frames: Vec<Frame>,
    gloss: GlossData,
}

struct IndexEntryData {
    lemma: TextRef,
    synset_cnt: u32,
    p_cnt: u32,
    ptr_symbols: Vec<TextRef>,
    sense_cnt: u32,
    tagsense_cnt: u32,
    synset_offsets: Vec<u32>,
}

/// In-memory view of a WordNet dictionary backed by mmap or owned buffers.
pub struct WordNet {
    files: DictFiles,
    index: HashMap<(Pos, String), IndexEntryData>,
    synsets: HashMap<SynsetId, SynsetData>,
    lemma_to_synsets: HashMap<(Pos, String), Vec<SynsetId>>,
    verb_frames_text: HashMap<u16, TextRef>,
    sense_counts: HashMap<(String, Pos, u32), u32>,
}

impl WordNet {
    /// Load WordNet from a directory containing `data.*` and `index.*` files.
    ///
    /// Defaults to memory-mapping the source files. Use
    /// [`Self::load_with_mode`] to force owned buffers instead.
    pub fn load(dict_dir: impl AsRef<Path>) -> Result<Self> {
        Self::load_with_mode(dict_dir, LoadMode::Mmap)
    }

    /// Load WordNet choosing between mmap and owned buffers at runtime.
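    ///
    /// A minimal sketch of forcing owned buffers (the path is illustrative):
    ///
    /// ```no_run
    /// use wordnet_db::{LoadMode, WordNet};
    ///
    /// # fn main() -> anyhow::Result<()> {
    /// let wn = WordNet::load_with_mode("/path/to/wordnet", LoadMode::Owned)?;
    /// assert!(wn.synset_count() > 0);
    /// # Ok(()) }
    /// ```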
    pub fn load_with_mode(dict_dir: impl AsRef<Path>, mode: LoadMode) -> Result<Self> {
        let dir = dict_dir.as_ref();
        let required = [
            "data.noun",
            "data.verb",
            "data.adj",
            "data.adv",
            "index.noun",
            "index.verb",
            "index.adj",
            "index.adv",
        ];
        for name in &required {
            let path = dir.join(name);
            if !path.exists() {
                anyhow::bail!("missing required WordNet file: {}", path.display());
            }
        }

        let files = DictFiles::load(dir, mode)?;

        let mut index = HashMap::new();
        let mut lemma_to_synsets = HashMap::new();
        parse_index(
            files.bytes(FileKind::IndexNoun),
            FileKind::IndexNoun,
            Pos::Noun,
            &mut index,
            &mut lemma_to_synsets,
        )?;
        parse_index(
            files.bytes(FileKind::IndexVerb),
            FileKind::IndexVerb,
            Pos::Verb,
            &mut index,
            &mut lemma_to_synsets,
        )?;
        parse_index(
            files.bytes(FileKind::IndexAdj),
            FileKind::IndexAdj,
            Pos::Adj,
            &mut index,
            &mut lemma_to_synsets,
        )?;
        parse_index(
            files.bytes(FileKind::IndexAdv),
            FileKind::IndexAdv,
            Pos::Adv,
            &mut index,
            &mut lemma_to_synsets,
        )?;

        let mut synsets = HashMap::new();
        parse_data(
            files.bytes(FileKind::DataNoun),
            FileKind::DataNoun,
            Pos::Noun,
            &mut synsets,
        )?;
        parse_data(
            files.bytes(FileKind::DataVerb),
            FileKind::DataVerb,
            Pos::Verb,
            &mut synsets,
        )?;
        parse_data(
            files.bytes(FileKind::DataAdj),
            FileKind::DataAdj,
            Pos::Adj,
            &mut synsets,
        )?;
        parse_data(
            files.bytes(FileKind::DataAdv),
            FileKind::DataAdv,
            Pos::Adv,
            &mut synsets,
        )?;

        let verb_frames_text = parse_frames_vrb(files.bytes(FileKind::Frames));
        let sense_counts = parse_cntlist(files.bytes(FileKind::Cntlist));

        Ok(Self {
            files,
            index,
            synsets,
            lemma_to_synsets,
            verb_frames_text,
            sense_counts,
        })
    }

    /// Check whether a lemma exists for the given POS according to index files.
    pub fn lemma_exists(&self, pos: Pos, lemma: &str) -> bool {
        let key = (pos, normalize_lemma(lemma));
        self.lemma_to_synsets.contains_key(&key)
    }

    /// Fetch a raw `IndexEntry` if present.
    pub fn index_entry(&self, pos: Pos, lemma: &str) -> Option<IndexEntry<'_>> {
        let key = (pos, normalize_lemma(lemma));
        self.index.get(&key).map(|entry| IndexEntry {
            lemma: self.files.text(entry.lemma),
            pos,
            synset_cnt: entry.synset_cnt,
            p_cnt: entry.p_cnt,
            ptr_symbols: entry
                .ptr_symbols
                .iter()
                .map(|r| self.files.text(*r))
                .collect(),
            sense_cnt: entry.sense_cnt,
            tagsense_cnt: entry.tagsense_cnt,
            synset_offsets: entry.synset_offsets.as_slice(),
        })
    }

    /// Return the synsets associated with a lemma, or an empty slice.
    pub fn synsets_for_lemma(&self, pos: Pos, lemma: &str) -> &[SynsetId] {
        static EMPTY: [SynsetId; 0] = [];
        let key = (pos, normalize_lemma(lemma));
        self.lemma_to_synsets
            .get(&key)
            .map(|v| v.as_slice())
            .unwrap_or(&EMPTY)
    }

    /// Fetch a `Synset` by id if loaded.
    pub fn get_synset(&self, id: SynsetId) -> Option<Synset<'_>> {
        self.synsets.get(&id).map(|syn| self.make_synset_view(syn))
    }

    /// Iterate over all synsets as borrowed views.
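    ///
    /// A minimal sketch of counting noun synsets (path illustrative):
    ///
    /// ```no_run
    /// # fn main() -> anyhow::Result<()> {
    /// let wn = wordnet_db::WordNet::load("/path/to/wordnet")?;
    /// let nouns = wn
    ///     .iter_synsets()
    ///     .filter(|s| matches!(s.id.pos, wordnet_types::Pos::Noun))
    ///     .count();
    /// println!("{nouns} noun synsets");
    /// # Ok(()) }
    /// ```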
    pub fn iter_synsets(&self) -> impl Iterator<Item = Synset<'_>> + '_ {
        self.synsets.values().map(|s| self.make_synset_view(s))
    }

    /// Number of index entries.
    pub fn index_count(&self) -> usize {
        self.index.len()
    }

    /// Number of lemmas tracked across all parts of speech.
    pub fn lemma_count(&self) -> usize {
        self.lemma_to_synsets.len()
    }

    /// Number of synsets.
    pub fn synset_count(&self) -> usize {
        self.synsets.len()
    }

    /// Number of verb frame template strings loaded.
    pub fn verb_frame_templates_count(&self) -> usize {
        self.verb_frames_text.len()
    }

    /// Number of sense-count entries parsed from cntlist.
    pub fn sense_count_entries(&self) -> usize {
        self.sense_counts.len()
    }

    fn make_synset_view<'a>(&'a self, data: &'a SynsetData) -> Synset<'a> {
        let words = data
            .words
            .iter()
            .map(|w| Lemma {
                text: self.files.text(w.text),
                lex_id: w.lex_id,
            })
            .collect();
        let pointers = data
            .pointers
            .iter()
            .map(|p| Pointer {
                symbol: self.files.text(p.symbol),
                target: p.target,
                src_word: p.src_word,
                dst_word: p.dst_word,
            })
            .collect();
        let gloss = Gloss {
            raw: self.files.text(data.gloss.raw),
            definition: self.files.text(data.gloss.definition),
            examples: data
                .gloss
                .examples
                .iter()
                .map(|r| self.files.text(*r))
                .collect(),
        };

        Synset {
            id: data.id,
            lex_filenum: data.lex_filenum,
            synset_type: data.synset_type,
            words,
            pointers,
            frames: data.frames.as_slice(),
            gloss,
        }
    }
}

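// `Mmap::map` is unsafe because the underlying file must not be truncated or
// mutated while the map is alive; we assume the dictionary directory stays
// unchanged for the lifetime of the process.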
fn load_file(path: PathBuf, mode: LoadMode) -> Result<Buffer> {
    match mode {
        LoadMode::Mmap => {
            let file = File::open(&path).with_context(|| format!("open {}", path.display()))?;
            unsafe { Mmap::map(&file) }
                .map(Buffer::Mmap)
                .with_context(|| format!("mmap {}", path.display()))
        }
        LoadMode::Owned => {
            let mut file = File::open(&path).with_context(|| format!("open {}", path.display()))?;
            let mut buf = Vec::new();
            file.read_to_end(&mut buf)
                .with_context(|| format!("read {}", path.display()))?;
            Ok(Buffer::Owned(buf))
        }
    }
}

fn load_optional_file(path: PathBuf, mode: LoadMode) -> Result<Option<Buffer>> {
    if !path.exists() {
        return Ok(None);
    }
    load_file(path, mode).map(Some)
}

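// index.* line layout, per wndb(5):
//   lemma  pos  synset_cnt  p_cnt  [ptr_symbol...]  sense_cnt  tagsense_cnt
//   synset_offset  [synset_offset...]
// License/header lines begin with whitespace and are skipped below.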
fn parse_index(
    bytes: &[u8],
    file: FileKind,
    pos: Pos,
    index: &mut HashMap<(Pos, String), IndexEntryData>,
    lemma_to_synsets: &mut HashMap<(Pos, String), Vec<SynsetId>>,
) -> Result<()> {
    for (lineno, raw_line) in bytes.split(|b| *b == b'\n').enumerate() {
        let line = strip_cr(raw_line);
        if line.is_empty() || matches!(line.first(), Some(b' ' | b'\t')) {
            continue;
        }
        let line_str = std::str::from_utf8(line)?;
        let tokens: Vec<&str> = line_str.split_ascii_whitespace().collect();
        if tokens.len() < 6 {
            anyhow::bail!(
                "{:?}:{} malformed index line (too few tokens)",
                file,
                lineno + 1
            );
        }

        let lemma_token = tokens[0];
        let lemma_ref = text_ref_str(file, bytes, lemma_token);
        let lemma_key = normalize_lemma(lemma_token);

        let synset_cnt: u32 = tokens[2]
            .parse()
            .with_context(|| format!("index {:?}:{} synset_cnt", file, lineno + 1))?;
        let p_cnt: u32 = tokens[3]
            .parse()
            .with_context(|| format!("index {:?}:{} p_cnt", file, lineno + 1))?;

        let expected_ptrs = p_cnt as usize;
        let mut idx = 4;
        if tokens.len() < idx + expected_ptrs {
            anyhow::bail!("{:?}:{} pointer count mismatch", file, lineno + 1);
        }
        let ptr_symbols = tokens[idx..idx + expected_ptrs]
            .iter()
            .map(|sym| text_ref_str(file, bytes, sym))
            .collect::<Vec<_>>();
        idx += expected_ptrs;
        if tokens.len() < idx + 2 {
            anyhow::bail!("{:?}:{} missing sense counts", file, lineno + 1);
        }
        let sense_cnt: u32 = tokens[idx]
            .parse()
            .with_context(|| format!("index {:?}:{} sense_cnt", file, lineno + 1))?;
        idx += 1;
        let tagsense_cnt: u32 = tokens[idx]
            .parse()
            .with_context(|| format!("index {:?}:{} tagsense_cnt", file, lineno + 1))?;
        idx += 1;

        let offsets: Vec<u32> = tokens[idx..]
            .iter()
            .map(|t| {
                t.parse::<u32>()
                    .with_context(|| format!("index {:?}:{} synset_offsets", file, lineno + 1))
            })
            .collect::<Result<_>>()?;
        if offsets.len() != synset_cnt as usize {
            anyhow::bail!(
                "{:?}:{} synset_cnt mismatch (expected {}, got {})",
                file,
                lineno + 1,
                synset_cnt,
                offsets.len()
            );
        }

        index.insert(
            (pos, lemma_key.clone()),
            IndexEntryData {
                lemma: lemma_ref,
                synset_cnt,
                p_cnt,
                ptr_symbols,
                sense_cnt,
                tagsense_cnt,
                synset_offsets: offsets.clone(),
            },
        );
        lemma_to_synsets.insert(
            (pos, lemma_key),
            offsets
                .into_iter()
                .map(|offset| SynsetId { pos, offset })
                .collect(),
        );
    }

    Ok(())
}

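// data.* line layout, per wndb(5):
//   synset_offset  lex_filenum  ss_type  w_cnt  word  lex_id  [word lex_id...]
//   p_cnt  [ptr...]  [frames...]  |  gloss
// where each ptr is `pointer_symbol synset_offset pos source/target`, and
// w_cnt and lex_id are hexadecimal.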
fn parse_data(
    bytes: &[u8],
    file: FileKind,
    pos: Pos,
    synsets: &mut HashMap<SynsetId, SynsetData>,
) -> Result<()> {
    for (lineno, raw_line) in bytes.split(|b| *b == b'\n').enumerate() {
        let line = strip_cr(raw_line);
        if line.is_empty() || matches!(line.first(), Some(b' ' | b'\t')) {
            continue;
        }
        let line_str = std::str::from_utf8(line)?;
        let (left, gloss_part) = match line_str.split_once('|') {
            Some((l, r)) => (l.trim(), r.trim()),
            None => (line_str.trim(), ""),
        };

        let tokens: Vec<&str> = left.split_ascii_whitespace().collect();
        if tokens.len() < 4 {
            anyhow::bail!("{:?}:{} malformed data line", file, lineno + 1);
        }

        let offset: u32 = tokens[0]
            .parse()
            .with_context(|| format!("{:?}:{} offset", file, lineno + 1))?;
        let lex_filenum: u8 = tokens[1]
            .parse()
            .with_context(|| format!("{:?}:{} lex_filenum", file, lineno + 1))?;
        let ss_type_char = tokens[2]
            .chars()
            .next()
            .ok_or_else(|| anyhow::anyhow!("{:?}:{} missing ss_type", file, lineno + 1))?;
        let synset_type = SynsetType::from_char(ss_type_char).ok_or_else(|| {
            anyhow::anyhow!("{:?}:{} invalid ss_type {}", file, lineno + 1, ss_type_char)
        })?;
        let w_cnt: usize = usize::from_str_radix(tokens[3], 16)
            .with_context(|| format!("{:?}:{} w_cnt", file, lineno + 1))?;

        let mut idx = 4;
        if tokens.len() < idx + (w_cnt * 2) {
            anyhow::bail!("{:?}:{} not enough word/lex_id pairs", file, lineno + 1);
        }
        let mut words = Vec::with_capacity(w_cnt);
        for _ in 0..w_cnt {
            let text_token = tokens[idx];
            let lex_id_token = tokens[idx + 1];
            let lex_id: u8 = u8::from_str_radix(lex_id_token, 16)
                .with_context(|| format!("{:?}:{} lex_id", file, lineno + 1))?;
            words.push(LemmaData {
                text: text_ref_str(file, bytes, text_token),
                lex_id,
            });
            idx += 2;
        }

        if tokens.len() <= idx {
            anyhow::bail!("{:?}:{} missing pointer count", file, lineno + 1);
        }
        let p_cnt: usize = tokens[idx]
            .parse()
            .with_context(|| format!("{:?}:{} p_cnt", file, lineno + 1))?;
        idx += 1;

        let mut pointers = Vec::with_capacity(p_cnt);
        for _ in 0..p_cnt {
            if tokens.len() < idx + 4 {
                anyhow::bail!("{:?}:{} incomplete pointer block", file, lineno + 1);
            }
            let symbol = tokens[idx];
            let target_offset: u32 = tokens[idx + 1]
                .parse()
                .with_context(|| format!("{:?}:{} pointer target offset", file, lineno + 1))?;
            let target_pos = tokens[idx + 2]
                .chars()
                .next()
                .and_then(Pos::from_char)
                .ok_or_else(|| anyhow::anyhow!("{:?}:{} pointer target pos", file, lineno + 1))?;
            let (src_word, dst_word) = decode_st(tokens[idx + 3]);
            pointers.push(PointerData {
                symbol: text_ref_str(file, bytes, symbol),
                target: SynsetId {
                    pos: target_pos,
                    offset: target_offset,
                },
                src_word,
                dst_word,
            });
            idx += 4;
        }

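        // Verb lines append frame entries after the pointers: `f_cnt` followed
        // by `+ f_num w_num` triples (w_num is two-digit hex, 00 = all words).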
        let mut frames = Vec::new();
        if matches!(pos, Pos::Verb) {
            let f_cnt: usize = if tokens.len() <= idx {
                0
            } else {
                let v: usize = tokens[idx]
                    .parse()
                    .with_context(|| format!("{:?}:{} f_cnt", file, lineno + 1))?;
                idx += 1;
                v
            };
            for _ in 0..f_cnt {
                if tokens.len() < idx + 3 {
                    anyhow::bail!("{:?}:{} incomplete frame entry", file, lineno + 1);
                }
                if tokens[idx] != "+" {
                    anyhow::bail!("{:?}:{} expected '+' before frame entry", file, lineno + 1);
                }
                let frame_number: u16 = tokens[idx + 1]
                    .parse()
                    .with_context(|| format!("{:?}:{} frame_number", file, lineno + 1))?;
                let word_number = parse_word_number(tokens[idx + 2]);
                frames.push(Frame {
                    frame_number,
                    word_number,
                });
                idx += 3;
            }
        }

        let gloss = parse_gloss(file, bytes, gloss_part)?;
        let id = SynsetId { pos, offset };
        synsets.insert(
            id,
            SynsetData {
                id,
                lex_filenum,
                synset_type,
                words,
                pointers,
                frames,
                gloss,
            },
        );
    }

    Ok(())
}

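// Glosses follow the `definition; "example"; "example"` convention: quoted
// spans are collected as examples, and the definition runs up to the first
// unquoted `;`.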
fn parse_gloss(file: FileKind, root: &[u8], gloss: &str) -> Result<GlossData> {
    let trimmed = gloss.trim();
    if trimmed.is_empty() {
        // Data lines without a `|` pass a static "" that does not point into
        // `root`, so skip the pointer arithmetic below and return empty refs.
        let empty = TextRef { file, start: 0, len: 0 };
        return Ok(GlossData {
            raw: empty,
            definition: empty,
            examples: Vec::new(),
        });
    }
    let gloss_raw = text_ref_str(file, root, trimmed);

    let mut examples = Vec::new();
    let mut in_quote = false;
    let mut quote_start: Option<usize> = None;
    let mut def_end = trimmed.len();
    for (idx, ch) in trimmed.char_indices() {
        match ch {
            '"' => {
                if in_quote {
                    if let Some(start) = quote_start.take()
                        && idx > start + 1
                    {
                        let start_bytes =
                            trimmed.as_ptr() as usize + start + 1 - root.as_ptr() as usize;
                        examples.push(TextRef {
                            file,
                            start: start_bytes,
                            len: idx - start - 1,
                        });
                    }
                } else {
                    quote_start = Some(idx);
                }
                in_quote = !in_quote;
            }
            ';' if !in_quote && def_end == trimmed.len() => {
                def_end = idx;
            }
            _ => {}
        }
    }

    let definition_slice = trimmed[..def_end].trim();
    let definition = text_ref_str(file, root, definition_slice);

    Ok(GlossData {
        raw: gloss_raw,
        definition,
        examples,
    })
}

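// frames.vrb pairs each frame number with its template text, one per line
// (e.g. `1 Something ----s`).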
fn parse_frames_vrb(bytes: &[u8]) -> HashMap<u16, TextRef> {
    let mut frames = HashMap::new();
    for (lineno, raw_line) in bytes.split(|b| *b == b'\n').enumerate() {
        let line = strip_cr(raw_line);
        if line.is_empty() {
            continue;
        }
        let line_str = match std::str::from_utf8(line) {
            Ok(s) => s,
            Err(_) => continue,
        };
        let mut parts = line_str.splitn(2, ' ');
        let num = parts.next().and_then(|t| t.parse::<u16>().ok());
        match (num, parts.next().map(str::trim)) {
            (Some(n), Some(text)) => {
                // `text` borrows from `bytes`, so this offset arithmetic is valid.
                let start = text.as_ptr() as usize - bytes.as_ptr() as usize;
                frames.insert(
                    n,
                    TextRef {
                        file: FileKind::Frames,
                        start,
                        len: text.len(),
                    },
                );
            }
            (Some(n), None) => {
                // Frame number with no template text: store an empty ref rather
                // than doing pointer arithmetic on a static "".
                frames.insert(
                    n,
                    TextRef {
                        file: FileKind::Frames,
                        start: 0,
                        len: 0,
                    },
                );
            }
            (None, _) => eprintln!("frames.vrb:{} invalid frame number", lineno + 1),
        }
    }
    frames
}

fn parse_cntlist(bytes: &[u8]) -> HashMap<(String, Pos, u32), u32> {
    let mut counts = HashMap::new();
    for raw_line in bytes.split(|b| *b == b'\n') {
        let line = strip_cr(raw_line);
        if line.is_empty() {
            continue;
        }
        let line_str = match std::str::from_utf8(line) {
            Ok(s) => s,
            Err(_) => continue,
        };
        let tokens: Vec<&str> = line_str.split_ascii_whitespace().collect();
        if tokens.len() < 3 {
            continue;
        }
        let count: u32 = match tokens[0].parse() {
            Ok(c) => c,
            Err(_) => continue,
        };
        // Real cntlist uses sense_key; here we accept `lemma pos sense` for flexibility.
        let lemma = normalize_lemma(tokens[1]);
        let pos = tokens[2]
            .chars()
            .next()
            .and_then(Pos::from_char)
            .unwrap_or(Pos::Noun);
        let sense_number: u32 = tokens.get(3).and_then(|t| t.parse().ok()).unwrap_or(1);
        counts.insert((lemma, pos, sense_number), count);
    }
    counts
}

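// Compute the byte range of `token` within `root`. Callers must pass tokens
// split off the file buffer itself; the pointer subtraction is only
// well-defined when `token` borrows from `root`.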
fn text_ref_str(file: FileKind, root: &[u8], token: &str) -> TextRef {
    let start = token.as_ptr() as usize - root.as_ptr() as usize;
    TextRef {
        file,
        start,
        len: token.len(),
    }
}

fn strip_cr(line: &[u8]) -> &[u8] {
    line.strip_suffix(b"\r").unwrap_or(line)
}

// Per wninput(5), w_num is two-digit hexadecimal; 00 means the frame applies
// to every word in the synset, which we represent as `None`. The decimal
// fallback is defensive for out-of-spec input.
fn parse_word_number(token: &str) -> Option<u16> {
    u16::from_str_radix(token, 16)
        .or_else(|_| token.parse::<u16>())
        .ok()
        .and_then(|v| if v == 0 { None } else { Some(v) })
}

// WordNet index keys are lowercase with spaces replaced by underscores;
// normalize lookups the same way so user input matches the file contents.
fn normalize_lemma(text: &str) -> String {
    let mut s = text.trim().to_string();
    s.make_ascii_lowercase();
    s.replace(' ', "_")
}